From c5a7be297776c3e99c83dd0618f5fcca5b07833b Mon Sep 17 00:00:00 2001 From: Levente Nagy Date: Mon, 1 Dec 2025 15:45:42 +0100 Subject: [PATCH] COG-648 Pandas integration docs --- jupyter/06_pandas_integration_examples.ipynb | 310 +++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 jupyter/06_pandas_integration_examples.ipynb diff --git a/jupyter/06_pandas_integration_examples.ipynb b/jupyter/06_pandas_integration_examples.ipynb new file mode 100644 index 0000000..e8e430f --- /dev/null +++ b/jupyter/06_pandas_integration_examples.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": "# __Pandas examples with ChemAxon molecules__", + "id": "b70058148a1f011b" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "You can find simpler examples of using pandas.read_table with ChemAxon molecules in the [Calculators](02_calculators.ipynb) and [Molecular similarity](03_molecular_similarity.ipynb) notebooks. This notebook adds further examples showing how to work with ChemAxon molecules in pandas DataFrames.", + "id": "74aa29c7fecce4b7" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "By default, molecules are printed in the _cxsmiles_ format. If that fails, they are written in the _cxsmarts_ format instead.
If none of them are possible for some reason, representation falls back to use the original format, which was recognized during the import process.", + "id": "b1ec732d397843a3" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-01T14:43:22.339727Z", + "start_time": "2025-12-01T14:43:21.966248Z" + } + }, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "from chemaxon import import_mol, open_for_import\n", + "\n", + "# Sample SDF shipped with the package; assumes the notebook runs from the repository jupyter/ folder - adjust if needed.\n", + "SDF_PATH = '../chemaxon/resources/test.sdf'\n", + "\n", + "mol = import_mol('CC(=O)NC1=CC=C(O)C=C1')\n", + "\n", + "with open_for_import(SDF_PATH) as mol_importer:\n", + "    mol_lst = list(mol_importer)\n", + "\n", + "df_mols = pd.DataFrame({'molecule': [mol] + mol_lst})\n", + "\n", + "df_mols" + ], + "id": "7f80b7d19aaee0a3", + "outputs": [ + { + "data": { + "text/plain": [ + " molecule\n", + "0 CC(=O)NC1=CC=C(O)C=C1 |c:9,t:4,6|\n", + "1 [Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S |..." + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
molecule
0CC(=O)NC1=CC=C(O)C=C1 |c:9,t:4,6|
1[Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S |...
\n", + "
" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 2 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "In case of creating HTML output from pandas.DataFrame, you can use the helper function _mol_to_svg_formatter_ to visualize the molecules as SVG images.", + "id": "7d771f74cf01d2f6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-01T14:44:23.122745Z", + "start_time": "2025-12-01T14:44:22.978179Z" + } + }, + "cell_type": "code", + "source": [ + "from chemaxon import mol_to_svg_formatter\n", + "# escape=False stops pandas from HTML-escaping the SVG markup the formatter produces for the molecule column.\n", + "df_mols.to_html('web_view.html', escape=False, formatters=dict(molecule=mol_to_svg_formatter))" + ], + "id": "27bf252df19e4e0f", + "outputs": [], + "execution_count": 3 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Since the __Molecule__ objects are being stored in the __DataFrame__, not just their representation, you can easily calculate properties for them and store the results in new columns.", + "id": "5cc167e89a64f2e6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-01T14:44:25.428052Z", + "start_time": "2025-12-01T14:44:25.414389Z" + } + }, + "cell_type": "code", + "source": [ + "from chemaxon import logp\n", + "\n", + "df_mols['LogP'] = df_mols['molecule'].apply(logp)\n", + "df_mols" + ], + "id": "d4877596d4d17c61", + "outputs": [ + { + "data": { + "text/plain": [ + " molecule LogP\n", + "0 CC(=O)NC1=CC=C(O)C=C1 |c:9,t:4,6| 0.92\n", + "1 [Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S |... 4.06" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
moleculeLogP
0CC(=O)NC1=CC=C(O)C=C1 |c:9,t:4,6|0.92
1[Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S |...4.06
\n", + "
" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 4 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "You can also easily create new molecule columns based on existing columns, that contain molecules in any supported format.", + "id": "754df0780b6944f8" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-02T12:54:58.077678Z", + "start_time": "2025-12-02T12:54:58.057169Z" + } + }, + "cell_type": "code", + "source": [ + "d = {'SMILES': ['CN1C=NC2=C1C(=O)N(C)C(=O)N2C'], 'name': ['coffein'] }\n", + "df = pd.DataFrame(data=d)\n", + "# pd.concat keeps each frame's own index, so both rows are labelled 0 below; pass ignore_index=True to renumber them.\n", + "df = pd.concat([df, pd.DataFrame.from_records([{'SMILES' : 'CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)NC(=O)[C@@H](N)CSSC[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(N)=O)NC1=O)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)NCC(N)=O', 'name' : 'oxytocin'}])])\n", + "\n", + "df['molecule'] = df['SMILES'].apply(import_mol)\n", + "df\n" + ], + "id": "7cf77defedadb058", + "outputs": [ + { + "data": { + "text/plain": [ + " SMILES name \\\n", + "0 CN1C=NC2=C1C(=O)N(C)C(=O)N2C coffein \n", + "0 CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N... oxytocin \n", + "\n", + " molecule \n", + "0 CN1C=NC2=C1C(=O)N(C)C(=O)N2C |c:2,4| \n", + "0 CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N... " + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SMILESnamemolecule
0CN1C=NC2=C1C(=O)N(C)C(=O)N2CcoffeinCN1C=NC2=C1C(=O)N(C)C(=O)N2C |c:2,4|
0CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...oxytocinCC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...
\n", + "
" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 6 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "b92a8580d7e03daa" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}