From c5a7be297776c3e99c83dd0618f5fcca5b07833b Mon Sep 17 00:00:00 2001
From: Levente Nagy <lnagy@chemaxon.com>
Date: Mon, 1 Dec 2025 15:45:42 +0100
Subject: [PATCH] COG-648 Pandas integration docs

---
 jupyter/06_pandas_integration_examples.ipynb | 310 +++++++++++++++++++
 1 file changed, 310 insertions(+)
 create mode 100644 jupyter/06_pandas_integration_examples.ipynb
diff --git a/jupyter/06_pandas_integration_examples.ipynb b/jupyter/06_pandas_integration_examples.ipynb
new file mode 100644
index 0000000..e8e430f
--- /dev/null
+++ b/jupyter/06_pandas_integration_examples.ipynb
@@ -0,0 +1,310 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# __Pandas examples with ChemAxon molecules__",
+   "id": "b70058148a1f011b"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "Simpler examples of using pandas.read_table with ChemAxon molecules can be found in the [Calculators](02_calculators.ipynb) and [Molecular similarity](03_molecular_similarity.ipynb) notebooks. This notebook provides additional examples showing how to work with ChemAxon molecules in pandas DataFrames.",
+   "id": "74aa29c7fecce4b7"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "By default, molecules are printed in the _cxsmiles_ format. If a molecule cannot be written as _cxsmiles_, the _cxsmarts_ format is used instead. If neither is possible for some reason, the representation falls back to the original format that was recognized during the import process.",
+   "id": "b1ec732d397843a3"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-01T14:43:22.339727Z",
+     "start_time": "2025-12-01T14:43:21.966248Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import pandas as pd\n",
+    "from chemaxon import import_mol, open_for_import\n",
+    "\n",
+    "mol = import_mol('CC(=O)NC1=CC=C(O)C=C1')\n",
+    "\n",
+    "# Sample SDF from the repository; the relative path assumes the notebook is run\n",
+    "# from its own directory (jupyter/), so no machine-specific absolute path is needed.\n",
+    "with open_for_import('../chemaxon/resources/test.sdf') as mol_importer:\n",
+    "    mol_lst = list(mol_importer)\n",
+    "\n",
+    "d = {'molecule': [mol] + mol_lst}\n",
+    "df_mols = pd.DataFrame(data=d)\n",
+    "\n",
+    "df_mols"
+   ],
+   "id": "7f80b7d19aaee0a3",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "                                            molecule\n",
+       "0                  CC(=O)NC1=CC=C(O)C=C1 |c:9,t:4,6|\n",
+       "1  [Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S |..."
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>molecule</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>CC(=O)NC1=CC=C(O)C=C1 |c:9,t:4,6|</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S |...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 2
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "When creating HTML output from a pandas.DataFrame, you can use the helper function _mol_to_svg_formatter_ to visualize the molecules as SVG images.",
+   "id": "7d771f74cf01d2f6"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-01T14:44:23.122745Z",
+     "start_time": "2025-12-01T14:44:22.978179Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from chemaxon import mol_to_svg_formatter\n",
+    "# escape=False leaves the markup returned by the formatter unescaped in the HTML file.\n",
+    "df_mols.to_html('web_view.html', formatters={'molecule': mol_to_svg_formatter}, escape=False)"
+   ],
+   "id": "27bf252df19e4e0f",
+   "outputs": [],
+   "execution_count": 3
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "Since the __Molecule__ objects are being stored in the __DataFrame__, not just their representation, you can easily calculate properties for them and store the results in new columns.",
+   "id": "5cc167e89a64f2e6"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-01T14:44:25.428052Z",
+     "start_time": "2025-12-01T14:44:25.414389Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from chemaxon import logp\n",
+    "\n",
+    "df_mols['LogP'] = df_mols['molecule'].apply(logp)\n",
+    "df_mols"
+   ],
+   "id": "d4877596d4d17c61",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "                                            molecule  LogP\n",
+       "0                  CC(=O)NC1=CC=C(O)C=C1 |c:9,t:4,6|  0.92\n",
+       "1  [Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S |...  4.06"
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>molecule</th>\n",
+       "      <th>LogP</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>CC(=O)NC1=CC=C(O)C=C1 |c:9,t:4,6|</td>\n",
+       "      <td>0.92</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S |...</td>\n",
+       "      <td>4.06</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 4
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "You can also easily create new molecule columns from existing columns that contain molecules in any supported format.",
+   "id": "754df0780b6944f8"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-02T12:54:58.077678Z",
+     "start_time": "2025-12-02T12:54:58.057169Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "records = [\n",
+    "    {'SMILES': 'CN1C=NC2=C1C(=O)N(C)C(=O)N2C', 'name': 'caffeine'},\n",
+    "    {'SMILES': 'CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)NC(=O)[C@@H](N)CSSC[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CCC(N)=O)NC1=O)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CC(C)C)C(=O)NCC(N)=O', 'name': 'oxytocin'},\n",
+    "]\n",
+    "df = pd.DataFrame.from_records(records)\n",
+    "df['molecule'] = df['SMILES'].apply(import_mol)\n",
+    "df"
+   ],
+   "id": "7cf77defedadb058",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "                                              SMILES      name  \\\n",
+       "0                       CN1C=NC2=C1C(=O)N(C)C(=O)N2C  caffeine   \n",
+       "1  CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...  oxytocin   \n",
+       "\n",
+       "                                            molecule  \n",
+       "0               CN1C=NC2=C1C(=O)N(C)C(=O)N2C |c:2,4|  \n",
+       "1  CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...  "
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>SMILES</th>\n",
+       "      <th>name</th>\n",
+       "      <th>molecule</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>CN1C=NC2=C1C(=O)N(C)C(=O)N2C</td>\n",
+       "      <td>caffeine</td>\n",
+       "      <td>CN1C=NC2=C1C(=O)N(C)C(=O)N2C |c:2,4|</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...</td>\n",
+       "      <td>oxytocin</td>\n",
+       "      <td>CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 6
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "",
+   "id": "b92a8580d7e03daa"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

	molecule
0	CC(=O)NC1=CC=C(O)C=C1 \|c:9,t:4,6\|
1	[Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S \|...
	molecule	LogP
0	CC(=O)NC1=CC=C(O)C=C1 \|c:9,t:4,6\|	0.92
1	[Li]C1=C(I)C(Br)=C(F)C2=C1C(Cl)=C(N)C(O)=C2S \|...	4.06
	SMILES	name	molecule
0	CN1C=NC2=C1C(=O)N(C)C(=O)N2C	coffein	CN1C=NC2=C1C(=O)N(C)C(=O)N2C \|c:2,4\|
0	CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...	oxytocin	CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=C(O)C=C2)N...