From 077b125e1b40f85f5761b4606b6b7a3c31a757ae Mon Sep 17 00:00:00 2001 From: Mieczyslaw Torchala Date: Sun, 4 May 2025 13:04:35 +0100 Subject: [PATCH 1/3] added mietek to index.md contributors section --- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index aff6395..3a3cf0d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -117,3 +117,4 @@ Scikit-Mol has been developed as a community effort with contributions from peop - [@enricogandini](https://github.com/enricogandini) - [@mikemhenry](https://github.com/mikemhenry) - [@c-feldmann](https://github.com/c-feldmann) +- Mieczyslaw Torchala [@mieczyslaw](https://github.com/mieczyslaw) From cf96f3ec071fe6f6eb45f62b52354be8bbf23e18 Mon Sep 17 00:00:00 2001 From: Mieczyslaw Torchala Date: Sun, 4 May 2025 22:15:29 +0100 Subject: [PATCH 2/3] restore docs/notebooks/05_smiles_sanitization.ipynb --- docs/notebooks/05_smiles_sanitization.ipynb | 315 ++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 docs/notebooks/05_smiles_sanitization.ipynb diff --git a/docs/notebooks/05_smiles_sanitization.ipynb b/docs/notebooks/05_smiles_sanitization.ipynb new file mode 100644 index 0000000..2b0bb99 --- /dev/null +++ b/docs/notebooks/05_smiles_sanitization.ipynb @@ -0,0 +1,315 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9b787560", + "metadata": {}, + "source": [ + "# SMILES sanitization\n", + "Sometimes we are faced with datasets that contain SMILES which RDKit cannot sanitize. This can be due to human entry errors, or to differences between RDKit's stricter sanitization and other toolkits' implementations of the parser. For example, RDKit will not accept a tetravalent nitrogen that carries no charge, whereas other toolkits may simply build the graph anyway, either disregarding the valence rules or guessing that the nitrogen should carry a charge, when it could just as well have one methyl group too many by accident."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "612aa974", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-24T09:27:27.545695Z", + "iopub.status.busy": "2024-11-24T09:27:27.545293Z", + "iopub.status.idle": "2024-11-24T09:27:28.079174Z", + "shell.execute_reply": "2024-11-24T09:27:28.078490Z" + }, + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from rdkit.Chem import PandasTools\n", + "\n", + "csv_file = \"../tests/data/SLC6A4_active_excapedb_subset.csv\" # Hmm, maybe better to download directly\n", + "data = pd.read_csv(csv_file)" + ] + }, + { + "cell_type": "markdown", + "id": "0f957a69", + "metadata": {}, + "source": [ + "This example dataset contains only sanitizable SMILES, so for demonstration purposes we will corrupt one of them." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b09cfd6b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-24T09:27:28.082222Z", + "iopub.status.busy": "2024-11-24T09:27:28.081921Z", + "iopub.status.idle": "2024-11-24T09:27:28.086003Z", + "shell.execute_reply": "2024-11-24T09:27:28.085450Z" + } + }, + "outputs": [], + "source": [ + "data.loc[1, \"SMILES\"] = \"CN(C)(C)(C)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e20fb5cc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-24T09:27:28.088449Z", + "iopub.status.busy": "2024-11-24T09:27:28.088211Z", + "iopub.status.idle": "2024-11-24T09:27:28.130818Z", + "shell.execute_reply": "2024-11-24T09:27:28.130102Z" + }, + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset contains 1 unparsable mols\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[10:27:28] Explicit valence for atom # 1 N, 4, is greater than permitted\n" + ] + } + ], + "source": [ + "\n", + "PandasTools.AddMoleculeColumnToFrame(data, smilesCol=\"SMILES\")\n", +
"print(f\"Dataset contains {data.ROMol.isna().sum()} unparsable mols\")" + ] + }, + { + "cell_type": "markdown", + "id": "f8dccd93", + "metadata": {}, + "source": [ + "If we used these SMILES in a scikit-learn pipeline, we would run into an error, so we need to check and clean the dataset first. The `CheckSmilesSanitazion` class can help us with that." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3dbd50b3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-24T09:27:28.133745Z", + "iopub.status.busy": "2024-11-24T09:27:28.133507Z", + "iopub.status.idle": "2024-11-24T09:27:28.508377Z", + "shell.execute_reply": "2024-11-24T09:27:28.507130Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error in parsing 1 SMILES. Unparsable SMILES can be found in self.errors\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[10:27:28] Explicit valence for atom # 1 N, 4, is greater than permitted\n" + ] + } + ], + "source": [ + "from scikit_mol.utilities import CheckSmilesSanitazion\n", + "\n", + "smileschecker = CheckSmilesSanitazion()\n", + "\n", + "smiles_list_valid, y_valid, smiles_errors, y_errors = smileschecker.sanitize(\n", + " list(data.SMILES), list(data.pXC50)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c888d7da", + "metadata": {}, + "source": [ + "Now `smiles_list_valid` should contain only valid SMILES, with the y values filtered accordingly. The errors are returned, but they are also accessible through the `.errors` property after the call to `.sanitize()`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5af5ea3d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-24T09:27:28.511261Z", + "iopub.status.busy": "2024-11-24T09:27:28.510945Z", + "iopub.status.idle": "2024-11-24T09:27:28.522024Z", + "shell.execute_reply": "2024-11-24T09:27:28.521232Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SMILESy
0CN(C)(C)(C)7.18046
\n", + "
" + ], + "text/plain": [ + " SMILES y\n", + "0 CN(C)(C)(C) 7.18046" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smileschecker.errors" + ] + }, + { + "cell_type": "markdown", + "id": "c2ce2677", + "metadata": {}, + "source": [ + "The checker can also be used only on X" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "84db07cc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-24T09:27:28.524982Z", + "iopub.status.busy": "2024-11-24T09:27:28.524717Z", + "iopub.status.idle": "2024-11-24T09:27:28.569119Z", + "shell.execute_reply": "2024-11-24T09:27:28.568473Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error in parsing 1 SMILES. Unparsable SMILES can be found in self.errors\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[10:27:28] Explicit valence for atom # 1 N, 4, is greater than permitted\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SMILES
0CN(C)(C)(C)
\n", + "
" + ], + "text/plain": [ + " SMILES\n", + "0 CN(C)(C)(C)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smiles_list_valid, X_errors = smileschecker.sanitize(list(data.SMILES))\n", + "smileschecker.errors" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.4 ('rdkit')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 9f73fb2ee367a0d8b5ebf160bef6bd0ae50cb31c Mon Sep 17 00:00:00 2001 From: Mieczyslaw Torchala Date: Sun, 4 May 2025 22:22:51 +0100 Subject: [PATCH 3/3] added mietek to README.md contributors section --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index aff6395..3a3cf0d 100644 --- a/README.md +++ b/README.md @@ -117,3 +117,4 @@ Scikit-Mol has been developed as a community effort with contributions from peop - [@enricogandini](https://github.com/enricogandini) - [@mikemhenry](https://github.com/mikemhenry) - [@c-feldmann](https://github.com/c-feldmann) +- Mieczyslaw Torchala [@mieczyslaw](https://github.com/mieczyslaw)