diff --git a/README.md b/README.md index b4dcae3167..e0931a50e3 100644 --- a/README.md +++ b/README.md @@ -714,7 +714,7 @@ Set visual attributes through [quick data bindings](https://hub.graphistry.com/d Port Scan Attack
- Protein Interactions
Source: BioGRID + Protein Interactions
Source: BioGRID
BioGRID Demo Notebook Programming Languages
Source: Socio-PLT project diff --git a/demos/demos_by_use_case/bio/BiogridDemo.ipynb b/demos/demos_by_use_case/bio/BiogridDemo.ipynb index 96d0f88411..613c2a0529 100644 --- a/demos/demos_by_use_case/bio/BiogridDemo.ipynb +++ b/demos/demos_by_use_case/bio/BiogridDemo.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -25,7 +25,7 @@ "import graphistry\n", "\n", "# To specify Graphistry account & server, use:\n", - "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", + "graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", "# For more options, see https://github.com/graphistry/pygraphistry#configure\n" ] }, @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": { "scrolled": false }, @@ -48,26 +48,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " interactivity=interactivity, compiler=compiler, result=result)\n" + "/var/folders/sx/x954rbdd44d932dd0ygfp1qc0000gn/T/ipykernel_39037/263179729.py:2: DtypeWarning: Columns (19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " rawdata = pandas.read_table(url1, na_values=['-'], engine='c')#, compression='gzip')\n" ] }, { "data": { "text/html": [ "
\n", - "\n", "\n", " \n", @@ -134,17 +134,14 @@ "2 Wang T (1996) Low Throughput " ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "url1 = 'https://s3-us-west-1.amazonaws.com/graphistry.demo.data/BIOGRID-ALL-3.3.123.tab2.txt.gz'\n", - "rawdata = pandas.read_table(url1, na_values=['-'], engine='c', compression='gzip')\n", - "\n", - "# If using local data, comment the two lines above and uncomment the line below\n", - "# pandas.read_table('./data/BIOGRID-ALL-3.3.123.tab2.txt', na_values=['-'], engine='c')\n", + "url1 = 'https://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.3.123/BIOGRID-ALL-3.3.123.tab2.zip'\n", + "rawdata = pandas.read_table(url1, na_values=['-'], engine='c')#, compression='gzip')\n", "\n", "cols = ['BioGRID ID Interactor A', 'BioGRID ID Interactor B', 'Official Symbol Interactor A', \n", " 'Official Symbol Interactor B', 'Pubmed ID', 'Author', 'Throughput']\n", @@ -162,23 +159,25 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", " \n", " \n", " " ], @@ -186,7 +185,7 @@ "" ] }, - "execution_count": 3, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -206,7 +205,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": { "scrolled": true }, @@ -215,18 +214,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -263,19 +262,15 @@ "22 3 Arabidopsis thaliana" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# This downloads 170 MB, it might take some time.\n", - "url2 = 'https://s3-us-west-1.amazonaws.com/graphistry.demo.data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt.gz'\n", - "raw_proteins = pandas.read_table(url2, na_values=['-'], engine='c', compression='gzip')\n", - "\n", - "# If using local data, comment the two lines above and uncomment the line below\n", - "# raw_proteins = pandas.read_table('./data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt', na_values=['-'], engine='c')\n", - "\n", + "url2 = 'http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.3.123/BIOGRID-IDENTIFIERS-3.3.123.tab.zip'\n", + "raw_proteins = pandas.read_table(url2, na_values=['-'], skiprows=28)#, engine='c', compression='gzip')\n", "\n", "protein_ids = raw_proteins[['BIOGRID_ID', 'ORGANISM_OFFICIAL_NAME']].drop_duplicates() \\\n", " .rename(columns={'ORGANISM_OFFICIAL_NAME': 'ORGANISM'})\n", @@ -291,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": { "scrolled": true }, @@ -300,18 +295,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -348,7 +343,7 @@ "2 106605 ACVR1" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -375,25 +370,25 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -434,7 +429,7 @@ "2 106605 ACVR1 Homo sapiens" ] }, - "execution_count": 6, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -453,7 +448,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -470,25 +465,25 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -538,7 +533,7 @@ "2 Homo sapiens 0 " ] }, - "execution_count": 8, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -557,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": { "scrolled": true }, @@ -566,18 +561,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -649,7 +644,7 @@ "2 Low Throughput " ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -673,32 +668,27 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": { "scrolled": false }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Uploading 7139 kB. This may take a while...\n" - ] - }, { "data": { "text/html": [ "\n", - " \n", " \n", " \n", " " ], @@ -706,7 +696,7 @@ "" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -716,15 +706,6 @@ "g2 = g.bind(node='BIOGRID_ID', edge_title='Author', point_title='SYMBOL', point_color='Color')\n", "g2.plot(interactions, protein_labels)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -743,7 +724,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/demos/demos_by_use_case/bio/ChemicalMappingDemo.ipynb b/demos/demos_by_use_case/bio/ChemicalMappingDemo.ipynb new file mode 100644 index 0000000000..25c62d95de --- /dev/null +++ b/demos/demos_by_use_case/bio/ChemicalMappingDemo.ipynb @@ -0,0 +1,865 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "dzmJAwfiAi6k" + }, + "source": [ + "# Accelerated Chemical Mapping with [Graphistry](graphistry.com)\n", + "\n", + "This notebook visualizes a chemical dataset describing Blood Brain Barrier Permeability (BBBP) from [MoleculeNet](http://moleculenet.org/datasets-1) and [ECFPs](https://pubs.acs.org/doi/10.1021/ci100050t).\n", + "\n", + "Using these string formulations of molecular 3D structure we can take advantage of string-based computational algorithms. 
These string representations look like the following:\n", + "\n", + "\n", + "* `OCC#Cc1cc(Cl)c(C(=O)Nc2ccnc(NC(=O)C3CC3)c2)c(Cl)c1`\n", + "151276 \n", + "* `CCNc1ncnc2c1nc(NC3CCCC3)n2[C@@H]4O[C@H](CO)[C@@H](O)[C@H]4O`\n", + "172750 \n", + "* `CCC(C1=C(O)C2=C(CCCCCC2)OC1=O)c3cccc(NS(=O)(=O)c4ccc(Cl)cc4)c3`\n", + "155015 \n", + "* `CC1CCN(CC1)c2nc(ccc2CNC(=O)Nc3ccc(CNS(=O)(=O)C)c(F)c3)C(F)(F)F`\n", + "\n", + "\n", + "The formulation of the structure into linear form helps us immensely, and thus we are able to parse and reduce these complex molecules down to 2 dimensions using conventional statistical tools, namely UMAP. Ultimately we demonstrate how such an open-source analysis can be sped up and scaled up massively with the [Graphistry](https://www.graphistry.com) environment and toolkit.\n", + "\n", + "\n", + "* Speedup: From minutes to seconds - 3 min to 10 seconds on a small T4 GPU\n", + "* Visual insight: Add interactivity, similarity edges, and visual scale to a traditional static scatterplot to better investigate pairwise correlations and overall clusters" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w89wE473URRH" + }, + "source": [ + "# Import accelerator libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ABA00KeDT6Gx" + }, + "outputs": [], + "source": [ + "!pip install -q --extra-index-url=https://pypi.nvidia.com cuml-cu12\n", + "import cuml,cudf\n", + "print(cuml.__version__)\n", + "\n", + "!pip -q install graphistry[ai]\n", + "# !pip install -U -q --force git+https://github.com/graphistry/pygraphistry.git#@dev/depman_gpufeat\n", + "# !pip install cu_cat" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7_t1UL7YUAx3", + "outputId": "6f19365b-efd9-4b2e-fab6-2a9a92c020f8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.33.9\n" + ] + } + ], + "source": [ + "\n", + "import
graphistry\n", + "graphistry.register(api=3,protocol=\"https\", server=\"hub.graphistry.com\", username=g_user, password=g_pass) ## key id, secret key\n", + "\n", + "print(graphistry.__version__)\n", + "\n", + "# import cu_cat\n", + "# print(cu_cat.__file__)\n", + "\n", + "import os\n", + "from collections import Counter\n", + "import cProfile\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from pstats import Stats\n", + "import cuml,cudf\n", + "from time import time\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "from typing import List\n", + "import seaborn as sns\n", + "pd.set_option('display.max_colwidth', 200)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EtU413LOUG_S", + "outputId": "0c6e2fd0-0c87-4dae-b53f-325316af75e0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tesla T4\n" + ] + } + ], + "source": [ + "!nvidia-smi --query-gpu=gpu_name --format=csv,noheader" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZfM8WRfW4gOY" + }, + "source": [ + "# Import Basics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "prDjzDTU384B" + }, + "outputs": [], + "source": [ + "!pip install -q rdkit\n", + "!pip install --pre -q deepchem\n", + "\n", + "from rdkit import Chem, DataStructs\n", + "from rdkit.Chem.rdchem import Mol\n", + "from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser\n", + "\n", + "from rdkit import RDLogger\n", + "lg = RDLogger.logger()\n", + "lg.setLevel(RDLogger.CRITICAL)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L8NcwLVK5s_i" + }, + "source": [ + "# Embed BBBP in Global Chemical Space Approximation (Dataset-Agnostic Embedding)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gdXebUL45usJ" + }, + "source": [ + "### Read in and 
process ChEMBL data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "HUbDr8Dp5uOB" + }, + "outputs": [], + "source": [ + "# Read in data from MoleculeNet\n", + "chembl = pd.read_csv(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/chembl_sparse.csv.gz\", compression='gzip')\n", + "\n", + "# Sample a random 20k\n", + "chembl = chembl.sample(n=20000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x9T_sR0oaJvq" + }, + "outputs": [], + "source": [ + "chemblA = pd.read_csv(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/chembl_sparse.csv.gz\", compression='gzip')\n", + "\n", + "chem_data = chembl[\"smiles\"][chembl.smiles.str.len()<500] ## lets simplify and just look at \"short molecules\" for this exercise\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GAh6_ylDa74q", + "outputId": "a4cf8f91-851f-4bac-c30a-6d4c8baf3f4a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "201332 OCC#Cc1cc(Cl)c(C(=O)Nc2ccnc(NC(=O)C3CC3)c2)c(Cl)c1\n", + "151276 CCNc1ncnc2c1nc(NC3CCCC3)n2[C@@H]4O[C@H](CO)[C@@H](O)[C@H]4O\n", + "172750 CCC(C1=C(O)C2=C(CCCCCC2)OC1=O)c3cccc(NS(=O)(=O)c4ccc(Cl)cc4)c3\n", + "155015 CC1CCN(CC1)c2nc(ccc2CNC(=O)Nc3ccc(CNS(=O)(=O)C)c(F)c3)C(F)(F)F\n", + "231881 Cc1nc(cs1)C#Cc2cc(Cl)cc(c2)C#N\n", + " ... 
\n", + "197652 CN(C)C(=O)c1cc2cc(Nc3nccc(n3)c4cn(cn4)C5CC5)cc(Cl)c2[nH]1\n", + "63558 COc1cc(OC)cc(\\C=C\\2/CCC\\C(=C/c3ccccc3F)\\C2=O)c1\n", + "23052 CCN1CCC(=C(C1)C(=O)OCCc2ccccn2)c3ccccc3\n", + "154256 CN[C@@H]1CCN(C1)c2nc(N)nc3c2CCCc4ccccc34\n", + "72859 COc1ccc(cc1)C2(N=C(N)c3nc(C)sc23)c4cccc(c4)c5cncnc5\n", + "Name: smiles, Length: 19959, dtype: object" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chem_data.dropna()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2f9Zj5LGxYbo" + }, + "source": [ + " ## with CPU" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qDVN9-VgxWMu", + "outputId": "ed4bb475-3ff7-4bd8-9e53-d58f80c5690d" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:graphistry.umap_utils:* Ignoring target column of shape (19959, 0) in UMAP fit, as it is not one dimensional" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Total 174.6 seconds passed\n" + ] + } + ], + "source": [ + "g2 = graphistry.nodes(chem_data)\n", + "\n", + "t=time()\n", + "g4=g2.umap(engine='umap_learn',metric = \"jaccard\",\n", + " n_neighbors = 25,\n", + " n_components = 2,\n", + " dbscan=True,\n", + " min_dist = 0.001)\n", + "j=time()-t\n", + "print('\\n Total ', np.round(time() - t,1), 'seconds passed')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-eahF167xaSR" + }, + "source": [ + "## and GPU" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DnXz47yx9MdU", + "outputId": "f01dcd41-7a45-4d3e-a475-d4c1fc85a662" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. 
Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (19959, 0) in UMAP fit, as it is not one dimensional" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Total 9.3 seconds passed\n" + ] + } + ], + "source": [ + "g2 = graphistry.nodes((chem_data))\n", + "\n", + "\n", + "t=time()\n", + "g4=g2.umap(engine='cuml',metric = \"jaccard\",\n", + " n_neighbors = 25,\n", + " n_components = 2,\n", + " dbscan=True,\n", + " min_dist = 0.001)\n", + "j=time()-t\n", + "print('\\n Total ', np.round(time() - t,1), 'seconds passed')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 543 + }, + "id": "v3SEPYwrcaNa", + "outputId": "f196de85-080f-45e2-c559-f877331a60a9" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g4.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jovz3xVc4M8X" + }, + "source": [ + "# Embed BBBP with UMAP" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZlQqzKMB4Qq7" + }, + "source": [ + "### Read in and process small data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "HR2S6BbP4JlB" + }, + "outputs": [], + "source": [ + "# Read in data from MoleculeNet\n", + "bbbp = pd.read_csv(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv\")\n", + "\n", + "# Clean up column names so they are easier to interpret\n", + "bbbp = bbbp[[\"smiles\", \"p_np\", \"name\"]].reset_index(drop=True).rename({\"p_np\": \"permeable\"}, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "i9iRe44glrPq", + "outputId":
"69043bd0-9660-4b94-f1c4-cf8e8b9c8080" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"BBBP[['name','permeable']][BBBP\",\n \"rows\": 2020,\n \"fields\": [\n {\n \"column\": \"name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2020,\n \"samples\": [\n \"GR94839_I\",\n \"carbamazepine\",\n \"testolactone\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"permeable\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepermeable
0Propanolol1
1Terbutylchlorambucil1
2407301
3241
4cloxacillin1
.........
2045licostinel1
2046ademetionine(adenosyl-methionine)1
2047mesocarb1
2048tofisoline1
2049azidamfenicol1
\n", + "

2020 rows × 2 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n" + ], + "text/plain": [ + " name permeable\n", + "0 Propanolol 1\n", + "1 Terbutylchlorambucil 1\n", + "2 40730 1\n", + "3 24 1\n", + "4 cloxacillin 1\n", + "... ... ...\n", + "2045 licostinel 1\n", + "2046 ademetionine(adenosyl-methionine) 1\n", + "2047 mesocarb 1\n", + "2048 tofisoline 1\n", + "2049 azidamfenicol 1\n", + "\n", + "[2020 rows x 2 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BBBP=bbbp[~bbbp.name.duplicated(keep='first')]\n", + "BBBP[['name','permeable']][BBBP.smiles.str.len()>3]#.reset_index(drop=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6L8TZ5qx530H" + }, + "source": [ + "### ... and with graphistry" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vA8LG2dFgKrc", + "outputId": "c4b1ceba-0f48-4239-b600-1302dcb9eda8" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. 
Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (2020, 0) in UMAP fit, as it is not one dimensional" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Total 43.0 seconds passed\n" + ] + } + ], + "source": [ + "BBBP=bbbp[~bbbp.name.duplicated(keep='first')]\n", + "\n", + "g = graphistry.nodes(cudf.from_pandas(BBBP[['smiles','permeable']][BBBP.smiles.str.len()>3]))\n", + "t=time()\n", + "# g2=g.featurize(feature_engine='cu_cat',memoize=True)\n", + "g3=g.umap(engine='cuml',metric = \"jaccard\",\n", + " n_neighbors = 25,\n", + " n_components = 2,\n", + " low_memory = False,\n", + " min_dist = 0.001)\n", + "print('\\n Total ', np.round(time() - t,1), 'seconds passed')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 543 + }, + "id": "WMzz0EfaqR03", + "outputId": "c386f527-42c1-4e07-88a4-701238d73908" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g3.encode_point_color('permeable',palette=[\"hotpink\", \"dodgerblue\"],as_continuous=True).plot()\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "jovz3xVc4M8X" + ], + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demos/demos_by_use_case/bio/MetagenomicDemo.ipynb b/demos/demos_by_use_case/bio/MetagenomicDemo.ipynb new file mode 100644 index 0000000000..5061f79320 --- /dev/null +++ b/demos/demos_by_use_case/bio/MetagenomicDemo.ipynb @@ -0,0 +1,1073 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "ozrca88hza85" + }, + "source": [ + "# Accelerating metagenomic 
analysis with [Graphistry](https://www.graphistry.com) focusing on viral tracing over time\n", + "\n", + "## [viral calling pipeline here](https://github.com/dcolinmorgan/viral_snake)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tLRHg2VEzoYy" + }, + "source": [ + "Using GPU-accelerated UMAP + DBScan analysis & visualization, metagenomic samples' bacterial compositions can be clustered, compared, and explored much faster and more easily.\n", + "\n", + "* Task: Analyze metagenomic samples for similarity\n", + "* Data: time series samples\n", + "* 563 samples collected from 84 donors, producing 4 dense long-term time series (up to 1 sample every other day during 18 months)\n", + "* Clustering: the species component extracted from time-stamped patient samples, e.g., a tuple of `(patient, time, species)`\n", + "* Each **node** is a `(patient, time, species=abc)` tuple, and clustering is on species=abc text similarity\n", + "* *n.b.* since species text is full taxa information, text comparison can return a degree of similarity\n", + "\n", + "* [data](https://www.ebi.ac.uk/ena/browser/view/PRJNA544527)\n", + "* [metadata](https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0559-3/MediaObjects/41591_2019_559_MOESM3_ESM.xlsx)\n", + "* [paper](https://doi.org/10.1038/s41591-019-0559-3)\n", + "\n", + "\n", + "**Insight / Result:**\n", + "\n", + "* 43s to umap and dbscan vs 2342s on a small T4 GPU\n", + "* over **50X** faster for a single run\n", + "* since [the reference paper for this analysis](https://journals.asm.org/doi/full/10.1128/msystems.00118-23) runs this analysis 12x per dataset (here we only have 1 dataset), we could expect to save nearly the entire 8hrs for this dataset, taking less than 10 minutes in total" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PjnS_PCWaClg" + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pFKuEaZClwXa" + }, + "outputs": [], + "source": [ + "!pip install -q
--extra-index-url=https://pypi.nvidia.com cuml-cu12\n", + "import cuml,cudf\n", + "print(cuml.__version__)\n", + "\n", + "!pip -q install graphistry[ai]\n", + "\n", + "!pip install -q Biopython" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ob5QrPuiAf-Q" + }, + "outputs": [], + "source": [ + "import locale\n", + "def getpreferredencoding(do_setlocale = True):\n", + " return \"UTF-8\"\n", + "locale.getpreferredencoding = getpreferredencoding" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HK_8_7UB0mhx" + }, + "source": [ + "# import /configure\n", + "\n", + "visualization step, get a free API key at https://hub.graphistry.com\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "i42QrQ_ejC4h", + "outputId": "f8c19225-336f-4dd9-e2eb-7b23e184bc51" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24.06.01\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import graphistry\n", + "from time import time\n", + "\n", + "graphistry.register(api=3,protocol=\"https\", server=\"hub.graphistry.com\", username=g_user, password=g_pass) ## key id, secret key\n", + "graphistry.__version__\n", + "\n", + "import cuml,cudf\n", + "print(cuml.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_Y1VlCGy4_FR" + }, + "source": [ + "# bio-ml dataset\n", + "\n", + "\n", + "1. [3 subjects x 10 time points](\n", + "https://www.ebi.ac.uk/ena/browser/view/PRJNA544527)\n", + "\n", + "2. 
[metadata](\n", + "https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0559-3/MediaObjects/41591_2019_559_MOESM3_ESM.xlsx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hO1HFIMO6ttV" + }, + "outputs": [], + "source": [ + "!wget https://gist.githubusercontent.com/lmeyerov/61a6a7d5fa0dbe51e786ed52408ac360/raw/11a11aa0b865ceb96880b2cd2ae3b12f1ef947c8/gistfile1.txt -O PRJNA544527_mpa4out.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CBN6Z_77Sduq" + }, + "outputs": [], + "source": [ + "%%bash\n", + "# Fallback: fetch and unpack one raw sample only when the precomputed profile is absent.\n", + "# A %%bash cell is run by plain bash, so commands take no IPython '!' prefix.\n", + "if [ ! -f PRJNA544527_mpa4out.txt ]; then\n", + " wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR922/006/SRR9224006/SRR9224006_1.fastq.gz\n", + " wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR922/006/SRR9224006/SRR9224006_2.fastq.gz\n", + "\n", + " gunzip SRR9224006_1.fastq.gz\n", + " gunzip SRR9224006_2.fastq.gz\n", + "\n", + " head /content/SRR9224006_1.fastq\n", + "fi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UWDGFCWxSpv_" + }, + "outputs": [], + "source": [ + "# Fallback: parse local FASTQ files into an ID/sequence table when the precomputed profile is absent.\n", + "import os\n", + "if not os.path.exists('PRJNA544527_mpa4out.txt'):\n", + " from Bio import SeqIO\n", + " import glob,os\n", + " import pandas as pd\n", + " B=pd.DataFrame()\n", + " for i in glob.glob('/content/*.fastq'):\n", + " # j=os.path.basename(i)\n", + " fasta_sequences = SeqIO.parse(open(i),'fastq')\n", + " identifiers = []\n", + " sequences = []\n", + " for fasta in fasta_sequences:\n", + " name, sequence = fasta.id, str(fasta.seq)\n", + " identifiers.append(name)\n", + " sequences.append(sequence)\n", + "\n", + " A=pd.DataFrame([identifiers,sequences]).T\n", + " A.columns=['ID','seq']\n", + " A.dropna(inplace=True)\n", + " # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement\n", + " B=pd.concat([B,A])\n", + " # A['ID']#=A.ID.str.split('-')[0:1]\n", + " # B['ID']=B['ID'].str.split('-').str[0]+'_'+B['ID'].str.split('-').str[1]#.cat()\n", + " B['ID']=B.ID.str.split('_length').str[0]\n", + " B.index=B.ID" + ] + }, + { +
"cell_type": "markdown", + "metadata": { + "id": "fDvymRFjHPNB" + }, + "source": [ + "# install [HUMAnN 3](https://huttenhower.sph.harvard.edu/humann), a method for efficiently and accurately profiling the abundance of microbial metabolic pathways and other molecular functions from metagenomic or metatranscriptomic sequencing data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gtOG0QeoUoX6" + }, + "source": [ + "### takes very long for running all samples\n", + " (1day+ run on cluster)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yIeeDXPBHN6D" + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "if [ ! -f PRJNA544527_mpa4out.txt ]; then\n", + "\n", + " pip install humann --no-binary :all:\n", + " pip install metaphlan\n", + "\n", + " humann_databases --download utility_mapping full /path/to/databases --update-config yes\n", + "\n", + " # humann_test\n", + " wget https://github.com/biobakery/humann/raw/master/examples/demo.fastq.gz\n", + " humann -i demo.fastq.gz -o sample_results\n", + "\n", + "\n", + " mkdir assemble epi_sam_out mpa4_out\n", + " humann -i /content/All_MAGs/Sample_101_S75_bin_1.fa -o test_out\n", + "\n", + "\n", + " seq=$(ls /content/*.fastq | cut -d / -f2| cut -d _ -f1)\n", + "\n", + " for i in $(eval \"echo \"$seq\" | cut -d _ -f1\")\n", + "\n", + " do\n", + " metaphlan /content/${i}.fa --nproc 40 --input_type fasta -o /content/assemble/${i}/h4_out.txt -t rel_ab_w_read_stats\n", + " done\n", + "fi" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dHbwBIEn6Wxr" + }, + "source": [ + "# umap and dbscan\n", + "\n", + "idea for metagenomic analysis based on [Quantifying Shared and Unique Gene Content across 17 Microbial Ecosystems\n", + "](https://journals.asm.org/doi/full/10.1128/msystems.00118-23)\n", + "\n", + "(analyze all samples run on cluster)\n", + "\n", + "also this 
[paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x?ref=https://codemonkey.link#Sec7) and [method](https://github.com/marbl/Mash/blob/master/INSTALL.txt)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RoIBLY3-670T", + "outputId": "7c1d8f5f-fc9f-44d3-9a3a-a268edc4c00c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-07-09 07:23:29-- https://raw.githubusercontent.com/dcolinmorgan/grph/main/PRJNA544527-meta_inf.txt\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 45603 (45K) [text/plain]\n", + "Saving to: ‘PRJNA544527-meta_inf.txt’\n", + "\n", + "\rPRJNA544527-meta_in 0%[ ] 0 --.-KB/s \rPRJNA544527-meta_in 100%[===================>] 44.53K --.-KB/s in 0.007s \n", + "\n", + "2024-07-09 07:23:29 (6.40 MB/s) - ‘PRJNA544527-meta_inf.txt’ saved [45603/45603]\n", + "\n", + "--2024-07-09 07:23:29-- https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0559-3/MediaObjects/41591_2019_559_MOESM3_ESM.xlsx\n", + "Resolving static-content.springer.com (static-content.springer.com)... 151.101.0.95, 151.101.64.95, 151.101.128.95, ...\n", + "Connecting to static-content.springer.com (static-content.springer.com)|151.101.0.95|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 3556857 (3.4M) [application/octet-stream]\n", + "Saving to: ‘41591_2019_559_MOESM3_ESM.xlsx’\n", + "\n", + "41591_2019_559_MOES 100%[===================>] 3.39M --.-KB/s in 0.05s \n", + "\n", + "2024-07-09 07:23:31 (69.7 MB/s) - ‘41591_2019_559_MOESM3_ESM.xlsx’ saved [3556857/3556857]\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed\n", + " warn(msg)\n" + ] + } + ], + "source": [ + "data=pd.read_csv('/content/PRJNA544527_mpa4out.txt',sep='\\t',skiprows=1,index_col=0)\n", + "data.index=data.reset_index().clade_name.str.split('|',expand=True)[6]\n", + "data=data.reset_index().dropna(axis=0)\n", + "data.index=data[6]\n", + "data=data.drop(columns=6)\n", + "\n", + "!wget https://gist.githubusercontent.com/lmeyerov/b650f1ef9e56c3f1888ebb009bc5ed46/raw/76dda5fabcdfbcdf0cc58450982fbeb4b2e38a98/PRJNA544527-meta_inf.txt\n", + "meta=pd.read_csv('/content/PRJNA544527-meta_inf.txt',sep='\\t',header=None)\n", + "\n", + "mm=pd.merge(data.T,meta[[3,5]],left_index=True,right_on=3)\n", + "\n", + "mm['id']=mm[5].str.split('-').str[0]\n", + "mm['time']=mm[5].str.split('_').str[0].str.split('-').str[1]\n", + "\n", + "!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0559-3/MediaObjects/41591_2019_559_MOESM3_ESM.xlsx\n", + "metaa=pd.read_excel('/content/41591_2019_559_MOESM3_ESM.xlsx',sheet_name='SupTable2',skiprows=3)\n", + "metaa=metaa[['Donor','Age','Sex','BMI']]\n", + "\n", + "Full_table=pd.merge(mm,metaa,left_on='id',right_on='Donor')\n", + "Full_table=Full_table.drop(columns=[3,\t5,\t'id'])\n", + "\n", + "data2=Full_table.melt(id_vars=['time','Donor','Age','Sex','BMI'])\n", + "\n", + "data2=data2.rename(columns={'variable':'species'})\n", + "data2=data2.sort_values(by=['Donor','time','value'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3FvFtKukoI7E", + "outputId": "9077830f-9d07-4855-c391-50bd6071dde2" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2678 s__Bacteroides_clarus_aa\n", + "5378 s__Bacteroides_intestinalis_aa\n", + "9158 s__Ruminococcus_bromii_aa\n", + "12938 s__GGB6601_SGB9333_aa\n", + "13478 s__GGB3256_SGB4303_aa\n", + " ... \n", + "86343 s__Faecalibacterium_prausnitzii_dl\n", + "2103 s__Phocaeicola_massiliensis_dl\n", + "67983 s__Phocaeicola_massiliensis_dl\n", + "5883 s__Faecalibacterium_prausnitzii_dl\n", + "178143 s__Phocaeicola_plebeius_dl\n", + "Length: 208440, dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(data2.species)+'_'+(data2.Donor)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "4savEas1jZyX", + "outputId": "a9f7c2a2-5c50-402e-b03b-b7f1daf857a4" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"data2[data2['value']>1]\",\n \"rows\": 14957,\n \"fields\": [\n {\n \"column\": \"time\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 219,\n \"samples\": [\n \"0169\",\n \"0098\",\n \"0043\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Donor\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 84,\n \"samples\": [\n \"db\",\n \"aa\",\n \"ck\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5,\n \"min\": 19,\n \"max\": 45,\n \"num_unique_values\": 23,\n \"samples\": [\n 43,\n 28,\n 29\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sex\",\n \"properties\": {\n \"dtype\": \"category\",\n 
\"num_unique_values\": 2,\n \"samples\": [\n \"Female\",\n \"Male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BMI\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.092398931918041,\n \"min\": 17.6,\n \"max\": 35.1,\n \"num_unique_values\": 57,\n \"samples\": [\n 24.1,\n 23.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"species\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 112,\n \"samples\": [\n \"s__Bacteroides_cellulosilyticus\",\n \"s__Clostridiaceae_bacterium\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7.07963397515598,\n \"min\": 1.00024,\n \"max\": 100.0,\n \"num_unique_values\": 8355,\n \"samples\": [\n 2.30652,\n 9.30662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timeDonorAgeSexBMIspeciesvalue
399380154aa29Male24.1s__Desulfovibrio_piger1.00422
113180154aa29Male24.1s__Odoribacter_splanchnicus1.12785
771980154aa29Male24.1s__Odoribacter_splanchnicus1.12785
734180154aa29Male24.1s__Faecalibacterium_prausnitzii1.14483
1835780154aa29Male24.1s__GGB3304_SGB43671.24406
........................
863430006dl32Male26.1s__Faecalibacterium_prausnitzii2.21002
21030006dl32Male26.1s__Phocaeicola_massiliensis3.84088
679830006dl32Male26.1s__Phocaeicola_massiliensis3.84088
58830006dl32Male26.1s__Faecalibacterium_prausnitzii4.37472
1781430006dl32Male26.1s__Phocaeicola_plebeius28.49330
\n", + "

14957 rows × 7 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " time Donor Age Sex BMI species value\n", + "39938 0154 aa 29 Male 24.1 s__Desulfovibrio_piger 1.00422\n", + "11318 0154 aa 29 Male 24.1 s__Odoribacter_splanchnicus 1.12785\n", + "77198 0154 aa 29 Male 24.1 s__Odoribacter_splanchnicus 1.12785\n", + "73418 0154 aa 29 Male 24.1 s__Faecalibacterium_prausnitzii 1.14483\n", + "183578 0154 aa 29 Male 24.1 s__GGB3304_SGB4367 1.24406\n", + "... ... ... ... ... ... ... ...\n", + "86343 0006 dl 32 Male 26.1 s__Faecalibacterium_prausnitzii 2.21002\n", + "2103 0006 dl 32 Male 26.1 s__Phocaeicola_massiliensis 3.84088\n", + "67983 0006 dl 32 Male 26.1 s__Phocaeicola_massiliensis 3.84088\n", + "5883 0006 dl 32 Male 26.1 s__Faecalibacterium_prausnitzii 4.37472\n", + "178143 0006 dl 32 Male 26.1 s__Phocaeicola_plebeius 28.49330\n", + "\n", + "[14957 rows x 7 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data2[data2['value']>1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WRYYF1brbdgz" + }, + "source": [ + "## UMAP by species via CPU\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HrJErY1SZSfh", + "outputId": "a5e6cd2b-e198-460e-ca02-46bb6cc592a6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:graphistry.umap_utils:* Ignoring target column of shape (679, 0) in UMAP fit, as it is not one dimensional" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Total 24.8 seconds passed\n" + ] + } + ], + "source": [ + "data=pd.read_csv('/content/PRJNA544527_mpa4out.txt',sep='\\t',skiprows=1,index_col=0)\n", + "\n", + "g = graphistry.nodes(cudf.from_pandas(data.dropna()))\n", + "\n", + "t=time()\n", + "g3=g.umap(dbscan=True,engine='umap_learn')\n", + "print('\\n Total ', np.round(time() - t,1), 'seconds passed')\n", + "\n", + "g3.plot()\n" + ] 
+ }, + { + "cell_type": "markdown", + "metadata": { + "id": "l7aLIxP6ZY9s" + }, + "source": [ + "## UMAP by species via GPU" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6Zoo17Ui9zxh", + "outputId": "6af64cb4-1528-4033-f606-75fb73af4686" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (679, 0) in UMAP fit, as it is not one dimensional" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Total 0.7 seconds passed\n" + ] + } + ], + "source": [ + "data=pd.read_csv('/content/PRJNA544527_mpa4out.txt',sep='\\t',skiprows=1,index_col=0)\n", + "\n", + "g = graphistry.nodes(cudf.from_pandas(data.dropna()))\n", + "\n", + "t=time()\n", + "g3=g.umap(dbscan=True,engine='cuml')\n", + "print('\\n Total ', np.round(time() - t,1), 'seconds passed')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 543 + }, + "id": "kjSdlnmeaOuL", + "outputId": "beaa1218-7157-4fda-da19-32534e92de5d" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g3.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UZt6aGWnbS9f" + }, + "source": [ + "## UMAP for patient by time stamp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SJvc8pOsi3Dv" + }, + "outputs": [], + "source": [ + "data2=data2[data2.value>0]\n", + "data2=data2.reset_index(drop = True)\n", + "data2=data2.drop_duplicates()\n", + "\n", + "data2[\"Label\"] = (\n", + " 
data2.groupby(\"Donor\")\n", + " .apply(lambda x: x.groupby(\"time\", sort=False).ngroup() + 1)\n", + " .values\n", + ")\n", + "\n", + "cc=pd.unique(data2[data2.Label<5].Donor)\n", + "data2=data2.loc[ data2.Donor.isin(cc), : ]\n", + "data2=data2[data2.Label<5]\n", + "\n", + "data2[\"rank\"] = data2.groupby(\"Donor\")[\"value\"].rank(method=\"dense\", ascending=False)\n", + "data2=data2[data2['rank']<10.0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkWIAyP2oKHN" + }, + "outputs": [], + "source": [ + "data2['id_time']=data2['Donor']+'_'+data2['Label'].apply(str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W4TCI3-YolwR" + }, + "outputs": [], + "source": [ + "data3=data2[['id_time','species','value']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 579 + }, + "id": "aiw8WeENpRHT", + "outputId": "2eb8bb20-f4e3-4e7c-bab4-c242a2daacbc" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. 
Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (169, 0) in UMAP fit, as it is not one dimensional" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Total 0.3 seconds passed\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = data3.pivot_table(index=['id_time'],columns='species')\n", + "df3=df2.fillna(0).reset_index()\n", + "df4=df3.droplevel(0, axis=1)\n", + "df4.index=df4.iloc[:,0]\n", + "df4=df4.loc[:, df4.columns.str.startswith('s__')]\n", + "\n", + "g = graphistry.nodes(cudf.from_pandas(df4))\n", + "\n", + "t=time()\n", + "\n", + "g3=g.umap(dbscan=True,engine='cuml')\n", + "print('\\n Total ', np.round(time() - t,1), 'seconds passed')\n", + "\n", + "g3.plot()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demos/demos_by_use_case/bio/SingleCellDemo.ipynb b/demos/demos_by_use_case/bio/SingleCellDemo.ipynb new file mode 100644 index 0000000000..2d3f56af33 --- /dev/null +++ b/demos/demos_by_use_case/bio/SingleCellDemo.ipynb @@ -0,0 +1,911 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5n26EioarG4C", + "metadata": { + "id": "5n26EioarG4C" + }, + "source": [ + "## Tutorial: Single-Cell Transcriptomics using UMAP" + ] + }, + { + "cell_type": "markdown", + "id": "0f3a459c-fa15-445b-8d65-cb701c063ac9", + "metadata": { + "id": "0f3a459c-fa15-445b-8d65-cb701c063ac9" + }, + "source": [ + "\n", + "Single cell gene expression can be analyzed faster and more easily explored using GPU-accelerated UMAP analysis & visualization. 
Using UMAP in this way, **the user can cluster cell types by patterns of gene expression**.\n", + "\n", + "* Task: Analyze single-cell gene expression for clustering\n", + "* Data: 5 independent datasets of roughly 30K rows by 200 columns of single-cell data\n", + "* [data](https://cytotrace.stanford.edu/#shiny-tab-dataset_download)\n", + "* [paper](https://arxiv.org/pdf/2208.05229.pdf)\n", + "\n", + "**Insight / Result:**\n", + "\n", + " 1. Speed: Go from minutes to seconds for entire samples of ~10,000 cells (102s vs 18s on a small T4 GPU)\n", + " 2. Visualization: Add interactivity, similarity edges, and GPU scale to otherwise hard-to-read static scatter plots" + ] + }, + { + "cell_type": "markdown", + "id": "JAgYkgAsvbnG", + "metadata": { + "id": "JAgYkgAsvbnG" + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "6lK_VrzZT1n7", + "metadata": { + "id": "6lK_VrzZT1n7" + }, + "source": [ + "### For the GPU-cloud-accelerated visualization step, get a free API key at https://hub.graphistry.com\n", + "\n", + "### For raw data, get a free Kaggle account from https://www.kaggle.com/docs/api" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33ae99ce-e386-4e97-b5a0-99b86fb6f68a", + "metadata": { + "id": "33ae99ce-e386-4e97-b5a0-99b86fb6f68a" + }, + "outputs": [], + "source": [ + "import os, time\n", + "from collections import Counter\n", + "import cProfile\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from pstats import Stats\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "pd.set_option('display.max_colwidth', 200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "UYVtE57QBy92", + "metadata": { + "id": "UYVtE57QBy92" + }, + "outputs": [], + "source": [ + "!pip install --extra-index-url=https://pypi.nvidia.com cuml-cu12\n", + "\n", + "!pip install graphistry[ai]\n", + "\n", + "!pip install -q
Biopython\n", + "!pip install -q scanpy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "FQUmN0NcTy8z", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "FQUmN0NcTy8z", + "outputId": "68d17e2d-76df-4248-fe5b-fcb0fb7ea843" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'0.33.0+97.ga86be5c'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import scanpy as sc\n", + "import anndata\n", + "\n", + "import graphistry\n", + "graphistry.register(api=3,protocol=\"https\", server=\"hub.graphistry.com\", username=g_user, password=g_pass) ## key id, secret key\n", + "\n", + "graphistry.__version__\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0i3vYvSw-OyK", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0i3vYvSw-OyK", + "outputId": "98e55a32-c279-4f40-9544-81dd4459d0ff" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mon Jul 8 12:42:32 2024 \n", + "+---------------------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n", + "|-----------------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. 
|\n", + "|=========================================+======================+======================|\n", + "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 44C P8 9W / 70W | 0MiB / 15360MiB | 0% Default |\n", + "| | | N/A |\n", + "+-----------------------------------------+----------------------+----------------------+\n", + " \n", + "+---------------------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=======================================================================================|\n", + "| No running processes found |\n", + "+---------------------------------------------------------------------------------------+\n" + ] + } + ], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "y_CdnuiH-Ras", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "y_CdnuiH-Ras", + "outputId": "1d42ea6c-0c23-475e-b656-bd6b4dfc5543" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'24.06.01'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import cuml, cudf\n", + "cuml.__version__" + ] + }, + { + "cell_type": "markdown", + "id": "lqWE36v0vU6l", + "metadata": { + "id": "lqWE36v0vU6l" + }, + "source": [ + "## Data Download & Description" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49GjBuuezGSS", + "metadata": { + "id": "49GjBuuezGSS" + }, + "outputs": [], + "source": [ + "import locale\n", + "locale.getpreferredencoding = lambda: \"UTF-8\"\n", + "!pip install kaggle -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "kjZSLidBSgd-", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + }, + "id": "kjZSLidBSgd-", + 
"outputId": "dda18fd3-b28c-4b4e-d351-84eb584970f7" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. Please rerun this cell to enable.\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving kaggle.json to kaggle.json\n", + "User uploaded file \"kaggle.json\" with length 62 bytes\n" + ] + } + ], + "source": [ + "from google.colab import files\n", + "\n", + "uploaded = files.upload()\n", + "\n", + "for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + "\n", + "# Then move kaggle.json into the folder where the API expects to find it.\n", + "!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "Mc-Q8Y-A0aLS", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Mc-Q8Y-A0aLS", + "outputId": "30754e5b-69d0-4080-9c16-19cf9ffa85ff" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset URL: https://www.kaggle.com/datasets/alexandervc/scrnaseq-collection-of-datasets\n", + "Dataset URL: https://www.kaggle.com/datasets/alexandervc/scrnaseq-collection-of-datasets\n", + "Archive: GSE107910_40.h5ad.zip\n", + " inflating: GSE107910_40.h5ad \n", + "Archive: GSE67123_6.h5ad.zip\n", + " inflating: GSE67123_6.h5ad \n" + ] + } + ], + "source": [ + "#download 2 single cell datasets\n", + "import kaggle as kg\n", + "import pandas as pd\n", + "import os\n", + "\n", + "os.environ['KAGGLE_USERNAME'] = kaggle_user\n", + "os.environ['KAGGLE_KEY'] = kaggle_pass\n", + "\n", + "kg.api.authenticate()\n", + "kg.api.dataset_download_file(dataset = 
\"alexandervc/scrnaseq-collection-of-datasets\", file_name='Cytotrace/GSE67123_6.h5ad')\n", + "kg.api.dataset_download_file(dataset = \"alexandervc/scrnaseq-collection-of-datasets\", file_name='Cytotrace/GSE107910_40.h5ad')\n", + "\n", + "!unzip -o GSE107910_40.h5ad.zip\n", + "!unzip -o GSE67123_6.h5ad.zip\n", + "\n", + "!mkdir -p single_cell\n", + "!mv *.h5ad single_cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff46a8c8-5882-41a6-83ce-eb8c00c2fc70", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ff46a8c8-5882-41a6-83ce-eb8c00c2fc70", + "outputId": "735b9597-2012-4916-f4e3-c2270c680b43" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Thymus (Drop-seq)' 'Validation' '15429' '9307.0' 'nan' '9307' '19530'\n", + " '8' '8' 'UMI' 'Mouse' '1' 'Thymus' 'Drop-seq' 'Timepoints' 'in vivo'\n", + " '29884461' '20180619' 'GSE107910' 'Immunity'\n", + " 'Only hematopoietic cells, selected based on detectable Ptprc expression, were considered in this dataset. 
']\n", + "\n", + "['Embryonic HSCs (Tang et al.)' 'Validation' '143' 'nan' 'nan' '143'\n", + " '24028' '5' '5' 'TPM/FPKM' 'Mouse' '1' 'Embryo' 'Tang et al.'\n", + " 'Timepoints' 'in vivo' '27225119' '20160526' 'GSE67123' 'Nature' 'nan']\n", + "\n" + ] + } + ], + "source": [ + "list_files = []\n", + "for dirname, _, filenames in os.walk('single_cell'):\n", + " for filename in filenames:\n", + " list_files.append(os.path.join(dirname, filename))\n", + "\n", + "for fn in list_files:\n", + " adata = sc.read(fn)\n", + " print( adata.uns['info'] )\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "id": "dQaaHeHu7X54", + "metadata": { + "id": "dQaaHeHu7X54" + }, + "source": [ + "# compute UMAP on GPU for GSE107910_40 Murine Thymus cells\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc6f3beb-13c9-4d11-b8b2-35676f711f50", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cc6f3beb-13c9-4d11-b8b2-35676f711f50", + "outputId": "7b79b744-e23d-4182-b2bb-5082977e92db" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. 
Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (18335, 0) in UMAP fit, as it is not one dimensional" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Total 38.1 seconds passed\n" + ] + } + ], + "source": [ + "fn='single_cell/GSE107910_40.h5ad'\n", + "adata = sc.read(fn)\n", + "str_data_inf = fn.split('/')[1].split('.')[0] + ' ' + str(adata.X.shape)+'\\n' + adata.uns['info'][0]\n", + "\n", + "EE=pd.DataFrame(adata.X,columns=adata.uns['gcsGenesNames'],index=adata.uns['allcellnames'])\n", + "g1=graphistry.nodes(cudf.from_pandas(EE.T))\n", + "t0 = time.time()\n", + "\n", + "g22 = g1.umap(\n", + " use_scaler='robust', ## zscale, minmax, standard, normal,\n", + " n_components=2,\n", + " n_neighbors=12,\n", + " engine='cuml' ## cannot even run in available RAM, try by switching to engine='umap_learn'\n", + " )\n", + "\n", + "print('\\n Total ', np.round(time.time() - t0,1), 'seconds passed')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8USJgBC34bEt", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 543 + }, + "id": "8USJgBC34bEt", + "outputId": "cab7972c-e452-4d43-d216-fab920b572eb" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "emb2=g22._node_embedding\n", + "\n", + "A=emb2.reset_index()['index'].to_pandas()\n", + "\n", + "B=g22._edges\n", + "B['_src_implicit'] = B['_src_implicit'].replace(A, regex=True)\n", + "B['_dst_implicit'] = B['_dst_implicit'].replace(A, regex=True)\n", + "\n", + "g33=graphistry.nodes(emb2.reset_index(),'index').edges(B.dropna(),'_src_implicit','_dst_implicit').bind(point_x=\"x\",point_y=\"y\").settings(url_params={\"play\":0})\n", + "\n", + "g33.plot()" + ] + }, + { + "cell_type": "markdown", + "id":
"ToIOD-XzO7JD", + "metadata": { + "id": "ToIOD-XzO7JD" + }, + "source": [ + "## This paper was specifically interested in peak mitosis genes, i.e., [\"Tirosh\" genes](https://genome.cshlp.org/content/25/12/1860.short), so let's zoom in on those" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "B_qu0elxdY_w", + "metadata": { + "id": "B_qu0elxdY_w" + }, + "outputs": [], + "source": [ + "fn='single_cell/GSE107910_40.h5ad'\n", + "import scanpy as sc\n", + "import anndata\n", + "adata = sc.read(fn)\n", + "str_data_inf = fn.split('/')[1].split('.')[0] + ' ' + str(adata.X.shape)+'\\n' + adata.uns['info'][0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "Ty0JI0fed1po", + "metadata": { + "id": "Ty0JI0fed1po" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "XW2PejpWO4Ta", + "metadata": { + "id": "XW2PejpWO4Ta" + }, + "outputs": [], + "source": [ + "S_phase_genes_Tirosh = ['MCM5', 'PCNA', 'TYMS', 'FEN1', 'MCM2', 'MCM4', 'RRM1', 'UNG', 'GINS2', 'MCM6', 'CDCA7', 'DTL', 'PRIM1', 'UHRF1', 'MLF1IP', 'HELLS', 'RFC2', 'RPA2', 'NASP', 'RAD51AP1', 'GMNN', 'WDR76', 'SLBP', 'CCNE2', 'UBR7', 'POLD3', 'MSH2', 'ATAD2', 'RAD51', 'RRM2', 'CDC45', 'CDC6', 'EXO1', 'TIPIN', 'DSCC1', 'BLM', 'CASP8AP2', 'USP1', 'CLSPN', 'POLA1', 'CHAF1B', 'BRIP1', 'E2F8']\n", + "G2_M_genes_Tirosh = ['HMGB2', 'CDK1', 'NUSAP1', 'UBE2C', 'BIRC5', 'TPX2', 'TOP2A', 'NDC80', 'CKS2', 'NUF2', 'CKS1B', 'MKI67', 'TMPO', 'CENPF', 'TACC3', 'FAM64A', 'SMC4', 'CCNB2', 'CKAP2L', 'CKAP2', 'AURKB', 'BUB1', 'KIF11', 'ANP32E', 'TUBB4B', 'GTSE1', 'KIF20B', 'HJURP', 'CDCA3', 'HN1', 'CDC20', 'TTK', 'CDC25C', 'KIF2C', 'RANGAP1', 'NCAPD2', 'DLGAP5', 'CDCA2', 'CDCA8', 'ECT2', 'KIF23', 'HMMR', 'AURKA', 'PSRC1', 'ANLN', 'LBR', 'CKAP5', 'CENPE', 'CTCF', 'NEK2', 'G2E3', 'GAS2L3', 'CBX5', 'CENPA']\n", + "u = 'allgenenames'\n", + "list_genes_upper = [t.upper() for t in adata.uns[u] ]\n", +
"I = np.where( pd.Series(list_genes_upper).isin( S_phase_genes_Tirosh + G2_M_genes_Tirosh ) )[0]\n" + ] + }, + { + "cell_type": "markdown", + "id": "bJ7h6QjQemw_", + "metadata": { + "id": "bJ7h6QjQemw_" + }, + "source": [ + "## CPU UMAP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "YW2M1hwTekeW", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YW2M1hwTekeW", + "outputId": "7661f1d1-5747-4b9a-ae13-5e3754e6f1cf" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:graphistry.umap_utils:* Ignoring target column of shape (96, 0) in UMAP fit, as it is not one dimensional" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Total 26.6 seconds passed\n" + ] + } + ], + "source": [ + "EE=pd.DataFrame(adata.X[:,I],columns=adata.uns['gcsGenesNames'][I],index=adata.uns['allcellnames'])\n", + "g1=graphistry.nodes(cudf.from_pandas(EE.T))\n", + "t0 = time.time()\n", + "\n", + "g11 = g1.umap(\n", + " use_scaler='robust', ## zscale, minmax, standard, normal,\n", + " n_components=2,\n", + " n_neighbors=12,\n", + " engine='umap_learn'\n", + " )\n", + "\n", + "\n", + "print('\\n Total ', np.round(time.time() - t0,1), 'seconds passed')\n" + ] + }, + { + "cell_type": "markdown", + "id": "tnH6euKnenSJ", + "metadata": { + "id": "tnH6euKnenSJ" + }, + "source": [ + "### GPU UMAP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0X3aqsRCO9gR", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0X3aqsRCO9gR", + "outputId": "22d64c62-8f9f-4499-a8e7-cc5f098bcddd" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. 
Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (96, 0) in UMAP fit, as it is not one dimensional" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Total 15.5 seconds passed\n" + ] + } + ], + "source": [ + "EE=pd.DataFrame(adata.X[:,I],columns=adata.uns['gcsGenesNames'][I],index=adata.uns['allcellnames'])\n", + "g1=graphistry.nodes(cudf.from_pandas(EE.T)) #,columns=adata1.uns['gcsGenesNames']))\n", + "\n", + "t0 = time.time()\n", + "\n", + "g11 = g1.umap(\n", + " use_scaler='robust', ## zscale, minmax, standard, normal,\n", + " n_components=2,\n", + " n_neighbors=12,\n", + " engine='cuml'\n", + " )\n", + "\n", + "\n", + "print('\\n Total ', np.round(time.time() - t0,1), 'seconds passed')\n" + ] + }, + { + "cell_type": "markdown", + "id": "F9yg_KST7-oK", + "metadata": { + "id": "F9yg_KST7-oK" + }, + "source": [ + "### Visualize\n", + "\n", + "* Nodes are cells\n", + "* Edges are similarity relationships\n", + "* Initial layout is from the UMAP dimensionality reduction to 2D\n", + "* Interactive layout is an aesthetically-optimized force-directed graph over the similarity graph, which is more interpretable for dense clusters\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "OIxdQw4DO9jF", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 543 + }, + "id": "OIxdQw4DO9jF", + "outputId": "395b79f0-be12-407f-9653-a17508b76a2e" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "g11.plot()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + 
}, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}