diff --git a/README.md b/README.md
index b4dcae3167..e0931a50e3 100644
--- a/README.md
+++ b/README.md
@@ -714,7 +714,7 @@ Set visual attributes through [quick data bindings](https://hub.graphistry.com/d
Port Scan Attack
 |
- Protein Interactions
Source: BioGRID |
+ Protein Interactions
Source: BioGRID BioGRID Demo Notebook |
Programming Languages
Source: Socio-PLT project |
diff --git a/demos/demos_by_use_case/bio/BiogridDemo.ipynb b/demos/demos_by_use_case/bio/BiogridDemo.ipynb
index 96d0f88411..613c2a0529 100644
--- a/demos/demos_by_use_case/bio/BiogridDemo.ipynb
+++ b/demos/demos_by_use_case/bio/BiogridDemo.ipynb
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -25,7 +25,7 @@
"import graphistry\n",
"\n",
"# To specify Graphistry account & server, use:\n",
- "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n",
+ "graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n",
"# For more options, see https://github.com/graphistry/pygraphistry#configure\n"
]
},
@@ -39,7 +39,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 5,
"metadata": {
"scrolled": false
},
@@ -48,26 +48,26 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
- " interactivity=interactivity, compiler=compiler, result=result)\n"
+ "/var/folders/sx/x954rbdd44d932dd0ygfp1qc0000gn/T/ipykernel_39037/263179729.py:2: DtypeWarning: Columns (19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " rawdata = pandas.read_table(url1, na_values=['-'], engine='c')#, compression='gzip')\n"
]
},
{
"data": {
"text/html": [
"\n",
- "\n",
"
\n",
" \n",
@@ -134,17 +134,14 @@
"2 Wang T (1996) Low Throughput "
]
},
- "execution_count": 2,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "url1 = 'https://s3-us-west-1.amazonaws.com/graphistry.demo.data/BIOGRID-ALL-3.3.123.tab2.txt.gz'\n",
- "rawdata = pandas.read_table(url1, na_values=['-'], engine='c', compression='gzip')\n",
- "\n",
- "# If using local data, comment the two lines above and uncomment the line below\n",
- "# pandas.read_table('./data/BIOGRID-ALL-3.3.123.tab2.txt', na_values=['-'], engine='c')\n",
+ "url1 = 'https://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.3.123/BIOGRID-ALL-3.3.123.tab2.zip'\n",
+ "rawdata = pandas.read_table(url1, na_values=['-'], engine='c')#, compression='gzip')\n",
"\n",
"cols = ['BioGRID ID Interactor A', 'BioGRID ID Interactor B', 'Official Symbol Interactor A', \n",
" 'Official Symbol Interactor B', 'Pubmed ID', 'Author', 'Throughput']\n",
@@ -162,23 +159,25 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
- " \n",
" \n",
" \n",
" "
],
@@ -186,7 +185,7 @@
""
]
},
- "execution_count": 3,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -206,7 +205,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 7,
"metadata": {
"scrolled": true
},
@@ -215,18 +214,18 @@
"data": {
"text/html": [
"\n",
- "\n",
"
\n",
" \n",
@@ -263,19 +262,15 @@
"22 3 Arabidopsis thaliana"
]
},
- "execution_count": 4,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This downloads 170 MB, it might take some time.\n",
- "url2 = 'https://s3-us-west-1.amazonaws.com/graphistry.demo.data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt.gz'\n",
- "raw_proteins = pandas.read_table(url2, na_values=['-'], engine='c', compression='gzip')\n",
- "\n",
- "# If using local data, comment the two lines above and uncomment the line below\n",
- "# raw_proteins = pandas.read_table('./data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt', na_values=['-'], engine='c')\n",
- "\n",
+ "url2 = 'http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.3.123/BIOGRID-IDENTIFIERS-3.3.123.tab.zip'\n",
+ "raw_proteins = pandas.read_table(url2, na_values=['-'], skiprows=28)#, engine='c', compression='gzip')\n",
"\n",
"protein_ids = raw_proteins[['BIOGRID_ID', 'ORGANISM_OFFICIAL_NAME']].drop_duplicates() \\\n",
" .rename(columns={'ORGANISM_OFFICIAL_NAME': 'ORGANISM'})\n",
@@ -291,7 +286,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 8,
"metadata": {
"scrolled": true
},
@@ -300,18 +295,18 @@
"data": {
"text/html": [
"\n",
- "\n",
"
\n",
" \n",
@@ -348,7 +343,7 @@
"2 106605 ACVR1"
]
},
- "execution_count": 5,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -375,25 +370,25 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
- "\n",
"
\n",
" \n",
@@ -434,7 +429,7 @@
"2 106605 ACVR1 Homo sapiens"
]
},
- "execution_count": 6,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -453,7 +448,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -470,25 +465,25 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
- "\n",
"
\n",
" \n",
@@ -538,7 +533,7 @@
"2 Homo sapiens 0 "
]
},
- "execution_count": 8,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -557,7 +552,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 12,
"metadata": {
"scrolled": true
},
@@ -566,18 +561,18 @@
"data": {
"text/html": [
"\n",
- "\n",
"
\n",
" \n",
@@ -649,7 +644,7 @@
"2 Low Throughput "
]
},
- "execution_count": 9,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -673,32 +668,27 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 13,
"metadata": {
"scrolled": false
},
"outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Uploading 7139 kB. This may take a while...\n"
- ]
- },
{
"data": {
"text/html": [
"\n",
- " \n",
" \n",
" \n",
" "
],
@@ -706,7 +696,7 @@
""
]
},
- "execution_count": 10,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -716,15 +706,6 @@
"g2 = g.bind(node='BIOGRID_ID', edge_title='Author', point_title='SYMBOL', point_color='Color')\n",
"g2.plot(interactions, protein_labels)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -743,7 +724,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.11"
+ "version": "3.12.2"
}
},
"nbformat": 4,
diff --git a/demos/demos_by_use_case/bio/ChemicalMappingDemo.ipynb b/demos/demos_by_use_case/bio/ChemicalMappingDemo.ipynb
new file mode 100644
index 0000000000..25c62d95de
--- /dev/null
+++ b/demos/demos_by_use_case/bio/ChemicalMappingDemo.ipynb
@@ -0,0 +1,865 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dzmJAwfiAi6k"
+ },
+ "source": [
+ "# Accelerated Chemical Mapping with [Graphistry](https://www.graphistry.com)\n",
+ "\n",
+ "This notebook visualizes a chemical dataset describing Blood Brain Barrier Permeability (BBBP) from [MoleculeNet](http://moleculenet.org/datasets-1) and [ECFPs](https://pubs.acs.org/doi/10.1021/ci100050t).\n",
+ "\n",
+ "Using these string formulations of molecular 3D structure we can take advantage of string-based computational algorithms. These string representations look like the following:\n",
+ "\n",
+ "\n",
+ "* OCC#Cc1cc(Cl)c(C(=O)Nc2ccnc(NC(=O)C3CC3)c2)c(Cl)c1\n",
+ "151276 \n",
+ "* CCNc1ncnc2c1nc(NC3CCCC3)n2[C@@H]4O[C@H](CO)[C@@H](O)[C@H]4O\n",
+ "172750 \n",
+ "* CCC(C1=C(O)C2=C(CCCCCC2)OC1=O)c3cccc(NS(=O)(=O)c4ccc(Cl)cc4)c3\n",
+ "155015 \n",
+ "* CC1CCN(CC1)c2nc(ccc2CNC(=O)Nc3ccc(CNS(=O)(=O)C)c(F)c3)C(F)(F)F\n",
+ "\n",
+ "\n",
+ "The formulation of the structure into linear form helps us immensely, and thus we are able to parse and reduce these complex molecules down to 2 dimensions using conventional statistical tools, namely UMAP. Ultimately we demonstrate how such an OPEN-SOURCE analysis can be sped up and scaled up massively with the [Graphistry](https://www.graphistry.com) environment and toolkit.\n",
+ "\n",
+ "\n",
+ "* Speedup: From minutes to seconds - 3 min to 10 seconds on a small T4 GPU\n",
+ "* Visual insight: Add interactivity, similarity edges, and visual scale to a traditional static scatterplot to better investigate pairwise correlations and overall clusters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "w89wE473URRH"
+ },
+ "source": [
+ "# Import accelerator libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ABA00KeDT6Gx"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install -q --extra-index-url=https://pypi.nvidia.com cuml-cu12\n",
+ "import cuml,cudf\n",
+ "print(cuml.__version__)\n",
+ "\n",
+ "!pip -q install graphistry[ai]\n",
+ "# !pip install -U -q --force git+https://github.com/graphistry/pygraphistry.git#@dev/depman_gpufeat\n",
+ "# !pip install cu_cat"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7_t1UL7YUAx3",
+ "outputId": "6f19365b-efd9-4b2e-fab6-2a9a92c020f8"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.33.9\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Standard library\n",
+ "import cProfile\n",
+ "import os\n",
+ "import warnings\n",
+ "from collections import Counter\n",
+ "from getpass import getpass\n",
+ "from pstats import Stats\n",
+ "from time import time\n",
+ "from typing import List\n",
+ "\n",
+ "# Third-party\n",
+ "import cudf\n",
+ "import cuml\n",
+ "import graphistry\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "\n",
+ "# import cu_cat\n",
+ "# print(cu_cat.__file__)\n",
+ "\n",
+ "warnings.filterwarnings('ignore')\n",
+ "pd.set_option('display.max_colwidth', 200)\n",
+ "\n",
+ "# Graphistry credentials (username/password, or personal key id/secret key).\n",
+ "# They are read from the environment, falling back to an interactive prompt, so the cell\n",
+ "# runs on a fresh kernel and no secret is ever stored in the notebook itself.\n",
+ "g_user = os.environ.get('GRAPHISTRY_USERNAME') or input('Graphistry username: ')\n",
+ "g_pass = os.environ.get('GRAPHISTRY_PASSWORD') or getpass('Graphistry password: ')\n",
+ "graphistry.register(api=3, protocol=\"https\", server=\"hub.graphistry.com\", username=g_user, password=g_pass)\n",
+ "\n",
+ "print(graphistry.__version__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "EtU413LOUG_S",
+ "outputId": "0c6e2fd0-0c87-4dae-b53f-325316af75e0"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Tesla T4\n"
+ ]
+ }
+ ],
+ "source": [
+ "!nvidia-smi --query-gpu=gpu_name --format=csv,noheader"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ZfM8WRfW4gOY"
+ },
+ "source": [
+ "# Import Basics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "prDjzDTU384B"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install -q rdkit\n",
+ "!pip install --pre -q deepchem\n",
+ "\n",
+ "from rdkit import Chem, DataStructs\n",
+ "from rdkit.Chem.rdchem import Mol\n",
+ "from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser\n",
+ "\n",
+ "from rdkit import RDLogger\n",
+ "lg = RDLogger.logger()\n",
+ "lg.setLevel(RDLogger.CRITICAL)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "L8NcwLVK5s_i"
+ },
+ "source": [
+ "# Embed BBBP in Global Chemical Space Approximation (Dataset-Agnostic Embedding)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gdXebUL45usJ"
+ },
+ "source": [
+ "### Read in and process ChEMBL data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "HUbDr8Dp5uOB"
+ },
+ "outputs": [],
+ "source": [
+ "# Read in the sparse ChEMBL set hosted by MoleculeNet/DeepChem\n",
+ "chembl = pd.read_csv(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/chembl_sparse.csv.gz\", compression='gzip')\n",
+ "\n",
+ "# Sample a random 20k rows; the fixed seed keeps the subset (and therefore the\n",
+ "# embeddings and timings computed from it below) reproducible across kernel restarts\n",
+ "chembl = chembl.sample(n=20000, random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "x9T_sR0oaJvq"
+ },
+ "outputs": [],
+ "source": [
+ "# Simplify the exercise by keeping only \"short molecules\": SMILES strings under 500 characters.\n",
+ "# Rows whose SMILES is missing compare as False here and are therefore excluded as well.\n",
+ "# `chembl` is reused as-is, so the full archive is not downloaded a second time.\n",
+ "chem_data = chembl.loc[chembl.smiles.str.len() < 500, \"smiles\"]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GAh6_ylDa74q",
+ "outputId": "a4cf8f91-851f-4bac-c30a-6d4c8baf3f4a"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "201332 OCC#Cc1cc(Cl)c(C(=O)Nc2ccnc(NC(=O)C3CC3)c2)c(Cl)c1\n",
+ "151276 CCNc1ncnc2c1nc(NC3CCCC3)n2[C@@H]4O[C@H](CO)[C@@H](O)[C@H]4O\n",
+ "172750 CCC(C1=C(O)C2=C(CCCCCC2)OC1=O)c3cccc(NS(=O)(=O)c4ccc(Cl)cc4)c3\n",
+ "155015 CC1CCN(CC1)c2nc(ccc2CNC(=O)Nc3ccc(CNS(=O)(=O)C)c(F)c3)C(F)(F)F\n",
+ "231881 Cc1nc(cs1)C#Cc2cc(Cl)cc(c2)C#N\n",
+ " ... \n",
+ "197652 CN(C)C(=O)c1cc2cc(Nc3nccc(n3)c4cn(cn4)C5CC5)cc(Cl)c2[nH]1\n",
+ "63558 COc1cc(OC)cc(\\C=C\\2/CCC\\C(=C/c3ccccc3F)\\C2=O)c1\n",
+ "23052 CCN1CCC(=C(C1)C(=O)OCCc2ccccn2)c3ccccc3\n",
+ "154256 CN[C@@H]1CCN(C1)c2nc(N)nc3c2CCCc4ccccc34\n",
+ "72859 COc1ccc(cc1)C2(N=C(N)c3nc(C)sc23)c4cccc(c4)c5cncnc5\n",
+ "Name: smiles, Length: 19959, dtype: object"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chem_data.dropna()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2f9Zj5LGxYbo"
+ },
+ "source": [
+ " ## with CPU"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "qDVN9-VgxWMu",
+ "outputId": "ed4bb475-3ff7-4bd8-9e53-d58f80c5690d"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:graphistry.umap_utils:* Ignoring target column of shape (19959, 0) in UMAP fit, as it is not one dimensional"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " Total 174.6 seconds passed\n"
+ ]
+ }
+ ],
+ "source": [
+ "# CPU baseline: embed the SMILES strings with umap-learn and report the wall-clock time\n",
+ "g2 = graphistry.nodes(chem_data)\n",
+ "\n",
+ "t = time()\n",
+ "g4 = g2.umap(\n",
+ "    engine='umap_learn',\n",
+ "    metric='jaccard',\n",
+ "    n_neighbors=25,\n",
+ "    n_components=2,\n",
+ "    dbscan=True,\n",
+ "    min_dist=0.001,\n",
+ ")\n",
+ "print('\\n Total ', np.round(time() - t, 1), 'seconds passed')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-eahF167xaSR"
+ },
+ "source": [
+ "## and GPU"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "DnXz47yx9MdU",
+ "outputId": "f01dcd41-7a45-4d3e-a475-d4c1fc85a662"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (19959, 0) in UMAP fit, as it is not one dimensional"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " Total 9.3 seconds passed\n"
+ ]
+ }
+ ],
+ "source": [
+ "# GPU run: identical UMAP settings, but executed by cuML so the timing is directly comparable\n",
+ "g2 = graphistry.nodes(chem_data)\n",
+ "\n",
+ "t = time()\n",
+ "g4 = g2.umap(\n",
+ "    engine='cuml',\n",
+ "    metric='jaccard',\n",
+ "    n_neighbors=25,\n",
+ "    n_components=2,\n",
+ "    dbscan=True,\n",
+ "    min_dist=0.001,\n",
+ ")\n",
+ "print('\\n Total ', np.round(time() - t, 1), 'seconds passed')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 543
+ },
+ "id": "v3SEPYwrcaNa",
+ "outputId": "f196de85-080f-45e2-c559-f877331a60a9"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g4.plot()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jovz3xVc4M8X"
+ },
+ "source": [
+ "# Embed BBBP with UMAP"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ZlQqzKMB4Qq7"
+ },
+ "source": [
+ "### Read in and process small data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "HR2S6BbP4JlB"
+ },
+ "outputs": [],
+ "source": [
+ "# Load the BBBP table published by MoleculeNet\n",
+ "bbbp = pd.read_csv(\"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv\")\n",
+ "\n",
+ "# Keep the three columns used below and give the label column an easier-to-interpret name\n",
+ "bbbp = (\n",
+ "    bbbp[[\"smiles\", \"p_np\", \"name\"]]\n",
+ "    .reset_index(drop=True)\n",
+ "    .rename(columns={\"p_np\": \"permeable\"})\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 424
+ },
+ "id": "i9iRe44glrPq",
+ "outputId": "69043bd0-9660-4b94-f1c4-cf8e8b9c8080"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"BBBP[['name','permeable']][BBBP\",\n \"rows\": 2020,\n \"fields\": [\n {\n \"column\": \"name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2020,\n \"samples\": [\n \"GR94839_I\",\n \"carbamazepine\",\n \"testolactone\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"permeable\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe"
+ },
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " permeable | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Propanolol | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Terbutylchlorambucil | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 40730 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 24 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " cloxacillin | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 2045 | \n",
+ " licostinel | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2046 | \n",
+ " ademetionine(adenosyl-methionine) | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2047 | \n",
+ " mesocarb | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2048 | \n",
+ " tofisoline | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2049 | \n",
+ " azidamfenicol | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2020 rows × 2 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " name permeable\n",
+ "0 Propanolol 1\n",
+ "1 Terbutylchlorambucil 1\n",
+ "2 40730 1\n",
+ "3 24 1\n",
+ "4 cloxacillin 1\n",
+ "... ... ...\n",
+ "2045 licostinel 1\n",
+ "2046 ademetionine(adenosyl-methionine) 1\n",
+ "2047 mesocarb 1\n",
+ "2048 tofisoline 1\n",
+ "2049 azidamfenicol 1\n",
+ "\n",
+ "[2020 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "BBBP=bbbp[~bbbp.name.duplicated(keep='first')]\n",
+ "BBBP[['name','permeable']][BBBP.smiles.str.len()>3]#.reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6L8TZ5qx530H"
+ },
+ "source": [
+ "### ... and with graphistry"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "vA8LG2dFgKrc",
+ "outputId": "c4b1ceba-0f48-4239-b600-1302dcb9eda8"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (2020, 0) in UMAP fit, as it is not one dimensional"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " Total 43.0 seconds passed\n"
+ ]
+ }
+ ],
+ "source": [
+ "BBBP=bbbp[~bbbp.name.duplicated(keep='first')]\n",
+ "\n",
+ "g = graphistry.nodes(cudf.from_pandas(BBBP[['smiles','permeable']][BBBP.smiles.str.len()>3]))\n",
+ "t=time()\n",
+ "# g2=g.featurize(feature_engine='cu_cat',memoize=True)\n",
+ "g3=g.umap(engine='cuml',metric = \"jaccard\",\n",
+ " n_neighbors = 25,\n",
+ " n_components = 2,\n",
+ " low_memory = False,\n",
+ " min_dist = 0.001)\n",
+ "print('\\n Total ', np.round(time() - t,1), 'seconds passed')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 543
+ },
+ "id": "WMzz0EfaqR03",
+ "outputId": "c386f527-42c1-4e07-88a4-701238d73908"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g3.encode_point_color('permeable',palette=[\"hotpink\", \"dodgerblue\"],as_continuous=True).plot()\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "collapsed_sections": [
+ "jovz3xVc4M8X"
+ ],
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/demos/demos_by_use_case/bio/MetagenomicDemo.ipynb b/demos/demos_by_use_case/bio/MetagenomicDemo.ipynb
new file mode 100644
index 0000000000..5061f79320
--- /dev/null
+++ b/demos/demos_by_use_case/bio/MetagenomicDemo.ipynb
@@ -0,0 +1,1073 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ozrca88hza85"
+ },
+ "source": [
+ "# Accelerating metagenomic analysis with [Graphistry](https://www.graphistry.com) focusing on viral tracing over time\n",
+ "\n",
+ "## [viral calling pipeline here](https://github.com/dcolinmorgan/viral_snake)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tLRHg2VEzoYy"
+ },
+ "source": [
+ "Using GPU-accelerated UMAP + DBScan analysis & visualization, metagenomic samples' bacterial compositions can be clustered and compared faster and much more easily explored.\n",
+ "\n",
+ "* Task: Analyze metagenomic samples for similarity\n",
+ "* Data: time series samples\n",
+ "* 563 samples collected from 84 donors, producing 4 dense long-term time series (up to 1 sample every other day during 18 months)\n",
+ "* Clustering: the species component extracted from time-stamped patient samples, e.g., tuple of \n",
+ "* Each **node** is , and clustering is on species=abc text similarity\n",
+ "* *n.b.* since species text is full taxa information, text comparison can return degree similarity\n",
+ "\n",
+ "* [data](https://www.ebi.ac.uk/ena/browser/view/PRJNA544527)\n",
+ "* [metadata](https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0559-3/MediaObjects/41591_2019_559_MOESM3_ESM.xlsx)\n",
+ "* [paper](https://doi.org/10.1038/s41591-019-0559-3)\n",
+ "\n",
+ "\n",
+ "**Insight/ Result:**\n",
+ "\n",
+ "* 43s to umap and dbscan vs 2342s on a small T4 GPU\n",
+ "* over **50X** faster for a single run\n",
+ "* since [the reference paper for this analysis](https://journals.asm.org/doi/full/10.1128/msystems.00118-23) runs this analysis 12x per dataset (here we only have 1 dataset), we could expect to save nearly the entire 8hrs for this dataset, taking less than 10 minutes in total"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "PjnS_PCWaClg"
+ },
+ "source": [
+ "# Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "pFKuEaZClwXa"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install -q --extra-index-url=https://pypi.nvidia.com cuml-cu12\n",
+ "import cuml,cudf\n",
+ "print(cuml.__version__)\n",
+ "\n",
+ "!pip -q install graphistry[ai]\n",
+ "\n",
+ "!pip install -q Biopython"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Ob5QrPuiAf-Q"
+ },
+ "outputs": [],
+ "source": [
+ "import locale\n",
+ "def getpreferredencoding(do_setlocale = True):\n",
+ " return \"UTF-8\"\n",
+ "locale.getpreferredencoding = getpreferredencoding"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HK_8_7UB0mhx"
+ },
+ "source": [
+ "# import /configure\n",
+ "\n",
+ "visualization step, get a free API key at https://hub.graphistry.com\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "i42QrQ_ejC4h",
+ "outputId": "f8c19225-336f-4dd9-e2eb-7b23e184bc51"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "24.06.01\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "from getpass import getpass\n",
+ "from time import time\n",
+ "\n",
+ "import cudf\n",
+ "import cuml\n",
+ "import graphistry\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Graphistry credentials (username/password, or personal key id/secret key).\n",
+ "# They are read from the environment, falling back to an interactive prompt, so the cell\n",
+ "# runs on a fresh kernel and no secret is ever stored in the notebook itself.\n",
+ "g_user = os.environ.get('GRAPHISTRY_USERNAME') or input('Graphistry username: ')\n",
+ "g_pass = os.environ.get('GRAPHISTRY_PASSWORD') or getpass('Graphistry password: ')\n",
+ "graphistry.register(api=3, protocol=\"https\", server=\"hub.graphistry.com\", username=g_user, password=g_pass)\n",
+ "\n",
+ "# Print both versions; a bare expression in the middle of a cell displays nothing\n",
+ "print(graphistry.__version__)\n",
+ "print(cuml.__version__)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_Y1VlCGy4_FR"
+ },
+ "source": [
+ "# bio-ml dataset\n",
+ "\n",
+ "\n",
+ "1. [3 subjects x 10 time points](\n",
+ "https://www.ebi.ac.uk/ena/browser/view/PRJNA544527)\n",
+ "\n",
+ "2. [metadata](\n",
+ "https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0559-3/MediaObjects/41591_2019_559_MOESM3_ESM.xlsx)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "hO1HFIMO6ttV"
+ },
+ "outputs": [],
+ "source": [
+ "!wget https://gist.githubusercontent.com/lmeyerov/61a6a7d5fa0dbe51e786ed52408ac360/raw/11a11aa0b865ceb96880b2cd2ae3b12f1ef947c8/gistfile1.txt -O PRJNA544527_mpa4out.txt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "CBN6Z_77Sduq"
+ },
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "%%bash\n",
+ "# Fallback: fetch one raw paired-end sample only when the precomputed profile is absent.\n",
+ "# Commands are written without the IPython `!` prefix: inside a %%bash cell the text is\n",
+ "# handed to bash as-is, where `!wget` would be looked up as a command literally named `!wget`.\n",
+ "if [ ! -f PRJNA544527_mpa4out.txt ]; then\n",
+ "    wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR922/006/SRR9224006/SRR9224006_1.fastq.gz\n",
+ "    wget -nc ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR922/006/SRR9224006/SRR9224006_2.fastq.gz\n",
+ "\n",
+ "    gunzip SRR9224006_1.fastq.gz\n",
+ "    gunzip SRR9224006_2.fastq.gz\n",
+ "\n",
+ "    head /content/SRR9224006_1.fastq\n",
+ "fi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "UWDGFCWxSpv_"
+ },
+ "outputs": [],
+ "source": [
+ "import glob\n",
+ "import os\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Fallback path: raw reads are only parsed when the precomputed profile is absent.\n",
+ "if not os.path.exists('PRJNA544527_mpa4out.txt'):\n",
+ "    from Bio import SeqIO  # only required on this fallback path\n",
+ "\n",
+ "    frames = []\n",
+ "    for fastq_path in glob.glob('/content/*.fastq'):\n",
+ "        # The context manager closes each FASTQ handle as soon as the file has been parsed\n",
+ "        with open(fastq_path) as handle:\n",
+ "            records = [(rec.id, str(rec.seq)) for rec in SeqIO.parse(handle, 'fastq')]\n",
+ "        frames.append(pd.DataFrame(records, columns=['ID', 'seq']).dropna())\n",
+ "\n",
+ "    # DataFrame.append was removed in pandas 2.0; concatenating once also avoids\n",
+ "    # re-copying the accumulated frame on every iteration\n",
+ "    B = pd.concat(frames) if frames else pd.DataFrame(columns=['ID', 'seq'])\n",
+ "\n",
+ "    # Keep only the part of each read identifier before the '_length' suffix and index by it\n",
+ "    B['ID'] = B['ID'].str.split('_length').str[0]\n",
+ "    B.index = B['ID']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fDvymRFjHPNB"
+ },
+ "source": [
+ "# install [HUMAnN 3](https://huttenhower.sph.harvard.edu/humann), a method for efficiently and accurately profiling the abundance of microbial metabolic pathways and other molecular functions from metagenomic or metatranscriptomic sequencing data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gtOG0QeoUoX6"
+ },
+ "source": [
+ "### takes very long for running all samples\n",
+ " (1day+ run on cluster)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "yIeeDXPBHN6D"
+ },
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "\n",
+ "if [ ! -f PRJNA544527_mpa4out.txt ]; then\n",
+ "\n",
+ " pip install humann --no-binary :all:\n",
+ " pip install metaphlan\n",
+ "\n",
+ " humann_databases --download utility_mapping full /path/to/databases --update-config yes\n",
+ "\n",
+ " # humann_test\n",
+ " wget https://github.com/biobakery/humann/raw/master/examples/demo.fastq.gz\n",
+ " humann -i demo.fastq.gz -o sample_results\n",
+ "\n",
+ "\n",
+ " mkdir assemble epi_sam_out mpa4_out\n",
+ " humann -i /content/All_MAGs/Sample_101_S75_bin_1.fa -o test_out\n",
+ "\n",
+ "\n",
+ " seq=$(ls /content/*.fastq | cut -d / -f2| cut -d _ -f1)\n",
+ "\n",
+ " for i in $(eval \"echo \"$seq\" | cut -d _ -f1\")\n",
+ "\n",
+ " do\n",
+ " metaphlan /content/${i}.fa --nproc 40 --input_type fasta -o /content/assemble/${i}/h4_out.txt -t rel_ab_w_read_stats\n",
+ " done\n",
+ "fi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dHbwBIEn6Wxr"
+ },
+ "source": [
+ "# umap and dbscan\n",
+ "\n",
+ "idea for metagenomic analysis based on [Quantifying Shared and Unique Gene Content across 17 Microbial Ecosystems\n",
+ "](https://journals.asm.org/doi/full/10.1128/msystems.00118-23)\n",
+ "\n",
+ "(analyze all samples run on cluster)\n",
+ "\n",
+ "also this [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x?ref=https://codemonkey.link#Sec7) and [method](https://github.com/marbl/Mash/blob/master/INSTALL.txt)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "RoIBLY3-670T",
+ "outputId": "7c1d8f5f-fc9f-44d3-9a3a-a268edc4c00c"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2024-07-09 07:23:29-- https://raw.githubusercontent.com/dcolinmorgan/grph/main/PRJNA544527-meta_inf.txt\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 45603 (45K) [text/plain]\n",
+ "Saving to: ‘PRJNA544527-meta_inf.txt’\n",
+ "\n",
+ "\rPRJNA544527-meta_in 0%[ ] 0 --.-KB/s \rPRJNA544527-meta_in 100%[===================>] 44.53K --.-KB/s in 0.007s \n",
+ "\n",
+ "2024-07-09 07:23:29 (6.40 MB/s) - ‘PRJNA544527-meta_inf.txt’ saved [45603/45603]\n",
+ "\n",
+ "--2024-07-09 07:23:29-- https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0559-3/MediaObjects/41591_2019_559_MOESM3_ESM.xlsx\n",
+ "Resolving static-content.springer.com (static-content.springer.com)... 151.101.0.95, 151.101.64.95, 151.101.128.95, ...\n",
+ "Connecting to static-content.springer.com (static-content.springer.com)|151.101.0.95|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 3556857 (3.4M) [application/octet-stream]\n",
+ "Saving to: ‘41591_2019_559_MOESM3_ESM.xlsx’\n",
+ "\n",
+ "41591_2019_559_MOES 100%[===================>] 3.39M --.-KB/s in 0.05s \n",
+ "\n",
+ "2024-07-09 07:23:31 (69.7 MB/s) - ‘41591_2019_559_MOESM3_ESM.xlsx’ saved [3556857/3556857]\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed\n",
+ " warn(msg)\n"
+ ]
+ }
+ ],
+ "source": [
+ "data=pd.read_csv('/content/PRJNA544527_mpa4out.txt',sep='\\t',skiprows=1,index_col=0)\n",
+ "data.index=data.reset_index().clade_name.str.split('|',expand=True)[6]\n",
+ "data=data.reset_index().dropna(axis=0)\n",
+ "data.index=data[6]\n",
+ "data=data.drop(columns=6)\n",
+ "\n",
+ "!wget https://gist.githubusercontent.com/lmeyerov/b650f1ef9e56c3f1888ebb009bc5ed46/raw/76dda5fabcdfbcdf0cc58450982fbeb4b2e38a98/PRJNA544527-meta_inf.txt\n",
+ "meta=pd.read_csv('/content/PRJNA544527-meta_inf.txt',sep='\\t',header=None)\n",
+ "\n",
+ "mm=pd.merge(data.T,meta[[3,5]],left_index=True,right_on=3)\n",
+ "\n",
+ "mm['id']=mm[5].str.split('-').str[0]\n",
+ "mm['time']=mm[5].str.split('_').str[0].str.split('-').str[1]\n",
+ "\n",
+ "!wget https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0559-3/MediaObjects/41591_2019_559_MOESM3_ESM.xlsx\n",
+ "metaa=pd.read_excel('/content/41591_2019_559_MOESM3_ESM.xlsx',sheet_name='SupTable2',skiprows=3)\n",
+ "metaa=metaa[['Donor','Age','Sex','BMI']]\n",
+ "\n",
+ "Full_table=pd.merge(mm,metaa,left_on='id',right_on='Donor')\n",
+ "Full_table=Full_table.drop(columns=[3,\t5,\t'id'])\n",
+ "\n",
+ "data2=Full_table.melt(id_vars=['time','Donor','Age','Sex','BMI'])\n",
+ "\n",
+ "data2=data2.rename(columns={'variable':'species'})\n",
+ "data2=data2.sort_values(by=['Donor','time','value'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "3FvFtKukoI7E",
+ "outputId": "9077830f-9d07-4855-c391-50bd6071dde2"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2678 s__Bacteroides_clarus_aa\n",
+ "5378 s__Bacteroides_intestinalis_aa\n",
+ "9158 s__Ruminococcus_bromii_aa\n",
+ "12938 s__GGB6601_SGB9333_aa\n",
+ "13478 s__GGB3256_SGB4303_aa\n",
+ " ... \n",
+ "86343 s__Faecalibacterium_prausnitzii_dl\n",
+ "2103 s__Phocaeicola_massiliensis_dl\n",
+ "67983 s__Phocaeicola_massiliensis_dl\n",
+ "5883 s__Faecalibacterium_prausnitzii_dl\n",
+ "178143 s__Phocaeicola_plebeius_dl\n",
+ "Length: 208440, dtype: object"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(data2.species)+'_'+(data2.Donor)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 424
+ },
+ "id": "4savEas1jZyX",
+ "outputId": "a9f7c2a2-5c50-402e-b03b-b7f1daf857a4"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"data2[data2['value']>1]\",\n \"rows\": 14957,\n \"fields\": [\n {\n \"column\": \"time\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 219,\n \"samples\": [\n \"0169\",\n \"0098\",\n \"0043\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Donor\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 84,\n \"samples\": [\n \"db\",\n \"aa\",\n \"ck\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5,\n \"min\": 19,\n \"max\": 45,\n \"num_unique_values\": 23,\n \"samples\": [\n 43,\n 28,\n 29\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sex\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Female\",\n \"Male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"BMI\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.092398931918041,\n \"min\": 17.6,\n \"max\": 35.1,\n \"num_unique_values\": 57,\n \"samples\": [\n 24.1,\n 23.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"species\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 112,\n \"samples\": [\n \"s__Bacteroides_cellulosilyticus\",\n \"s__Clostridiaceae_bacterium\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7.07963397515598,\n \"min\": 1.00024,\n \"max\": 100.0,\n \"num_unique_values\": 8355,\n \"samples\": [\n 2.30652,\n 9.30662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe"
+ },
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " time | \n",
+ " Donor | \n",
+ " Age | \n",
+ " Sex | \n",
+ " BMI | \n",
+ " species | \n",
+ " value | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 39938 | \n",
+ " 0154 | \n",
+ " aa | \n",
+ " 29 | \n",
+ " Male | \n",
+ " 24.1 | \n",
+ " s__Desulfovibrio_piger | \n",
+ " 1.00422 | \n",
+ "
\n",
+ " \n",
+ " | 11318 | \n",
+ " 0154 | \n",
+ " aa | \n",
+ " 29 | \n",
+ " Male | \n",
+ " 24.1 | \n",
+ " s__Odoribacter_splanchnicus | \n",
+ " 1.12785 | \n",
+ "
\n",
+ " \n",
+ " | 77198 | \n",
+ " 0154 | \n",
+ " aa | \n",
+ " 29 | \n",
+ " Male | \n",
+ " 24.1 | \n",
+ " s__Odoribacter_splanchnicus | \n",
+ " 1.12785 | \n",
+ "
\n",
+ " \n",
+ " | 73418 | \n",
+ " 0154 | \n",
+ " aa | \n",
+ " 29 | \n",
+ " Male | \n",
+ " 24.1 | \n",
+ " s__Faecalibacterium_prausnitzii | \n",
+ " 1.14483 | \n",
+ "
\n",
+ " \n",
+ " | 183578 | \n",
+ " 0154 | \n",
+ " aa | \n",
+ " 29 | \n",
+ " Male | \n",
+ " 24.1 | \n",
+ " s__GGB3304_SGB4367 | \n",
+ " 1.24406 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 86343 | \n",
+ " 0006 | \n",
+ " dl | \n",
+ " 32 | \n",
+ " Male | \n",
+ " 26.1 | \n",
+ " s__Faecalibacterium_prausnitzii | \n",
+ " 2.21002 | \n",
+ "
\n",
+ " \n",
+ " | 2103 | \n",
+ " 0006 | \n",
+ " dl | \n",
+ " 32 | \n",
+ " Male | \n",
+ " 26.1 | \n",
+ " s__Phocaeicola_massiliensis | \n",
+ " 3.84088 | \n",
+ "
\n",
+ " \n",
+ " | 67983 | \n",
+ " 0006 | \n",
+ " dl | \n",
+ " 32 | \n",
+ " Male | \n",
+ " 26.1 | \n",
+ " s__Phocaeicola_massiliensis | \n",
+ " 3.84088 | \n",
+ "
\n",
+ " \n",
+ " | 5883 | \n",
+ " 0006 | \n",
+ " dl | \n",
+ " 32 | \n",
+ " Male | \n",
+ " 26.1 | \n",
+ " s__Faecalibacterium_prausnitzii | \n",
+ " 4.37472 | \n",
+ "
\n",
+ " \n",
+ " | 178143 | \n",
+ " 0006 | \n",
+ " dl | \n",
+ " 32 | \n",
+ " Male | \n",
+ " 26.1 | \n",
+ " s__Phocaeicola_plebeius | \n",
+ " 28.49330 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
14957 rows × 7 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " time Donor Age Sex BMI species value\n",
+ "39938 0154 aa 29 Male 24.1 s__Desulfovibrio_piger 1.00422\n",
+ "11318 0154 aa 29 Male 24.1 s__Odoribacter_splanchnicus 1.12785\n",
+ "77198 0154 aa 29 Male 24.1 s__Odoribacter_splanchnicus 1.12785\n",
+ "73418 0154 aa 29 Male 24.1 s__Faecalibacterium_prausnitzii 1.14483\n",
+ "183578 0154 aa 29 Male 24.1 s__GGB3304_SGB4367 1.24406\n",
+ "... ... ... ... ... ... ... ...\n",
+ "86343 0006 dl 32 Male 26.1 s__Faecalibacterium_prausnitzii 2.21002\n",
+ "2103 0006 dl 32 Male 26.1 s__Phocaeicola_massiliensis 3.84088\n",
+ "67983 0006 dl 32 Male 26.1 s__Phocaeicola_massiliensis 3.84088\n",
+ "5883 0006 dl 32 Male 26.1 s__Faecalibacterium_prausnitzii 4.37472\n",
+ "178143 0006 dl 32 Male 26.1 s__Phocaeicola_plebeius 28.49330\n",
+ "\n",
+ "[14957 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data2[data2['value']>1]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "WRYYF1brbdgz"
+ },
+ "source": [
+ "## UMAP by species via CPU\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HrJErY1SZSfh",
+ "outputId": "a5e6cd2b-e198-460e-ca02-46bb6cc592a6"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:graphistry.umap_utils:* Ignoring target column of shape (679, 0) in UMAP fit, as it is not one dimensional"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " Total 24.8 seconds passed\n"
+ ]
+ }
+ ],
+ "source": [
+ "data=pd.read_csv('/content/PRJNA544527_mpa4out.txt',sep='\\t',skiprows=1,index_col=0)\n",
+ "\n",
+ "g = graphistry.nodes(cudf.from_pandas(data.dropna()))\n",
+ "\n",
+ "t=time()\n",
+ "g3=g.umap(dbscan=True,engine='umap_learn')\n",
+ "print('\\n Total ', np.round(time() - t,1), 'seconds passed')\n",
+ "\n",
+ "g3.plot()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "l7aLIxP6ZY9s"
+ },
+ "source": [
+ "## UMAP by species via GPU"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "6Zoo17Ui9zxh",
+ "outputId": "6af64cb4-1528-4033-f606-75fb73af4686"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (679, 0) in UMAP fit, as it is not one dimensional"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " Total 0.7 seconds passed\n"
+ ]
+ }
+ ],
+ "source": [
+ "data=pd.read_csv('/content/PRJNA544527_mpa4out.txt',sep='\\t',skiprows=1,index_col=0)\n",
+ "\n",
+ "g = graphistry.nodes(cudf.from_pandas(data.dropna()))\n",
+ "\n",
+ "t=time()\n",
+ "g3=g.umap(dbscan=True,engine='cuml')\n",
+ "print('\\n Total ', np.round(time() - t,1), 'seconds passed')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 543
+ },
+ "id": "kjSdlnmeaOuL",
+ "outputId": "beaa1218-7157-4fda-da19-32534e92de5d"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g3.plot()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UZt6aGWnbS9f"
+ },
+ "source": [
+ "## UMAP for patient by time stamp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "SJvc8pOsi3Dv"
+ },
+ "outputs": [],
+ "source": [
+ "data2=data2[data2.value>0]\n",
+ "data2=data2.reset_index(drop = True)\n",
+ "data2=data2.drop_duplicates()\n",
+ "\n",
+ "data2[\"Label\"] = (\n",
+ " data2.groupby(\"Donor\")\n",
+ " .apply(lambda x: x.groupby(\"time\", sort=False).ngroup() + 1)\n",
+ " .values\n",
+ ")\n",
+ "\n",
+ "cc=pd.unique(data2[data2.Label<5].Donor)\n",
+ "data2=data2.loc[ data2.Donor.isin(cc), : ]\n",
+ "data2=data2[data2.Label<5]\n",
+ "\n",
+ "data2[\"rank\"] = data2.groupby(\"Donor\")[\"value\"].rank(method=\"dense\", ascending=False)\n",
+ "data2=data2[data2['rank']<10.0]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "dkWIAyP2oKHN"
+ },
+ "outputs": [],
+ "source": [
+ "data2['id_time']=data2['Donor']+'_'+data2['Label'].apply(str)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "W4TCI3-YolwR"
+ },
+ "outputs": [],
+ "source": [
+ "data3=data2[['id_time','species','value']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 579
+ },
+ "id": "aiw8WeENpRHT",
+ "outputId": "2eb8bb20-f4e3-4e7c-bab4-c242a2daacbc"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (169, 0) in UMAP fit, as it is not one dimensional"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " Total 0.3 seconds passed\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2 = data3.pivot_table(index=['id_time'],columns='species')\n",
+ "df3=df2.fillna(0).reset_index()\n",
+ "df4=df3.droplevel(0, axis=1)\n",
+ "df4.index=df4.iloc[:,0]\n",
+ "df4=df4.loc[:, df4.columns.str.startswith('s__')]\n",
+ "\n",
+ "g = graphistry.nodes(cudf.from_pandas(df4))\n",
+ "\n",
+ "t=time()\n",
+ "\n",
+ "g3=g.umap(dbscan=True,engine='cuml')\n",
+ "print('\\n Total ', np.round(time() - t,1), 'seconds passed')\n",
+ "\n",
+ "g3.plot()"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/demos/demos_by_use_case/bio/SingleCellDemo.ipynb b/demos/demos_by_use_case/bio/SingleCellDemo.ipynb
new file mode 100644
index 0000000000..2d3f56af33
--- /dev/null
+++ b/demos/demos_by_use_case/bio/SingleCellDemo.ipynb
@@ -0,0 +1,911 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "5n26EioarG4C",
+ "metadata": {
+ "id": "5n26EioarG4C"
+ },
+ "source": [
+ "## Tutorial: Single-Cell Transcriptomics using UMAP"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0f3a459c-fa15-445b-8d65-cb701c063ac9",
+ "metadata": {
+ "id": "0f3a459c-fa15-445b-8d65-cb701c063ac9"
+ },
+ "source": [
+ "\n",
+ "Single cell gene expression can be analyzed faster and more easily explored using GPU-accelerated UMAP analysis & visualization. Using UMAP in this way, **the user can cluster cell types by patterns of gene expression**\n",
+ "\n",
+ "* Task: Analyze single-cell gene expression for clustering\n",
+ "* Data: 5 independent datasets of roughly 30K rows of 200 columns of single-cell data\n",
+ "* [data](https://cytotrace.stanford.edu/#shiny-tab-dataset_download)\n",
+ "* [paper](https://arxiv.org/pdf/2208.05229.pdf)\n",
+ "\n",
+ "**Insight/ Result:**\n",
+ "\n",
+ " 1. Speed: Go from minutes to seconds for entire ~10,000-cell samples (102s vs 18s on a small T4 GPU)\n",
+ " 2. Visualization: Add interactivity, similarity edges, and GPU scale to otherwise hard-to-read static scatter plots"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "JAgYkgAsvbnG",
+ "metadata": {
+ "id": "JAgYkgAsvbnG"
+ },
+ "source": [
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6lK_VrzZT1n7",
+ "metadata": {
+ "id": "6lK_VrzZT1n7"
+ },
+ "source": [
+ "### For the GPU-cloud-accelerated visualization step, get a free API key at https://hub.graphistry.com\n",
+ "\n",
+ "### For raw data, get a free Kaggle account from https://www.kaggle.com/docs/api"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "33ae99ce-e386-4e97-b5a0-99b86fb6f68a",
+ "metadata": {
+ "id": "33ae99ce-e386-4e97-b5a0-99b86fb6f68a"
+ },
+ "outputs": [],
+ "source": [
+ "import os, time\n",
+ "from collections import Counter\n",
+ "import cProfile\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "from pstats import Stats\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "pd.set_option('display.max_colwidth', 200)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "UYVtE57QBy92",
+ "metadata": {
+ "id": "UYVtE57QBy92"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install --extra-index-url=https://pypi.nvidia.com cuml-cu12\n",
+ "\n",
+ "!pip install graphistry[ai]\n",
+ "\n",
+ "!pip install -q Biopython\n",
+ "!pip install -q scanpy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "FQUmN0NcTy8z",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 35
+ },
+ "id": "FQUmN0NcTy8z",
+ "outputId": "68d17e2d-76df-4248-fe5b-fcb0fb7ea843"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ },
+ "text/plain": [
+ "'0.33.0+97.ga86be5c'"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import scanpy as sc\n",
+ "import anndata\n",
+ "\n",
+ "import graphistry\n",
+ "graphistry.register(api=3,protocol=\"https\", server=\"hub.graphistry.com\", username=g_user, password=g_pass) ## key id, secret key\n",
+ "\n",
+ "graphistry.__version__\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0i3vYvSw-OyK",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0i3vYvSw-OyK",
+ "outputId": "98e55a32-c279-4f40-9544-81dd4459d0ff"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Mon Jul 8 12:42:32 2024 \n",
+ "+---------------------------------------------------------------------------------------+\n",
+ "| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n",
+ "|-----------------------------------------+----------------------+----------------------+\n",
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
+ "| | | MIG M. |\n",
+ "|=========================================+======================+======================|\n",
+ "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
+ "| N/A 44C P8 9W / 70W | 0MiB / 15360MiB | 0% Default |\n",
+ "| | | N/A |\n",
+ "+-----------------------------------------+----------------------+----------------------+\n",
+ " \n",
+ "+---------------------------------------------------------------------------------------+\n",
+ "| Processes: |\n",
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
+ "| ID ID Usage |\n",
+ "|=======================================================================================|\n",
+ "| No running processes found |\n",
+ "+---------------------------------------------------------------------------------------+\n"
+ ]
+ }
+ ],
+ "source": [
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "y_CdnuiH-Ras",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 35
+ },
+ "id": "y_CdnuiH-Ras",
+ "outputId": "1d42ea6c-0c23-475e-b656-bd6b4dfc5543"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ },
+ "text/plain": [
+ "'24.06.01'"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import cuml, cudf\n",
+ "cuml.__version__"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "lqWE36v0vU6l",
+ "metadata": {
+ "id": "lqWE36v0vU6l"
+ },
+ "source": [
+ "## Data Download & Description"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "49GjBuuezGSS",
+ "metadata": {
+ "id": "49GjBuuezGSS"
+ },
+ "outputs": [],
+ "source": [
+ "import locale\n",
+ "locale.getpreferredencoding = lambda: \"UTF-8\"\n",
+ "!pip install kaggle -q"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "kjZSLidBSgd-",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 92
+ },
+ "id": "kjZSLidBSgd-",
+ "outputId": "dda18fd3-b28c-4b4e-d351-84eb584970f7"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Saving kaggle.json to kaggle.json\n",
+ "User uploaded file \"kaggle.json\" with length 62 bytes\n"
+ ]
+ }
+ ],
+ "source": [
+ "from google.colab import files\n",
+ "\n",
+ "uploaded = files.upload()\n",
+ "\n",
+ "for fn in uploaded.keys():\n",
+ " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n",
+ " name=fn, length=len(uploaded[fn])))\n",
+ "\n",
+ "# Then move kaggle.json into the folder where the API expects to find it.\n",
+ "!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "Mc-Q8Y-A0aLS",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Mc-Q8Y-A0aLS",
+ "outputId": "30754e5b-69d0-4080-9c16-19cf9ffa85ff"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dataset URL: https://www.kaggle.com/datasets/alexandervc/scrnaseq-collection-of-datasets\n",
+ "Dataset URL: https://www.kaggle.com/datasets/alexandervc/scrnaseq-collection-of-datasets\n",
+ "Archive: GSE107910_40.h5ad.zip\n",
+ " inflating: GSE107910_40.h5ad \n",
+ "Archive: GSE67123_6.h5ad.zip\n",
+ " inflating: GSE67123_6.h5ad \n"
+ ]
+ }
+ ],
+ "source": [
+ "#download 2 single cell datasets\n",
+ "import kaggle as kg\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "\n",
+ "os.environ['KAGGLE_USERNAME'] = kaggle_user\n",
+ "os.environ['KAGGLE_KEY'] = kaggle_pass\n",
+ "\n",
+ "kg.api.authenticate()\n",
+ "kg.api.dataset_download_file(dataset = \"alexandervc/scrnaseq-collection-of-datasets\", file_name='Cytotrace/GSE67123_6.h5ad')\n",
+ "kg.api.dataset_download_file(dataset = \"alexandervc/scrnaseq-collection-of-datasets\", file_name='Cytotrace/GSE107910_40.h5ad')\n",
+ "\n",
+ "!unzip -o GSE107910_40.h5ad.zip\n",
+ "!unzip -o GSE67123_6.h5ad.zip\n",
+ "\n",
+ "!mkdir -p single_cell\n",
+ "!mv *.h5ad single_cell"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ff46a8c8-5882-41a6-83ce-eb8c00c2fc70",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ff46a8c8-5882-41a6-83ce-eb8c00c2fc70",
+ "outputId": "735b9597-2012-4916-f4e3-c2270c680b43"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Thymus (Drop-seq)' 'Validation' '15429' '9307.0' 'nan' '9307' '19530'\n",
+ " '8' '8' 'UMI' 'Mouse' '1' 'Thymus' 'Drop-seq' 'Timepoints' 'in vivo'\n",
+ " '29884461' '20180619' 'GSE107910' 'Immunity'\n",
+ " 'Only hematopoietic cells, selected based on detectable Ptprc expression, were considered in this dataset. ']\n",
+ "\n",
+ "['Embryonic HSCs (Tang et al.)' 'Validation' '143' 'nan' 'nan' '143'\n",
+ " '24028' '5' '5' 'TPM/FPKM' 'Mouse' '1' 'Embryo' 'Tang et al.'\n",
+ " 'Timepoints' 'in vivo' '27225119' '20160526' 'GSE67123' 'Nature' 'nan']\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "list_files = []\n",
+ "for dirname, _, filenames in os.walk('single_cell'):\n",
+ " for filename in filenames:\n",
+ " list_files.append(os.path.join(dirname, filename))\n",
+ "\n",
+ "for fn in list_files:\n",
+ " adata = sc.read(fn)\n",
+ " print( adata.uns['info'] )\n",
+ " print()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dQaaHeHu7X54",
+ "metadata": {
+ "id": "dQaaHeHu7X54"
+ },
+ "source": [
+ "# Compute UMAP on GPU for GSE107910_40 murine thymus cells\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc6f3beb-13c9-4d11-b8b2-35676f711f50",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cc6f3beb-13c9-4d11-b8b2-35676f711f50",
+ "outputId": "7b79b744-e23d-4182-b2bb-5082977e92db"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (18335, 0) in UMAP fit, as it is not one dimensional"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " Total 38.1 seconds passed\n"
+ ]
+ }
+ ],
+ "source": [
+ "fn='single_cell/GSE107910_40.h5ad'\n",
+ "adata = sc.read(fn)\n",
+ "str_data_inf = fn.split('/')[1].split('.')[0] + ' ' + str(adata.X.shape)+'\\n' + adata.uns['info'][0]\n",
+ "\n",
+ "EE=pd.DataFrame(adata.X,columns=adata.uns['gcsGenesNames'],index=adata.uns['allcellnames'])\n",
+ "g1=graphistry.nodes(cudf.from_pandas(EE.T))\n",
+ "t0 = time.time()\n",
+ "\n",
+ "g22 = g1.umap(\n",
+ " use_scaler='robust', ## zscale, minmax, standard, normal,\n",
+ " n_components=2,\n",
+ " n_neighbors=12,\n",
+ " engine='cuml' ## cannot even run in available RAM, try by switching to engine='umap_learn'\n",
+ " )\n",
+ "\n",
+ "print('\\n Total ', np.round(time.time() - t0,1), 'seconds passed')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8USJgBC34bEt",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 543
+ },
+ "id": "8USJgBC34bEt",
+ "outputId": "cab7972c-e452-4d43-d216-fab920b572eb"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "emb2=g22._node_embedding\n",
+ "\n",
+ "# A is the 'index' column after reset_index: row position -> original node label.\n",
+ "# It is used below to relabel the implicit edge endpoints.\n",
+ "A=emb2.reset_index()['index'].to_pandas()\n",
+ "\n",
+ "B=g22._edges\n",
+ "B['_src_implicit'] = B['_src_implicit'].replace(A, regex=True)\n",
+ "B['_dst_implicit'] = B['_dst_implicit'].replace(A, regex=True)\n",
+ "\n",
+ "# Plot the relabeled edges of g22 (B). g11 is only created in a later cell, so\n",
+ "# referencing it here fails on a fresh Restart & Run All (and would mix in another graph's edges).\n",
+ "g33=graphistry.nodes(emb2.reset_index(),'index').edges(B.dropna(),'_src_implicit','_dst_implicit').bind(point_x=\"x\",point_y=\"y\").settings(url_params={\"play\":0})\n",
+ "\n",
+ "g33.plot()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ToIOD-XzO7JD",
+ "metadata": {
+ "id": "ToIOD-XzO7JD"
+ },
+ "source": [
+ "## This paper was specifically interested in peak mitosis genes, i.e., [\"Tirosh\" genes](https://genome.cshlp.org/content/25/12/1860.short), so let's zoom in on those"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "B_qu0elxdY_w",
+ "metadata": {
+ "id": "B_qu0elxdY_w"
+ },
+ "outputs": [],
+ "source": [
+ "fn='single_cell/GSE107910_40.h5ad'\n",
+ "import scanpy as sc\n",
+ "import anndata\n",
+ "adata = sc.read(fn)\n",
+ "str_data_inf = fn.split('/')[1].split('.')[0] + ' ' + str(adata.X.shape)+'\\n' + adata.uns['info'][0]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "Ty0JI0fed1po",
+ "metadata": {
+ "id": "Ty0JI0fed1po"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "XW2PejpWO4Ta",
+ "metadata": {
+ "id": "XW2PejpWO4Ta"
+ },
+ "outputs": [],
+ "source": [
+ "S_phase_genes_Tirosh = ['MCM5', 'PCNA', 'TYMS', 'FEN1', 'MCM2', 'MCM4', 'RRM1', 'UNG', 'GINS2', 'MCM6', 'CDCA7', 'DTL', 'PRIM1', 'UHRF1', 'MLF1IP', 'HELLS', 'RFC2', 'RPA2', 'NASP', 'RAD51AP1', 'GMNN', 'WDR76', 'SLBP', 'CCNE2', 'UBR7', 'POLD3', 'MSH2', 'ATAD2', 'RAD51', 'RRM2', 'CDC45', 'CDC6', 'EXO1', 'TIPIN', 'DSCC1', 'BLM', 'CASP8AP2', 'USP1', 'CLSPN', 'POLA1', 'CHAF1B', 'BRIP1', 'E2F8']\n",
+ "G2_M_genes_Tirosh = ['HMGB2', 'CDK1', 'NUSAP1', 'UBE2C', 'BIRC5', 'TPX2', 'TOP2A', 'NDC80', 'CKS2', 'NUF2', 'CKS1B', 'MKI67', 'TMPO', 'CENPF', 'TACC3', 'FAM64A', 'SMC4', 'CCNB2', 'CKAP2L', 'CKAP2', 'AURKB', 'BUB1', 'KIF11', 'ANP32E', 'TUBB4B', 'GTSE1', 'KIF20B', 'HJURP', 'CDCA3', 'HN1', 'CDC20', 'TTK', 'CDC25C', 'KIF2C', 'RANGAP1', 'NCAPD2', 'DLGAP5', 'CDCA2', 'CDCA8', 'ECT2', 'KIF23', 'HMMR', 'AURKA', 'PSRC1', 'ANLN', 'LBR', 'CKAP5', 'CENPE', 'CTCF', 'NEK2', 'G2E3', 'GAS2L3', 'CBX5', 'CENPA']\n",
+ "# Match against the same gene-name array that labels the columns of adata.X\n",
+ "# (later cells use adata.X[:, I] together with adata.uns['gcsGenesNames'][I]),\n",
+ "# so the positions stored in I line up with the expression matrix.\n",
+ "u = 'gcsGenesNames'\n",
+ "# Upper-case the names so they compare against the upper-case Tirosh gene symbols.\n",
+ "list_genes_upper = [t.upper() for t in adata.uns[u] ]\n",
+ "# I holds the integer column positions of the S-phase and G2/M marker genes.\n",
+ "I = np.where( pd.Series(list_genes_upper).isin( S_phase_genes_Tirosh + G2_M_genes_Tirosh ) )[0]\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bJ7h6QjQemw_",
+ "metadata": {
+ "id": "bJ7h6QjQemw_"
+ },
+ "source": [
+ "## CPU UMAP"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "YW2M1hwTekeW",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "YW2M1hwTekeW",
+ "outputId": "7661f1d1-5747-4b9a-ae13-5e3754e6f1cf"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:graphistry.umap_utils:* Ignoring target column of shape (96, 0) in UMAP fit, as it is not one dimensional"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " Total 26.6 seconds passed\n"
+ ]
+ }
+ ],
+ "source": [
+ "EE=pd.DataFrame(adata.X[:,I],columns=adata.uns['gcsGenesNames'][I],index=adata.uns['allcellnames'])\n",
+ "g1=graphistry.nodes(cudf.from_pandas(EE.T))\n",
+ "t0 = time.time()\n",
+ "\n",
+ "g11 = g1.umap(\n",
+ " use_scaler='robust', ## zscale, minmax, standard, normal,\n",
+ " n_components=2,\n",
+ " n_neighbors=12,\n",
+ " engine='umap_learn'\n",
+ " )\n",
+ "\n",
+ "\n",
+ "print('\\n Total ', np.round(time.time() - t0,1), 'seconds passed')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "tnH6euKnenSJ",
+ "metadata": {
+ "id": "tnH6euKnenSJ"
+ },
+ "source": [
+ "### GPU UMAP"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0X3aqsRCO9gR",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0X3aqsRCO9gR",
+ "outputId": "22d64c62-8f9f-4499-a8e7-cc5f098bcddd"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:graphistry.util.memoization:! Failed umap speedup attempt. Continuing without memoization speedups.WARNING:graphistry.umap_utils:* Ignoring target column of shape (96, 0) in UMAP fit, as it is not one dimensional"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ " Total 15.5 seconds passed\n"
+ ]
+ }
+ ],
+ "source": [
+ "EE=pd.DataFrame(adata.X[:,I],columns=adata.uns['gcsGenesNames'][I],index=adata.uns['allcellnames'])\n",
+ "g1=graphistry.nodes(cudf.from_pandas(EE.T)) #,columns=adata1.uns['gcsGenesNames']))\n",
+ "\n",
+ "t0 = time.time()\n",
+ "\n",
+ "g11 = g1.umap(\n",
+ " use_scaler='robust', ## zscale, minmax, standard, normal,\n",
+ " n_components=2,\n",
+ " n_neighbors=12,\n",
+ " engine='cuml'\n",
+ " )\n",
+ "\n",
+ "\n",
+ "print('\\n Total ', np.round(time.time() - t0,1), 'seconds passed')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "F9yg_KST7-oK",
+ "metadata": {
+ "id": "F9yg_KST7-oK"
+ },
+ "source": [
+ "### Visualize\n",
+ "\n",
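+ "* Nodes are genes (the expression matrix is transposed before UMAP, so each row/node is a gene described by its expression across cells)\n",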
+ "* Edges are similarity relationships\n",
+ "* Initial layout is from the UMAP dimensionality reduction to 2D\n",
+ "* Interactive layout is an aesthetically-optimized force-directed graph over the similarity graph, which is more interpretable for dense clusters\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "OIxdQw4DO9jF",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 543
+ },
+ "id": "OIxdQw4DO9jF",
+ "outputId": "395b79f0-be12-407f-9653-a17508b76a2e"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "g11.plot()"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}