From a3a166072846bc26e312ba7b95f1f7fc11ce5e0b Mon Sep 17 00:00:00 2001 From: Yue Zhang Date: Tue, 28 Apr 2026 22:03:02 -0700 Subject: [PATCH 1/7] fix test error --- tests/test_nameres.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_nameres.py b/tests/test_nameres.py index 3e926b5..d9650a0 100644 --- a/tests/test_nameres.py +++ b/tests/test_nameres.py @@ -42,7 +42,7 @@ def test_nameres_status(): assert status['status'] == 'ok' assert status['babel_version'] != '' assert status['babel_version_url'] != '' - assert status['numDocs'] > 425_000_000 + assert status['solr']['numDocs'] > 425_000_000 def test_nameres_incorrect(): From afffeee5045c65d824f728e1adbf16bd4b5f258e Mon Sep 17 00:00:00 2001 From: Yue Zhang Date: Thu, 30 Apr 2026 03:52:32 -0700 Subject: [PATCH 2/7] adding scoring to pathfinder output --- TCT/TCT_pathfinder.py | 56 +++++- notebooks/Pathfinder_new.ipynb | 328 +++++++++++++++++++++++++-------- 2 files changed, 306 insertions(+), 78 deletions(-) diff --git a/TCT/TCT_pathfinder.py b/TCT/TCT_pathfinder.py index f7a5e28..eff8128 100644 --- a/TCT/TCT_pathfinder.py +++ b/TCT/TCT_pathfinder.py @@ -103,10 +103,50 @@ def build_query_graph(start_node_id, end_node_id, start_node_categories=None, en return q +def generate_score_results(results, method='infores'): + """ + Generates a score dict, and a list of "analyses". + method can be 'infores' or 'edges' + """ + graph_scores = {} + max_score = 0 + auxiliary_graphs = results['auxiliary_graphs'] + for k, graph in auxiliary_graphs.items(): + if method == 'infores': + sources = set() + for edge_index in graph: + edge = results['knowledge_graph']['edges'][edge_index] + for resource in edge['sources']: + sources.add(resource['resource_id']) + score = len(sources) + if score > max_score: + max_score = score + else: + score = len(graph) + if score > max_score: + max_score = score + graph_scores[k] = score + graph_scores_formatted = [] + for k in graph_scores.keys(): + graph_scores[k] = graph_scores[k]/max_score + graph_scores_formatted.append({ + 'attributes': None, + 'path_bindings': { + 'p0': [{'id': k}]}, + 'resource_id': 'infores:tct', + 'score': graph_scores[k], + 'scoring_method': None, + 'support_graphs': None + }) + return graph_scores, graph_scores_formatted + + def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dict, result2:dict, - start_node_categories=None, end_node_categories=None): + start_node_categories=None, end_node_categories=None, + scoring_method='infores'): """ Converts the results of two TRAPI queries into the same general json format as the other pathfinder APIs. + scoring_method is how the node scores are generated, and could be 'infores' or 'edges'. """ # TODO: parse results... # nodes @@ -155,6 +195,7 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic keys = [x[0] for x in e1s] + [x[0] for x in e2s] all_auxiliary_graphs[f'aux_{i}_{i1}'] = keys i += 1 + # generate output json output = { 'query_graph': build_query_graph(start_node_id, end_node_id, start_node_categories, end_node_categories), 'knowledge_graph': {'nodes': {x: {} for x in connection_counts.keys()}, @@ -163,11 +204,17 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic 'results': [{'analyses': []}], 'auxiliary_graphs': all_auxiliary_graphs } + graph_scores, graph_scores_formatted = generate_score_results(output, method=scoring_method) + output['results'][0]['analyses'] = graph_scores_formatted return output def pathfinder(input_node1_id:str, input_node2_id:str, - intermediate_categories:list, APInames, metaKG, API_predicates): + intermediate_categories:list, APInames, metaKG, API_predicates, + scoring_method='infores'): + """ + Returns a Pathfinder output for the given pair of nodes. scoring_method could be 'infores' or 'edges'. + """ # get categories for input nodes normalized_node_dict = node_normalizer.get_normalized_nodes([input_node1_id, input_node2_id]) input_node1_info = normalized_node_dict[input_node1_id] @@ -227,7 +274,8 @@ def pathfinder(input_node1_id:str, input_node2_id:str, output = parse_results_for_pathfinder(input_node1_id, input_node2_id, result1, result2, start_node_categories=input_node1_category, - end_node_categories=input_node2_category) + end_node_categories=input_node2_category, + scoring_method=scoring_method) return result1, result2, output, paths @@ -278,4 +326,4 @@ def query_arax_pathfinder(node1_id, node1_category, node2_id, node2_category): ARAX_endpoint = 'https://arax.ci.transltr.io/api/arax/v1.4/query' query_current = format_pathfinder_query(node1_id, node1_category, node2_id, node2_category) response = requests.post(ARAX_endpoint, json=query_current) - return response \ No newline at end of file + return response diff --git a/notebooks/Pathfinder_new.ipynb b/notebooks/Pathfinder_new.ipynb index 67512d1..6e33b5f 100644 --- a/notebooks/Pathfinder_new.ipynb +++ b/notebooks/Pathfinder_new.ipynb @@ -108,14 +108,15 @@ "output_type": "stream", "text": [ "MONDO:0100096\n", - "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n", "CATRAX BigGIM DrugResponse Performance Phase KP - TRAPI 1.5.0: Success!\n", + "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n", "Automat-genome-alliance(Trapi v1.5.0): Success!\n", "RTX KG2 - TRAPI 1.5.0: Success!\n", "Automat-cam-kp(Trapi v1.5.0): Success!\n", "Automat-hetionet(Trapi v1.5.0): Success!\n", - "Automat-robokop(Trapi v1.5.0): Success!\n", + "Service Provider TRAPI: Success!\n", "BioThings Explorer (BTE) TRAPI: Success!\n", + "Automat-robokop(Trapi v1.5.0): Success!\n", "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n", "Clinical Trials KP - TRAPI 1.5.0: Success!\n", "Microbiome KP - TRAPI 1.5.0: Success!\n", @@ -141,6 +142,14 @@ " API_predicates=API_predicates)" ] }, + { + "cell_type": "markdown", + "id": "bf0506c3-6615-421b-bc45-ab4e124e3477", + "metadata": {}, + "source": [ + "### Examining pathfinder outputs" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -185,36 +194,36 @@ " PDCD1\n", " \n", " \n", - " TNF\n", - " 0.133333\n", - " NCBIGene:7124\n", - " biolink:genetically_interacts_with; biolink:in...\n", - " biolink:gene_associated_with_condition; biolin...\n", - " TNF\n", - " \n", - " \n", - " IL13\n", + " IL17A\n", " 0.133333\n", - " NCBIGene:3596\n", + " NCBIGene:3605\n", " biolink:interacts_with; biolink:affects\n", " biolink:related_to; biolink:gene_associated_wi...\n", - " IL13\n", + " IL17A\n", " \n", " \n", " NLRP3\n", " 0.133333\n", " NCBIGene:114548\n", - " biolink:affects; biolink:physically_interacts_...\n", + " biolink:physically_interacts_with; biolink:aff...\n", " biolink:related_to; biolink:gene_associated_wi...\n", " NLRP3\n", " \n", " \n", - " IL17A\n", + " IL13\n", " 0.133333\n", - " NCBIGene:3605\n", + " NCBIGene:3596\n", " biolink:interacts_with; biolink:affects\n", " biolink:related_to; biolink:gene_associated_wi...\n", - " IL17A\n", + " IL13\n", + " \n", + " \n", + " TNF\n", + " 0.133333\n", + " NCBIGene:7124\n", + " biolink:genetically_interacts_with; biolink:in...\n", + " biolink:gene_associated_with_condition; biolin...\n", + " TNF\n", " \n", " \n", " ...\n", @@ -225,44 +234,44 @@ " ...\n", " \n", " \n", - " LIF\n", + " IFIT2\n", " 0.033333\n", - " NCBIGene:3976\n", - " biolink:affects\n", + " NCBIGene:3433\n", + " biolink:physically_interacts_with\n", " biolink:gene_associated_with_condition\n", - " LIF\n", + " IFIT2\n", " \n", " \n", - " FAS\n", + " GPX3\n", " 0.033333\n", - " NCBIGene:355\n", - " biolink:regulates; biolink:affects\n", + " NCBIGene:2878\n", + " biolink:physically_interacts_with\n", " biolink:gene_associated_with_condition\n", - " FAS\n", + " GPX3\n", " \n", " \n", - " MEFV\n", + " RETNLB\n", " 0.033333\n", - " NCBIGene:4210\n", - " biolink:affects\n", + " NCBIGene:84666\n", + " biolink:interacts_with; biolink:physically_int...\n", " biolink:gene_associated_with_condition\n", - " MEFV\n", + " RETNLB\n", " \n", " \n", - " PXDNL\n", + " NT5E\n", " 0.033333\n", - " NCBIGene:137902\n", - " biolink:interacts_with\n", + " NCBIGene:4907\n", + " biolink:affects\n", " biolink:gene_associated_with_condition\n", - " PXDNL\n", + " NT5E\n", " \n", " \n", - " THBD\n", + " H2BC21\n", " 0.033333\n", - " NCBIGene:7056\n", - " biolink:affects; biolink:physically_interacts_...\n", + " NCBIGene:8349\n", + " biolink:physically_interacts_with\n", " biolink:gene_associated_with_condition\n", - " THBD\n", + " H2BC21\n", " \n", " \n", "\n", @@ -270,44 +279,44 @@ "" ], "text/plain": [ - " score output_node \\\n", - "PDCD1 0.133333 NCBIGene:5133 \n", - "TNF 0.133333 NCBIGene:7124 \n", - "IL13 0.133333 NCBIGene:3596 \n", - "NLRP3 0.133333 NCBIGene:114548 \n", - "IL17A 0.133333 NCBIGene:3605 \n", - "... ... ... \n", - "LIF 0.033333 NCBIGene:3976 \n", - "FAS 0.033333 NCBIGene:355 \n", - "MEFV 0.033333 NCBIGene:4210 \n", - "PXDNL 0.033333 NCBIGene:137902 \n", - "THBD 0.033333 NCBIGene:7056 \n", + " score output_node \\\n", + "PDCD1 0.133333 NCBIGene:5133 \n", + "IL17A 0.133333 NCBIGene:3605 \n", + "NLRP3 0.133333 NCBIGene:114548 \n", + "IL13 0.133333 NCBIGene:3596 \n", + "TNF 0.133333 NCBIGene:7124 \n", + "... ... ... \n", + "IFIT2 0.033333 NCBIGene:3433 \n", + "GPX3 0.033333 NCBIGene:2878 \n", + "RETNLB 0.033333 NCBIGene:84666 \n", + "NT5E 0.033333 NCBIGene:4907 \n", + "H2BC21 0.033333 NCBIGene:8349 \n", "\n", - " predicates1 \\\n", - "PDCD1 biolink:interacts_with; biolink:affects \n", - "TNF biolink:genetically_interacts_with; biolink:in... \n", - "IL13 biolink:interacts_with; biolink:affects \n", - "NLRP3 biolink:affects; biolink:physically_interacts_... \n", - "IL17A biolink:interacts_with; biolink:affects \n", - "... ... \n", - "LIF biolink:affects \n", - "FAS biolink:regulates; biolink:affects \n", - "MEFV biolink:affects \n", - "PXDNL biolink:interacts_with \n", - "THBD biolink:affects; biolink:physically_interacts_... \n", + " predicates1 \\\n", + "PDCD1 biolink:interacts_with; biolink:affects \n", + "IL17A biolink:interacts_with; biolink:affects \n", + "NLRP3 biolink:physically_interacts_with; biolink:aff... \n", + "IL13 biolink:interacts_with; biolink:affects \n", + "TNF biolink:genetically_interacts_with; biolink:in... \n", + "... ... \n", + "IFIT2 biolink:physically_interacts_with \n", + "GPX3 biolink:physically_interacts_with \n", + "RETNLB biolink:interacts_with; biolink:physically_int... \n", + "NT5E biolink:affects \n", + "H2BC21 biolink:physically_interacts_with \n", "\n", - " predicates2 output_node_name \n", - "PDCD1 biolink:related_to; biolink:gene_associated_wi... PDCD1 \n", - "TNF biolink:gene_associated_with_condition; biolin... TNF \n", - "IL13 biolink:related_to; biolink:gene_associated_wi... IL13 \n", - "NLRP3 biolink:related_to; biolink:gene_associated_wi... NLRP3 \n", - "IL17A biolink:related_to; biolink:gene_associated_wi... IL17A \n", - "... ... ... \n", - "LIF biolink:gene_associated_with_condition LIF \n", - "FAS biolink:gene_associated_with_condition FAS \n", - "MEFV biolink:gene_associated_with_condition MEFV \n", - "PXDNL biolink:gene_associated_with_condition PXDNL \n", - "THBD biolink:gene_associated_with_condition THBD \n", + " predicates2 output_node_name \n", + "PDCD1 biolink:related_to; biolink:gene_associated_wi... PDCD1 \n", + "IL17A biolink:related_to; biolink:gene_associated_wi... IL17A \n", + "NLRP3 biolink:related_to; biolink:gene_associated_wi... NLRP3 \n", + "IL13 biolink:related_to; biolink:gene_associated_wi... IL13 \n", + "TNF biolink:gene_associated_with_condition; biolin... TNF \n", + "... ... ... \n", + "IFIT2 biolink:gene_associated_with_condition IFIT2 \n", + "GPX3 biolink:gene_associated_with_condition GPX3 \n", + "RETNLB biolink:gene_associated_with_condition RETNLB \n", + "NT5E biolink:gene_associated_with_condition NT5E \n", + "H2BC21 biolink:gene_associated_with_condition H2BC21 \n", "\n", "[680 rows x 5 columns]" ] @@ -344,9 +353,180 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "9847dbb5-0d63-496b-8969-4dd330f3897c", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'attributes': [{'attribute_source': 'infores:rtx-kg2',\n", + " 'attribute_type_id': 'biolink:original_predicate',\n", + " 'description': 'The IDs of the original RTX-KG2pre edge(s) corresponding to this edge prior to any synonymization or remapping.',\n", + " 'value': ['UMLS:C0017337---SEMMEDDB:interacts_with---None---None---None---UMLS:C0021745---SEMMEDDB:',\n", + " 'UMLS:C0017337---SEMMEDDB:interacts_with---None---None---None---UMLS:C1334085---SEMMEDDB:'],\n", + " 'value_type_id': 'metatype:String'},\n", + " {'attribute_source': 'infores:rtx-kg2',\n", + " 'attribute_type_id': 'biolink:knowledge_level',\n", + " 'value': 'prediction'},\n", + " {'attribute_source': 'infores:rtx-kg2',\n", + " 'attribute_type_id': 'biolink:publications',\n", + " 'value': ['PMID:25676343',\n", + " 'PMID:3934559',\n", + " 'PMID:35453072',\n", + " 'PMID:36111345'],\n", + " 'value_type_id': 'biolink:Uriorcurie'},\n", + " {'attribute_source': 'infores:rtx-kg2',\n", + " 'attribute_type_id': 'biolink:agent_type',\n", + " 'value': 'text_mining_agent'},\n", + " {'attribute_source': 'infores:rtx-kg2',\n", + " 'attribute_type_id': 'biolink:supporting_text',\n", + " 'value': {'PMID:25676343': {'object score': '851',\n", + " 'publication date': '2015 Mar',\n", + " 'sentence': 'OBJECTIVES: Atypical familial mycobacteriosis (AFM, OMIM #209950) is caused by mutations in genes regulating IL12/IFNG pathway.',\n", + " 'subject score': '1000'},\n", + " 'PMID:35453072': {'object score': '1000',\n", + " 'publication date': '2022 Apr 18',\n", + " 'sentence': 'Innate immune responses were attenuated reflected by decreased expression of genes involved in interferon-gamma, leukocyte migration and neutrophil mediated immune response in convalescent COVID-19 patients.',\n", + " 'subject score': '1000'},\n", + " 'PMID:36111345': {'object score': '901',\n", + " 'publication date': '2022',\n", + " 'sentence': 'Differentially expressed genes (DEGs) between groups were enriched in allograft rejection, hypoxia, glycolysis, TNFalpha signaling via NF-kappaB, and interferon-gamma responses via Gene set enrichment analysis (GSEA).',\n", + " 'subject score': '790'},\n", + " 'PMID:3934559': {'object score': '1000',\n", + " 'publication date': '1985 Nov 21-27',\n", + " 'sentence': 'This gene controls a function or a product necessary for the action of gamma-interferon on class II genes.',\n", + " 'subject score': '1000'}}}],\n", + " 'object': 'NCBIGene:3458',\n", + " 'predicate': 'biolink:interacts_with',\n", + " 'sources': [{'resource_id': 'infores:semmeddb',\n", + " 'resource_role': 'primary_knowledge_source'},\n", + " {'resource_id': 'infores:rtx-kg2',\n", + " 'resource_role': 'aggregator_knowledge_source',\n", + " 'upstream_resource_ids': ['infores:semmeddb']}],\n", + " 'subject': 'UMLS:C0017337'}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(formatted_output['knowledge_graph']['edges'].values())[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c2ac8ad6-218f-4805-8c72-8cf21df4ac7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['12026540',\n", + " '13008760',\n", + " '13008998',\n", + " '1451206',\n", + " '25948818',\n", + " '12006224',\n", + " '12009572',\n", + " '12011840',\n", + " '12014211',\n", + " '12034620']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(formatted_output['auxiliary_graphs'].values())[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "94377dae-4f2a-4cc4-bdc0-8d7c380ed152", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'resource_id': 'infores:semmeddb',\n", + " 'resource_role': 'primary_knowledge_source'},\n", + " {'resource_id': 'infores:rtx-kg2',\n", + " 'resource_role': 'aggregator_knowledge_source',\n", + " 'upstream_resource_ids': ['infores:semmeddb']}]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formatted_output['knowledge_graph']['edges']['12026540']['sources']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "56886736-9e0a-4e9a-9773-308447fcce3a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'resource_id': 'infores:semmeddb',\n", + " 'resource_role': 'primary_knowledge_source'},\n", + " {'resource_id': 'infores:rtx-kg2',\n", + " 'resource_role': 'aggregator_knowledge_source',\n", + " 'upstream_resource_ids': ['infores:semmeddb']}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formatted_output['knowledge_graph']['edges']['13008760']['sources']" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8540fc14-dd13-444f-b9d9-afae72b31b49", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'attributes': None,\n", + " 'path_bindings': {'p0': [{'id': 'aux_1_UMLS:C0017337'}]},\n", + " 'resource_id': 'infores:tct',\n", + " 'score': 0.14285714285714285,\n", + " 'scoring_method': None,\n", + " 'support_graphs': None}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formatted_output['results'][0]['analyses'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6ba8e09-c1aa-4010-86cf-722bc9c27058", + "metadata": {}, "outputs": [], "source": [] } From 43df311158a789dc148f8ff6cf012ec716354ec3 Mon Sep 17 00:00:00 2001 From: yjzhang Date: Thu, 30 Apr 2026 18:20:54 -0700 Subject: [PATCH 3/7] add nodes to TCT_pathfinder --- TCT/TCT_pathfinder.py | 76 ++++-- TCT/name_resolver.py | 49 +++- notebooks/Pathfinder_new.ipynb | 441 +++++++++++++++++---------------- 3 files changed, 326 insertions(+), 240 deletions(-) diff --git a/TCT/TCT_pathfinder.py b/TCT/TCT_pathfinder.py index eff8128..435a125 100644 --- a/TCT/TCT_pathfinder.py +++ b/TCT/TCT_pathfinder.py @@ -143,36 +143,59 @@ def generate_score_results(results, method='infores'): def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dict, result2:dict, start_node_categories=None, end_node_categories=None, + get_node_info=True, scoring_method='infores'): """ Converts the results of two TRAPI queries into the same general json format as the other pathfinder APIs. scoring_method is how the node scores are generated, and could be 'infores' or 'edges'. """ - # TODO: parse results... # nodes + # TODO: get some node info? node attributes + node_info = {} # edges is a dict of intermediate nodes intermediate_node_edges = {} for k, v in result1.items(): i1 = v['subject'] i2 = v['object'] + s_o = 'object' if i1 == start_node_id: intermediate_node_id = i2 + s_o = 'object' elif i2 == start_node_id: intermediate_node_id = i1 + s_o = 'subject' else: continue if (i1 == start_node_id or i2 == start_node_id) and intermediate_node_id in intermediate_node_edges: intermediate_node_edges[intermediate_node_id].append((k, v)) else: intermediate_node_edges[intermediate_node_id] = [(k, v)] + # add node dict + if intermediate_node_id not in node_info: + node_dict = { + } + node_info[intermediate_node_id] = node_dict + else: + node_dict = node_info[intermediate_node_id] + for attribute in v['attributes']: + if attribute['attribute_type_id'] == f'{s_o}_category': + if 'categories' not in node_dict: + node_dict['categories'] = set([attribute['value']]) + else: + node_dict['categories'].add(attribute['value']) + if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict: + node_dict['name'] = attribute['value'] + node_info[intermediate_node_id] = node_dict connecting_intermediate_nodes = {} for k, v in result2.items(): i1 = v['subject'] i2 = v['object'] if i1 == end_node_id: intermediate_node_id = i2 + s_o = 'object' elif i2 == end_node_id: intermediate_node_id = i1 + s_o = 'subject' else: continue if (i1 == end_node_id or i2 == end_node_id) and intermediate_node_id in intermediate_node_edges: @@ -180,6 +203,24 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic connecting_intermediate_nodes[intermediate_node_id]['e2'].append((k, v)) else: connecting_intermediate_nodes[intermediate_node_id] = {'e1': intermediate_node_edges[intermediate_node_id], 'e2' : [(k, v)]} + if intermediate_node_id not in node_info: + node_dict = { + } + node_info[intermediate_node_id] = node_dict + else: + node_dict = node_info[intermediate_node_id] + for attribute in v['attributes']: + if attribute['attribute_type_id'] == f'{s_o}_category': + if 'categories' not in node_dict: + node_dict['categories'] = set([attribute['value']]) + else: + node_dict['categories'].add(attribute['value']) + if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict: + node_dict['name'] = attribute['value'] + node_info[intermediate_node_id] = node_dict + for k, v in node_info.items(): + if 'categories' in v: + v['categories'] = list(v['categories']) all_edges = {} all_auxiliary_graphs = {} i = 1 @@ -198,7 +239,8 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic # generate output json output = { 'query_graph': build_query_graph(start_node_id, end_node_id, start_node_categories, end_node_categories), - 'knowledge_graph': {'nodes': {x: {} for x in connection_counts.keys()}, + # TODO: don't drop the nodes + 'knowledge_graph': {'nodes': {x: node_info[x] for x in connection_counts.keys()}, 'edges': all_edges, }, 'results': [{'analyses': []}], @@ -206,6 +248,16 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic } graph_scores, graph_scores_formatted = generate_score_results(output, method=scoring_method) output['results'][0]['analyses'] = graph_scores_formatted + if get_node_info: + from .node_normalizer import get_normalized_nodes + nodes_to_add = [] + for k, v in output['knowledge_graph']['nodes'].items(): + if 'name' not in v or 'categories' not in v: + nodes_to_add.append(k) + normalized_nodes = get_normalized_nodes(nodes_to_add, mode='post') + for node_id in nodes_to_add: + nn = normalized_nodes[node_id] + output['knowledge_graph']['nodes'][node_id] = {'name': nn.label, 'categories': nn.types} return output @@ -257,27 +309,13 @@ def pathfinder(input_node1_id:str, input_node2_id:str, APInames=APInames, API_predicates=API_predicates, max_workers=len(sele_APIs2)) - - result_parsed1 = parse_KG(result1) - # Step 7: Ranking the results. This ranking method is based on the number of unique - # primary infores. It can only be used to rank the results with one defined node. - result_ranked_by_primary_infores1 = rank_by_primary_infores(result_parsed1, input_node1_id) # input_node1_id is the curie id of the - - result_parsed2 = parse_KG(result2) - result_ranked_by_primary_infores2 = rank_by_primary_infores(result_parsed2, input_node2_id) # input_node2_id is the curie id of the - - possible_paths = len(set(result_ranked_by_primary_infores1['output_node']).intersection(set(result_ranked_by_primary_infores2['output_node']))) - print("Number of possible paths: ", possible_paths) - - paths = merge_ranking_by_number_of_infores(result_ranked_by_primary_infores1, result_ranked_by_primary_infores2, - plot=False) - output = parse_results_for_pathfinder(input_node1_id, input_node2_id, result1, result2, start_node_categories=input_node1_category, end_node_categories=input_node2_category, - scoring_method=scoring_method) + scoring_method=scoring_method, + get_node_info=True) - return result1, result2, output, paths + return result1, result2, output diff --git a/TCT/name_resolver.py b/TCT/name_resolver.py index 387698e..cf78e6a 100644 --- a/TCT/name_resolver.py +++ b/TCT/name_resolver.py @@ -72,14 +72,14 @@ def lookup(query: str, return_top_response:bool=True, return_synonyms:bool=False raise requests.RequestException('Response from server had error, code ' + str(response.status_code) + ' ' + str(response)) -def synonyms(query: str, **kwargs): +def synonyms(query: str|list, **kwargs): """ - A wrapper around the `synonyms` api endpoint. Given a list of CURIEs, this returns a dict of CURIE id : TranslatorNode for all synonyms for the given query. + A wrapper around the `synonyms` api endpoint. Given a CURIE or a list of CURIEs, this returns a dict of CURIE id : TranslatorNode for all synonyms for the given query. Parameters ---------- - query : str - Query CURIE + query : str|list + Query CURIE or list of CURIEs **kwargs Other arguments to `synonyms` @@ -93,7 +93,7 @@ def synonyms(query: str, **kwargs): if response.status_code == 200: result = response.json() if len(result) == 0: - raise LookupError('No matching CURIE found for the given string ' + query) + raise LookupError('No matching CURIE found for the given string ' + str(query)) else: all_nodes = {} for k, node in result.items(): @@ -172,3 +172,42 @@ def batch_lookup(strings:list[str], size: int=25, return_top_response:bool=True, else: raise requests.RequestException('Response from server had error, code ' + str(response.status_code) + ' ' + str(response)) return curies + + +def batch_synonyms(strings:list[str], size:int=50, **kwargs) -> dict: + """ + A wrapper around the `synonyms` API endpoint, using POST. Given a list of CURIEs, this returns a dict of CURIE:TranslatorNode, where each TranslatorNode contains all synonyms for the given CURIE. + + Parameters + ---------- + strings : list[str] + List of CURIEs. + size : int + Desired chunking size, default is 50. + + Returns + ------- + Dict of CURIE : TranslatorNode + """ + chunks = chunk_list(strings, size) + path = urllib.parse.urljoin(URL, 'synonyms') + curies = {} + for chunk in chunks: + # set autocomplete to be false by default + response = requests.post(path, json={'preferred_curies': chunk, **kwargs}) + if response.status_code == 200: + result = response.json() + if len(result) == 0: + raise LookupError('No matching CURIE found for the given CURIEs ' + str(chunks)) + else: + for k, node in result.items(): + if not node: + # If node is empty or None. + curies[k] = None + else: + curies[k] = TranslatorNode.from_dict(node, return_synonyms=True) + else: + raise requests.RequestException('Response from server had error, code ' + str(response.status_code) + ' ' + str(response)) + return curies + + diff --git a/notebooks/Pathfinder_new.ipynb b/notebooks/Pathfinder_new.ipynb index 6e33b5f..c4c1db1 100644 --- a/notebooks/Pathfinder_new.ipynb +++ b/notebooks/Pathfinder_new.ipynb @@ -108,33 +108,24 @@ "output_type": "stream", "text": [ "MONDO:0100096\n", - "CATRAX BigGIM DrugResponse Performance Phase KP - TRAPI 1.5.0: Success!\n", - "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n", "Automat-genome-alliance(Trapi v1.5.0): Success!\n", + "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n", + "CATRAX BigGIM DrugResponse Performance Phase KP - TRAPI 1.5.0: Success!\n", "RTX KG2 - TRAPI 1.5.0: Success!\n", "Automat-cam-kp(Trapi v1.5.0): Success!\n", "Automat-hetionet(Trapi v1.5.0): Success!\n", + "Automat-robokop(Trapi v1.5.0): Success!\n", "Service Provider TRAPI: Success!\n", "BioThings Explorer (BTE) TRAPI: Success!\n", - "Automat-robokop(Trapi v1.5.0): Success!\n", "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n", "Clinical Trials KP - TRAPI 1.5.0: Success!\n", "Microbiome KP - TRAPI 1.5.0: Success!\n", - "RTX KG2 - TRAPI 1.5.0: Success!\n", - "ENSEMBL:ENSP00000423463: no preferred name\n", - "ENSEMBL:ENSP00000423463: no preferred name\n", - "UniProtKB:P01308-1: no preferred name\n", - "UniProtKB:P01308-1: no preferred name\n", - "UniProtKB:P12544-1: no preferred name\n", - "UniProtKB:P12544-1: no preferred name\n", - "NodeNorm does not know about these identifiers: NCIT:C16612,UniProtKB:P05113-1,UniProtKB:P05113-1\n", - "NodeNorm does not know about these identifiers: UMLS:C5911035,UMLS:C5943250,UMLS:C5959582,UMLS:C5908975,UMLS:C5926549,UMLS:C5943245,CHEMBL.TARGET:CHEMBL1697664,CHEMBL.TARGET:CHEMBL6188,CHEMBL.TARGET:CHEMBL4295579,CHEMBL.TARGET:CHEMBL1075031,CHEMBL.TARGET:CHEMBL4295609,CHEMBL.TARGET:CHEMBL1287621,CHEMBL.TARGET:CHEMBL3499,CHEMBL.TARGET:CHEMBL4295625,CHEMBL.TARGET:CHEMBL4295542,CHEMBL.TARGET:CHEMBL1741198,CHEMBL.TARGET:CHEMBL3309030,CHEMBL.TARGET:CHEMBL4882,CHEMBL.TARGET:CHEMBL3309037\n", - "Number of possible paths: 680\n" + "RTX KG2 - TRAPI 1.5.0: Success!\n" ] } ], "source": [ - "result1, result2, formatted_output, paths = TCT_pathfinder.pathfinder(input_node1_id='NCBIGene:3458', #IFNG\n", + "result1, result2, formatted_output = TCT_pathfinder.pathfinder(input_node1_id='NCBIGene:3458', #IFNG\n", " input_node2_id= 'MONDO:0100096', #COVID-19\n", " intermediate_categories=['biolink:Protein', 'biolink:Gene' ], \n", " APInames=select_APIs, \n", @@ -153,172 +144,13 @@ { "cell_type": "code", "execution_count": 5, - "id": "c1e3f672-a900-4e45-b197-8874a8f44c32", + "id": "fcfb4342-ed1b-401d-8ffa-862978f0c275", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
scoreoutput_nodepredicates1predicates2output_node_name
PDCD10.133333NCBIGene:5133biolink:interacts_with; biolink:affectsbiolink:related_to; biolink:gene_associated_wi...PDCD1
IL17A0.133333NCBIGene:3605biolink:interacts_with; biolink:affectsbiolink:related_to; biolink:gene_associated_wi...IL17A
NLRP30.133333NCBIGene:114548biolink:physically_interacts_with; biolink:aff...biolink:related_to; biolink:gene_associated_wi...NLRP3
IL130.133333NCBIGene:3596biolink:interacts_with; biolink:affectsbiolink:related_to; biolink:gene_associated_wi...IL13
TNF0.133333NCBIGene:7124biolink:genetically_interacts_with; biolink:in...biolink:gene_associated_with_condition; biolin...TNF
..................
IFIT20.033333NCBIGene:3433biolink:physically_interacts_withbiolink:gene_associated_with_conditionIFIT2
GPX30.033333NCBIGene:2878biolink:physically_interacts_withbiolink:gene_associated_with_conditionGPX3
RETNLB0.033333NCBIGene:84666biolink:interacts_with; biolink:physically_int...biolink:gene_associated_with_conditionRETNLB
NT5E0.033333NCBIGene:4907biolink:affectsbiolink:gene_associated_with_conditionNT5E
H2BC210.033333NCBIGene:8349biolink:physically_interacts_withbiolink:gene_associated_with_conditionH2BC21
\n", - "

680 rows × 5 columns

\n", - "
" - ], "text/plain": [ - " score output_node \\\n", - "PDCD1 0.133333 NCBIGene:5133 \n", - "IL17A 0.133333 NCBIGene:3605 \n", - "NLRP3 0.133333 NCBIGene:114548 \n", - "IL13 0.133333 NCBIGene:3596 \n", - "TNF 0.133333 NCBIGene:7124 \n", - "... ... ... \n", - "IFIT2 0.033333 NCBIGene:3433 \n", - "GPX3 0.033333 NCBIGene:2878 \n", - "RETNLB 0.033333 NCBIGene:84666 \n", - "NT5E 0.033333 NCBIGene:4907 \n", - "H2BC21 0.033333 NCBIGene:8349 \n", - "\n", - " predicates1 \\\n", - "PDCD1 biolink:interacts_with; biolink:affects \n", - "IL17A biolink:interacts_with; biolink:affects \n", - "NLRP3 biolink:physically_interacts_with; biolink:aff... \n", - "IL13 biolink:interacts_with; biolink:affects \n", - "TNF biolink:genetically_interacts_with; biolink:in... \n", - "... ... \n", - "IFIT2 biolink:physically_interacts_with \n", - "GPX3 biolink:physically_interacts_with \n", - "RETNLB biolink:interacts_with; biolink:physically_int... \n", - "NT5E biolink:affects \n", - "H2BC21 biolink:physically_interacts_with \n", - "\n", - " predicates2 output_node_name \n", - "PDCD1 biolink:related_to; biolink:gene_associated_wi... PDCD1 \n", - "IL17A biolink:related_to; biolink:gene_associated_wi... IL17A \n", - "NLRP3 biolink:related_to; biolink:gene_associated_wi... NLRP3 \n", - "IL13 biolink:related_to; biolink:gene_associated_wi... IL13 \n", - "TNF biolink:gene_associated_with_condition; biolin... TNF \n", - "... ... ... \n", - "IFIT2 biolink:gene_associated_with_condition IFIT2 \n", - "GPX3 biolink:gene_associated_with_condition GPX3 \n", - "RETNLB biolink:gene_associated_with_condition RETNLB \n", - "NT5E biolink:gene_associated_with_condition NT5E \n", - "H2BC21 biolink:gene_associated_with_condition H2BC21 \n", - "\n", - "[680 rows x 5 columns]" + "dict_keys(['query_graph', 'knowledge_graph', 'results', 'auxiliary_graphs'])" ] }, "execution_count": 5, @@ -327,19 +159,59 @@ } ], "source": [ - "paths" + "formatted_output.keys()" ] }, { "cell_type": "code", "execution_count": 6, - "id": "fcfb4342-ed1b-401d-8ffa-862978f0c275", + "id": "9847dbb5-0d63-496b-8969-4dd330f3897c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "dict_keys(['query_graph', 'knowledge_graph', 'results', 'auxiliary_graphs'])" + "{'sources': [{'resource_id': 'infores:automat-cam-kp',\n", + " 'resource_role': 'aggregator_knowledge_source',\n", + " 'upstream_resource_ids': ['infores:cam-kp']},\n", + " {'resource_id': 'infores:ctd',\n", + " 'resource_role': 'primary_knowledge_source',\n", + " 'upstream_resource_ids': []},\n", + " {'resource_id': 'infores:cam-kp',\n", + " 'resource_role': 'aggregator_knowledge_source',\n", + " 'upstream_resource_ids': ['infores:ctd']}],\n", + " 'qualifiers': [],\n", + " 'predicate': 'biolink:affects',\n", + " 'attributes': [{'attribute_type_id': 'biolink:xref',\n", + " 'value': ['http://ctdbase.org/detail.go?type=relationship&ixnId=4118485',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=3478223',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=8453602',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=2683440',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=7638084',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=2746570',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=3269154',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=6583144',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=2763529',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=2691105',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=2749488',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=4935193',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=2763531',\n", + " 'http://ctdbase.org/detail.go?type=relationship&ixnId=8453590'],\n", + " 'original_attribute_name': 'xref'},\n", + " {'attribute_type_id': 'biolink:knowledge_level',\n", + " 'value': 'knowledge_assertion',\n", + " 'original_attribute_name': 'knowledge_level'},\n", + " {'attribute_type_id': 'biolink:agent_type',\n", + " 'value': 'manual_agent',\n", + " 'original_attribute_name': 'agent_type'},\n", + " {'attribute_type_id': 'biolink:original_object',\n", + " 'value': 'NCBIGene:7124',\n", + " 'original_attribute_name': 'original_object'},\n", + " {'attribute_type_id': 'biolink:original_subject',\n", + " 'value': 'NCBIGene:3458',\n", + " 'original_attribute_name': 'original_subject'}],\n", + " 'subject': 'NCBIGene:3458',\n", + " 'object': 'NCBIGene:7124'}" ] }, "execution_count": 6, @@ -348,14 +220,83 @@ } ], "source": [ - "formatted_output.keys()" + "list(formatted_output['knowledge_graph']['edges'].values())[0]" ] }, { "cell_type": "code", "execution_count": 7, - "id": "9847dbb5-0d63-496b-8969-4dd330f3897c", + "id": "c2ac8ad6-218f-4805-8c72-8cf21df4ac7c", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['19a467d8',\n", + " 'e_222',\n", + " '6ab7f02f',\n", + " '82968adc',\n", + " '978f6ce1',\n", + " '3571016d',\n", + " '119d5072',\n", + " '73f57667f20d4136d56ce1c56180ed4d',\n", + " '64df3ca7c1bb4570261ba9a485ab6ea2',\n", + " '5866d665014fe74cbd4e9d8b5744d6d0',\n", + " '9983f5c2918b0090337d2d597305d8c2',\n", + " 'd15478a27e98d66ab143989bd157dce5',\n", + " '946805402aa3b12a8a80e5c8ef4833cd',\n", + " 'a846d99d439b616b0e14a474f8254349',\n", + " '710009b429dc63bbd2c32329e1df16ec',\n", + " 'b5c61c0cce4e6273234ff58e111c06a2',\n", + " '26c765858975b8618f2184804a19ba05',\n", + " '3f01ede548609170f5828b6ffaf7d972',\n", + " 'e82906ca6f38a7ead24a067a3e435b38',\n", + " '0169afa0ab05defb81aafa9fce39e242',\n", + " '58d466d9f807a877150013e0691a25fc',\n", + " 'b21949aec9d249b6da58d3c67c8df73e',\n", + " 'd1e2ef5eec04efbc499889be6ce1eea7',\n", + " '4e5d8e79b1308e3ae15fb55298536387',\n", + " 'bf585df1c25702eb7449d34671891147',\n", + " '60975f91744b9f127a59a6322c2f7cb7',\n", + " '665990a36b9cba77e5e6d6645b19bbc3',\n", + " '78e8ccbbb83c35759418c71decafc356',\n", + " '109a1f3bfa18c2975ea82a36c31255e8',\n", + " '3c660b48a0849cc783ad03cbfded02ab',\n", + " '3f0b12128de418c82724fefb0452b2eb',\n", + " '3a5203d87d6f9e6f584b9843155aac05',\n", + " '28d7f20f51ddf666407ab90ae87bc55b',\n", + " 'e02741c698e72353f95c4a529ba01438',\n", + " 'f35a81cd97153cce97d982d7710b558c',\n", + " '712ff9a3be50edc0901d3e02757589f2',\n", + " 'fbe597bca092f6478afb3d2f8f31ee86',\n", + " '34d8d05a6d2bdd0541f72690b8f2f6e9',\n", + " 'd6f6af8c653dd95f918f99e15b4c0b0a',\n", + " '84e026bbdab961e56f2bdf6f30f9486b',\n", + " 'bb54be6d72593ab41809f3c7294a31a7',\n", + " '7b393fa1d78193a0e6e05fd1b67f6352',\n", + " '10e45932c3590ec64fd0dce9f7f69468',\n", + " '86de16e01a16264dcae011be9ba31b2e',\n", + " 'e97a340841d8f91890e6d10fb8def2d1',\n", + " '45319800',\n", + " '8155745']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(formatted_output['auxiliary_graphs'].values())[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e63554d9-5d04-4636-8c37-3e7aaf2ab641", + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -407,43 +348,13 @@ " 'subject': 'UMLS:C0017337'}" ] }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(formatted_output['knowledge_graph']['edges'].values())[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "c2ac8ad6-218f-4805-8c72-8cf21df4ac7c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['12026540',\n", - " '13008760',\n", - " '13008998',\n", - " '1451206',\n", - " '25948818',\n", - " '12006224',\n", - " '12009572',\n", - " '12011840',\n", - " '12014211',\n", - " '12034620']" - ] - }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "list(formatted_output['auxiliary_graphs'].values())[0]" + "formatted_output['knowledge_graph']['edges']['12026540']" ] }, { @@ -498,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "8540fc14-dd13-444f-b9d9-afae72b31b49", "metadata": {}, "outputs": [ @@ -506,14 +417,14 @@ "data": { "text/plain": [ "{'attributes': None,\n", - " 'path_bindings': {'p0': [{'id': 'aux_1_UMLS:C0017337'}]},\n", + " 'path_bindings': {'p0': [{'id': 'aux_1_NCBIGene:7124'}]},\n", " 'resource_id': 'infores:tct',\n", - " 'score': 0.14285714285714285,\n", + " 'score': 0.7391304347826086,\n", " 'scoring_method': None,\n", " 'support_graphs': None}" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -524,9 +435,107 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "a6ba8e09-c1aa-4010-86cf-722bc9c27058", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'attributes': [{'attribute_source': 'infores:catrax-pharmacogenomics',\n", + " 'attribute_type_id': 'biolink:publications',\n", + " 'value': 'PMID:36243968|PMID:9397972',\n", + " 'value_type_id': 'biolink:Uriorcurie'},\n", + " {'attribute_type_id': 'knowledge_source', 'value': 'SIGNOR-267487'},\n", + " {'attribute_type_id': 'subject_category', 'value': 'biolink:Gene'},\n", + " {'attribute_source': 'infores:catrax-pharmacogenomics',\n", + " 'attribute_type_id': 'biolink:knowledge_level',\n", + " 'value': 'knowledge_assertion'},\n", + " {'attribute_type_id': 'object_name', 'value': 'DIO1'},\n", + " {'attribute_type_id': 'subject_name', 'value': 'IFNG'},\n", + " {'attribute_source': 'infores:catrax-pharmacogenomics',\n", + " 'attribute_type_id': 'biolink:agent_type',\n", + " 'value': 'automated_agent'},\n", + " {'attribute_type_id': 'object_category', 'value': 'biolink:Gene'},\n", + " {'attribute_type_id': 'provided_by', 'value': 'SIGNOR'}],\n", + " 'object': 'NCBIGene:1733',\n", + " 'predicate': 'biolink:regulates',\n", + " 'qualifiers': [{'qualifier_type_id': 'biolink:object_direction_qualifier',\n", + " 'qualifier_value': 'downregulated'}],\n", + " 'sources': [{'resource_id': 'infores:catrax-pharmacogenomics',\n", + " 'resource_role': 'primary_knowledge_source'},\n", + " {'resource_id': 'N/A', 'resource_role': 'supporting_data_source'}],\n", + " 'subject': 'NCBIGene:3458'}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result1['01a09566-2a29-5eae-9160-e44a18241c09']" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "baa9a580-c3e8-4b27-834f-d7c5c1c297d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6822" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(result1)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ad5eaf3e-205d-43cb-8c53-d14b489ba5a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2658" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(result2)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "51fe55e0-5871-4714-ae46-8fe0647a6c99", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('result.json', 'w') as f:\n", + " json.dump(formatted_output, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6460b38-8bab-4247-b693-ef90c5a4e3d7", + "metadata": {}, "outputs": [], "source": [] } From 384c56a3e8ae39edafcbd2161f8cf8908a4c25fa Mon Sep 17 00:00:00 2001 From: yjzhang Date: Thu, 30 Apr 2026 18:37:43 -0700 Subject: [PATCH 4/7] start on neighborhood finder --- TCT/neighborhood_finder.py | 324 +++++++++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 TCT/neighborhood_finder.py diff --git a/TCT/neighborhood_finder.py b/TCT/neighborhood_finder.py new file mode 100644 index 0000000..5310794 --- /dev/null +++ b/TCT/neighborhood_finder.py @@ -0,0 +1,324 @@ + + +def format_query_json_for_pathfinder(subject_ids, object_ids=None, + subject_categories=None, + object_categories=None, + predicates=None): + ''' + Example input: + subject_ids = ["NCBIGene:3845"] + object_ids = [] + subject_categories = ["biolink:Gene"] + object_categories = ["biolink:Gene"] + predicates = ["biolink:positively_correlated_with", "biolink:physically_interacts_with"] + ''' + query_json_temp = { + "message": { + "query_graph": { + + "edges": { + "e00": { + "subject": "n00", + "object": "n01", + "predicates": predicates + } + }, + "nodes": { + "n00": { + "ids":subject_ids, # required + #"categories":[] # optional, if not provided, it will be empty + }, + "n01": { + #"ids":[], + "categories":[] # required + } + } + } + } + } + + if len(subject_ids) > 0: + query_json_temp["message"]["query_graph"]["nodes"]["n00"]["ids"] = subject_ids + + if object_ids is not None and len(object_ids) > 0: + query_json_temp["message"]["query_graph"]["nodes"]["n01"]["ids"] = object_ids + + if subject_categories is not None and len(subject_categories) > 0: + query_json_temp["message"]["query_graph"]["nodes"]["n00"]["categories"] = subject_categories + + if object_categories is not None and len(object_categories) > 0: + query_json_temp["message"]["query_graph"]["nodes"]["n01"]["categories"] = object_categories + + if predicates is not None and len(predicates) > 0: + query_json_temp["message"]["query_graph"]["edges"]["e00"]["predicates"] = predicates + + return query_json_temp + + +def build_query_graph(start_node_id, end_node_id, start_node_categories=None, end_node_categories=None): + """ + start_node_categories and end_node_categories are lists of categories. + """ + q = { + "nodes": { + "on": { + "categories": end_node_categories, + "constraints": [], + "ids": [ + end_node_id + ], + "is_set": False, + "option_group_id": None, + "set_id": None, + "set_interpretation": "BATCH" + }, + "sn": { + "categories": start_node_categories, + "constraints": [], + "ids": [ + start_node_id + ], + "is_set": False, + "option_group_id": None, + "set_id": None, + "set_interpretation": "BATCH" + } + }, + "paths": { + "p0": { + "constraints": None, + "object": "on", + "predicates": None, + "subject": "sn" + } + } + } + return q + + +def generate_score_results(results, method='infores'): + """ + Generates a score dict, and a list of "analyses". + method can be 'infores' or 'edges' + """ + graph_scores = {} + max_score = 0 + auxiliary_graphs = results['auxiliary_graphs'] + for k, graph in auxiliary_graphs.items(): + if method == 'infores': + sources = set() + for edge_index in graph: + edge = results['knowledge_graph']['edges'][edge_index] + for resource in edge['sources']: + sources.add(resource['resource_id']) + score = len(sources) + if score > max_score: + max_score = score + else: + score = len(graph) + if score > max_score: + max_score = score + graph_scores[k] = score + graph_scores_formatted = [] + for k in graph_scores.keys(): + graph_scores[k] = graph_scores[k]/max_score + graph_scores_formatted.append({ + 'attributes': None, + 'path_bindings': { + 'p0': [{'id': k}]}, + 'resource_id': 'infores:tct', + 'score': graph_scores[k], + 'scoring_method': None, + 'support_graphs': None + }) + return graph_scores, graph_scores_formatted + + +def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dict, result2:dict, + start_node_categories=None, end_node_categories=None, + get_node_info=True, + scoring_method='infores'): + """ + Converts the results of two TRAPI queries into the same general json format as the other pathfinder APIs. + scoring_method is how the node scores are generated, and could be 'infores' or 'edges'. + """ + # nodes + # TODO: get some node info? node attributes + node_info = {} + # edges is a dict of intermediate nodes + intermediate_node_edges = {} + for k, v in result1.items(): + i1 = v['subject'] + i2 = v['object'] + s_o = 'object' + if i1 == start_node_id: + intermediate_node_id = i2 + s_o = 'object' + elif i2 == start_node_id: + intermediate_node_id = i1 + s_o = 'subject' + else: + continue + if (i1 == start_node_id or i2 == start_node_id) and intermediate_node_id in intermediate_node_edges: + intermediate_node_edges[intermediate_node_id].append((k, v)) + else: + intermediate_node_edges[intermediate_node_id] = [(k, v)] + # add node dict + if intermediate_node_id not in node_info: + node_dict = { + } + node_info[intermediate_node_id] = node_dict + else: + node_dict = node_info[intermediate_node_id] + for attribute in v['attributes']: + if attribute['attribute_type_id'] == f'{s_o}_category': + if 'categories' not in node_dict: + node_dict['categories'] = set([attribute['value']]) + else: + node_dict['categories'].add(attribute['value']) + if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict: + node_dict['name'] = attribute['value'] + node_info[intermediate_node_id] = node_dict + connecting_intermediate_nodes = {} + for k, v in result2.items(): + i1 = v['subject'] + i2 = v['object'] + if i1 == end_node_id: + intermediate_node_id = i2 + s_o = 'object' + elif i2 == end_node_id: + intermediate_node_id = i1 + s_o = 'subject' + else: + continue + if (i1 == end_node_id or i2 == end_node_id) and intermediate_node_id in intermediate_node_edges: + if intermediate_node_id in connecting_intermediate_nodes: + connecting_intermediate_nodes[intermediate_node_id]['e2'].append((k, v)) + else: + connecting_intermediate_nodes[intermediate_node_id] = {'e1': intermediate_node_edges[intermediate_node_id], 'e2' : [(k, v)]} + if intermediate_node_id not in node_info: + node_dict = { + } + node_info[intermediate_node_id] = node_dict + else: + node_dict = node_info[intermediate_node_id] + for attribute in v['attributes']: + if attribute['attribute_type_id'] == f'{s_o}_category': + if 'categories' not in node_dict: + node_dict['categories'] = set([attribute['value']]) + else: + node_dict['categories'].add(attribute['value']) + if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict: + node_dict['name'] = attribute['value'] + node_info[intermediate_node_id] = node_dict + for k, v in node_info.items(): + if 'categories' in v: + v['categories'] = list(v['categories']) + all_edges = {} + all_auxiliary_graphs = {} + i = 1 + # sort connecting_intermediate_nodes by total number of connections + connection_counts = Counter({k: len(v['e1'])*len(v['e2']) for k, v in connecting_intermediate_nodes.items()}) + for i1, count in connection_counts.most_common(): + kv = connecting_intermediate_nodes[i1] + e1s = kv['e1'] + e2s = kv['e2'] + edges = {k: v for k, v in e1s} + edges.update({k: v for k, v in e2s}) + all_edges.update(edges) + keys = [x[0] for x in e1s] + [x[0] for x in e2s] + all_auxiliary_graphs[f'aux_{i}_{i1}'] = keys + i += 1 + # generate output json + output = { + 'query_graph': build_query_graph(start_node_id, end_node_id, start_node_categories, end_node_categories), + # TODO: don't drop the nodes + 'knowledge_graph': {'nodes': {x: node_info[x] for x in connection_counts.keys()}, + 'edges': all_edges, + }, + 'results': [{'analyses': []}], + 'auxiliary_graphs': all_auxiliary_graphs + } + graph_scores, graph_scores_formatted = generate_score_results(output, method=scoring_method) + output['results'][0]['analyses'] = graph_scores_formatted + if get_node_info: + from .node_normalizer import get_normalized_nodes + nodes_to_add = [] + for k, v in output['knowledge_graph']['nodes'].items(): + if 'name' not in v or 'categories' not in v: + nodes_to_add.append(k) + normalized_nodes = get_normalized_nodes(nodes_to_add, mode='post') + for node_id in nodes_to_add: + nn = normalized_nodes[node_id] + output['knowledge_graph']['nodes'][node_id] = {'name': nn.label, 'categories': nn.types} + return output + +def neighborhood_finder(input_node, node2_categories, APInames, metaKG, API_predicates, input_node_category = []): + """ + This function is used to find the neighborhood of a given input node with intermediate categories. + + -------------- + Parameters: + input_node (str): The input node - should be a CURIE id. + node2_categories (list): A list of intermediate categories to be used in the neighborhood finding process. + APInames (dict): A dictionary containing the names of the APIs to be used. + metaKG (DataFrame): The metadata knowledge graph containing information about the APIs and their predicates. + API_predicates (dict): A dictionary containing the predicates for each API. + input_node_category (list): Optional. A list of categories for the input node. If empty, it will be derived from the input node's types. + + -------------- + Returns: + input_node_id (str): The curie id of the input node. + result (dict): The result of the query for the input node. + result_parsed (DataFrame): The parsed results for the input node. + result_ranked_by_primary_infores (DataFrame): The ranked results based on primary infores. + + -------------- + Example: + >>> input_node_id, result, result_parsed, result_ranked_by_primary_infores1 = Neighborhood_finder('MONDO:0008170', #Ovarian Cancer + node2_categories = ['biolink:SmallMolecule', 'biolink:Drug', 'biolink:ChemicalEntity'], + APInames = APInames, + metaKG = metaKG, + API_predicates = API_predicates) + -------------- + + """ + from . import node_normalizer + from . import translator_query + + input_node_id = input_node + # Step 1: Resolve the input node to get its curie id and categories + input_node_info = node_normalizer.get_normalized_nodes(input_node_id) + print(input_node_id) + + if len(input_node_category) == 0: + input_node_category = input_node_info.types + else: + input_node_category = list(set(input_node_category).intersection(set(input_node_info.types))) + if len(input_node_category) == 0: + input_node_category = input_node_info.types + + # Step 2: Select predicates and APIs based on the intermediate categories + sele_predicates, sele_APIs, API_URLs = sele_predicates_API(input_node_category, + node2_categories, + metaKG, APInames) + + # Step 3: Format the query JSON for the input node + query_json = format_query_json([input_node_id], [], + [input_node_category], + node2_categories, + sele_predicates) + + # Step 4: Query the APIs in parallel + result = translator_query.parallel_api_query(query_json=query_json, + select_APIs= sele_APIs, + APInames=APInames, + API_predicates=API_predicates, + max_workers=len(sele_APIs)) + result_parsed = parse_KG(result) + # Step 7: Ranking the results. This ranking method is based on the number of unique + # primary infores. It can only be used to rank the results with one defined node. + result_ranked_by_primary_infores1 = rank_by_primary_infores(result_parsed, input_node_id) # input_node1_id is the curie id of the + return input_node_id, result, result_parsed, result_ranked_by_primary_infores1 + + From d712fe3c1bce366c2c58a66db0ca214417fb31a7 Mon Sep 17 00:00:00 2001 From: yjzhang Date: Thu, 30 Apr 2026 19:47:10 -0700 Subject: [PATCH 5/7] neighborhood finder - new output type --- TCT/neighborhood_finder.py | 72 +++++++++++--------------------------- 1 file changed, 20 insertions(+), 52 deletions(-) diff --git a/TCT/neighborhood_finder.py b/TCT/neighborhood_finder.py index 5310794..03ca8c9 100644 --- a/TCT/neighborhood_finder.py +++ b/TCT/neighborhood_finder.py @@ -1,6 +1,9 @@ +from collections import Counter +from .TCT import sele_predicates_API, format_query_json, parse_KG, rank_by_primary_infores -def format_query_json_for_pathfinder(subject_ids, object_ids=None, + +def format_query_json_for_neighborhood_finder(subject_ids, object_ids=None, subject_categories=None, object_categories=None, predicates=None): @@ -134,7 +137,7 @@ def generate_score_results(results, method='infores'): return graph_scores, graph_scores_formatted -def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dict, result2:dict, +def parse_results_for_neighborhood_finder(start_node_id:str, results:dict, start_node_categories=None, end_node_categories=None, get_node_info=True, scoring_method='infores'): @@ -143,11 +146,10 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic scoring_method is how the node scores are generated, and could be 'infores' or 'edges'. """ # nodes - # TODO: get some node info? node attributes node_info = {} # edges is a dict of intermediate nodes - intermediate_node_edges = {} - for k, v in result1.items(): + node_edges = {} + for k, v in results.items(): i1 = v['subject'] i2 = v['object'] s_o = 'object' @@ -159,10 +161,10 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic s_o = 'subject' else: continue - if (i1 == start_node_id or i2 == start_node_id) and intermediate_node_id in intermediate_node_edges: - intermediate_node_edges[intermediate_node_id].append((k, v)) + if (i1 == start_node_id or i2 == start_node_id) and intermediate_node_id in node_edges: + node_edges[intermediate_node_id].append((k, v)) else: - intermediate_node_edges[intermediate_node_id] = [(k, v)] + node_edges[intermediate_node_id] = [(k, v)] # add node dict if intermediate_node_id not in node_info: node_dict = { @@ -179,38 +181,6 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict: node_dict['name'] = attribute['value'] node_info[intermediate_node_id] = node_dict - connecting_intermediate_nodes = {} - for k, v in result2.items(): - i1 = v['subject'] - i2 = v['object'] - if i1 == end_node_id: - intermediate_node_id = i2 - s_o = 'object' - elif i2 == end_node_id: - intermediate_node_id = i1 - s_o = 'subject' - else: - continue - if (i1 == end_node_id or i2 == end_node_id) and intermediate_node_id in intermediate_node_edges: - if intermediate_node_id in connecting_intermediate_nodes: - connecting_intermediate_nodes[intermediate_node_id]['e2'].append((k, v)) - else: - connecting_intermediate_nodes[intermediate_node_id] = {'e1': intermediate_node_edges[intermediate_node_id], 'e2' : [(k, v)]} - if intermediate_node_id not in node_info: - node_dict = { - } - node_info[intermediate_node_id] = node_dict - else: - node_dict = node_info[intermediate_node_id] - for attribute in v['attributes']: - if attribute['attribute_type_id'] == f'{s_o}_category': - if 'categories' not in node_dict: - node_dict['categories'] = set([attribute['value']]) - else: - node_dict['categories'].add(attribute['value']) - if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict: - node_dict['name'] = attribute['value'] - node_info[intermediate_node_id] = node_dict for k, v in node_info.items(): if 'categories' in v: v['categories'] = list(v['categories']) @@ -218,21 +188,16 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic all_auxiliary_graphs = {} i = 1 # sort connecting_intermediate_nodes by total number of connections - connection_counts = Counter({k: len(v['e1'])*len(v['e2']) for k, v in connecting_intermediate_nodes.items()}) + connection_counts = Counter({k: len(v) for k, v in node_edges.items()}) for i1, count in connection_counts.most_common(): - kv = connecting_intermediate_nodes[i1] - e1s = kv['e1'] - e2s = kv['e2'] - edges = {k: v for k, v in e1s} - edges.update({k: v for k, v in e2s}) - all_edges.update(edges) - keys = [x[0] for x in e1s] + [x[0] for x in e2s] + edges = node_edges[i1] + all_edges.update({k: v for k, v in edges}) + keys = [x[0] for x in edges] all_auxiliary_graphs[f'aux_{i}_{i1}'] = keys i += 1 # generate output json output = { - 'query_graph': build_query_graph(start_node_id, end_node_id, start_node_categories, end_node_categories), - # TODO: don't drop the nodes + 'query_graph': build_query_graph(start_node_id, '', start_node_categories, end_node_categories), 'knowledge_graph': {'nodes': {x: node_info[x] for x in connection_counts.keys()}, 'edges': all_edges, }, @@ -250,9 +215,11 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic normalized_nodes = get_normalized_nodes(nodes_to_add, mode='post') for node_id in nodes_to_add: nn = normalized_nodes[node_id] - output['knowledge_graph']['nodes'][node_id] = {'name': nn.label, 'categories': nn.types} + if nn is not None: + output['knowledge_graph']['nodes'][node_id] = {'name': nn.label, 'categories': nn.types} return output + def neighborhood_finder(input_node, node2_categories, APInames, metaKG, API_predicates, input_node_category = []): """ This function is used to find the neighborhood of a given input node with intermediate categories. @@ -319,6 +286,7 @@ def neighborhood_finder(input_node, node2_categories, APInames, metaKG, API_pred # Step 7: Ranking the results. This ranking method is based on the number of unique # primary infores. It can only be used to rank the results with one defined node. result_ranked_by_primary_infores1 = rank_by_primary_infores(result_parsed, input_node_id) # input_node1_id is the curie id of the - return input_node_id, result, result_parsed, result_ranked_by_primary_infores1 + parsed_results = parse_results_for_neighborhood_finder(input_node_id, result, input_node_category, node2_categories) + return input_node_id, result, parsed_results, result_ranked_by_primary_infores1 From 3b097a9a7a1dd1f3cf9ae3ae9f64b99fa04ae7d5 Mon Sep 17 00:00:00 2001 From: yjzhang Date: Thu, 30 Apr 2026 19:48:24 -0700 Subject: [PATCH 6/7] removed extraneous function --- TCT/neighborhood_finder.py | 55 -------------------------------------- 1 file changed, 55 deletions(-) diff --git a/TCT/neighborhood_finder.py b/TCT/neighborhood_finder.py index 03ca8c9..41f822a 100644 --- a/TCT/neighborhood_finder.py +++ b/TCT/neighborhood_finder.py @@ -3,61 +3,6 @@ from .TCT import sele_predicates_API, format_query_json, parse_KG, rank_by_primary_infores -def format_query_json_for_neighborhood_finder(subject_ids, object_ids=None, - subject_categories=None, - object_categories=None, - predicates=None): - ''' - Example input: - subject_ids = ["NCBIGene:3845"] - object_ids = [] - subject_categories = ["biolink:Gene"] - object_categories = ["biolink:Gene"] - predicates = ["biolink:positively_correlated_with", "biolink:physically_interacts_with"] - ''' - query_json_temp = { - "message": { - "query_graph": { - - "edges": { - "e00": { - "subject": "n00", - "object": "n01", - "predicates": predicates - } - }, - "nodes": { - "n00": { - "ids":subject_ids, # required - #"categories":[] # optional, if not provided, it will be empty - }, - "n01": { - #"ids":[], - "categories":[] # required - } - } - } - } - } - - if len(subject_ids) > 0: - query_json_temp["message"]["query_graph"]["nodes"]["n00"]["ids"] = subject_ids - - if object_ids is not None and len(object_ids) > 0: - query_json_temp["message"]["query_graph"]["nodes"]["n01"]["ids"] = object_ids - - if subject_categories is not None and len(subject_categories) > 0: - query_json_temp["message"]["query_graph"]["nodes"]["n00"]["categories"] = subject_categories - - if object_categories is not None and len(object_categories) > 0: - query_json_temp["message"]["query_graph"]["nodes"]["n01"]["categories"] = object_categories - - if predicates is not None and len(predicates) > 0: - query_json_temp["message"]["query_graph"]["edges"]["e00"]["predicates"] = predicates - - return query_json_temp - - def build_query_graph(start_node_id, end_node_id, start_node_categories=None, end_node_categories=None): """ start_node_categories and end_node_categories are lists of categories. From 2ffe87d7dc0bad03e84f08549197dc15560762ce Mon Sep 17 00:00:00 2001 From: yjzhang Date: Thu, 30 Apr 2026 19:49:36 -0700 Subject: [PATCH 7/7] consolidated some duplicate functions --- TCT/neighborhood_finder.py | 81 +------------------------------------- 1 file changed, 1 insertion(+), 80 deletions(-) diff --git a/TCT/neighborhood_finder.py b/TCT/neighborhood_finder.py index 41f822a..4c8018a 100644 --- a/TCT/neighborhood_finder.py +++ b/TCT/neighborhood_finder.py @@ -1,86 +1,7 @@ from collections import Counter from .TCT import sele_predicates_API, format_query_json, parse_KG, rank_by_primary_infores - - -def build_query_graph(start_node_id, end_node_id, start_node_categories=None, end_node_categories=None): - """ - start_node_categories and end_node_categories are lists of categories. - """ - q = { - "nodes": { - "on": { - "categories": end_node_categories, - "constraints": [], - "ids": [ - end_node_id - ], - "is_set": False, - "option_group_id": None, - "set_id": None, - "set_interpretation": "BATCH" - }, - "sn": { - "categories": start_node_categories, - "constraints": [], - "ids": [ - start_node_id - ], - "is_set": False, - "option_group_id": None, - "set_id": None, - "set_interpretation": "BATCH" - } - }, - "paths": { - "p0": { - "constraints": None, - "object": "on", - "predicates": None, - "subject": "sn" - } - } - } - return q - - -def generate_score_results(results, method='infores'): - """ - Generates a score dict, and a list of "analyses". - method can be 'infores' or 'edges' - """ - graph_scores = {} - max_score = 0 - auxiliary_graphs = results['auxiliary_graphs'] - for k, graph in auxiliary_graphs.items(): - if method == 'infores': - sources = set() - for edge_index in graph: - edge = results['knowledge_graph']['edges'][edge_index] - for resource in edge['sources']: - sources.add(resource['resource_id']) - score = len(sources) - if score > max_score: - max_score = score - else: - score = len(graph) - if score > max_score: - max_score = score - graph_scores[k] = score - graph_scores_formatted = [] - for k in graph_scores.keys(): - graph_scores[k] = graph_scores[k]/max_score - graph_scores_formatted.append({ - 'attributes': None, - 'path_bindings': { - 'p0': [{'id': k}]}, - 'resource_id': 'infores:tct', - 'score': graph_scores[k], - 'scoring_method': None, - 'support_graphs': None - }) - return graph_scores, graph_scores_formatted - +from .TCT_pathfinder import generate_score_results, build_query_graph def parse_results_for_neighborhood_finder(start_node_id:str, results:dict, start_node_categories=None, end_node_categories=None,