From a3a166072846bc26e312ba7b95f1f7fc11ce5e0b Mon Sep 17 00:00:00 2001
From: Yue Zhang <yjzhang@cs.washington.edu>
Date: Tue, 28 Apr 2026 22:03:02 -0700
Subject: [PATCH 1/7] fix test error

---
 tests/test_nameres.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_nameres.py b/tests/test_nameres.py
index 3e926b5..d9650a0 100644
--- a/tests/test_nameres.py
+++ b/tests/test_nameres.py
@@ -42,7 +42,7 @@ def test_nameres_status():
     assert status['status'] == 'ok'
     assert status['babel_version'] != ''
     assert status['babel_version_url'] != ''
-    assert status['numDocs'] > 425_000_000
+    assert status['solr']['numDocs'] > 425_000_000
 
 
 def test_nameres_incorrect():

From afffeee5045c65d824f728e1adbf16bd4b5f258e Mon Sep 17 00:00:00 2001
From: Yue Zhang <yjzhang@cs.washington.edu>
Date: Thu, 30 Apr 2026 03:52:32 -0700
Subject: [PATCH 2/7] adding scoring to pathfinder output

---
 TCT/TCT_pathfinder.py          |  56 +++++-
 notebooks/Pathfinder_new.ipynb | 328 +++++++++++++++++++++++++--------
 2 files changed, 306 insertions(+), 78 deletions(-)

diff --git a/TCT/TCT_pathfinder.py b/TCT/TCT_pathfinder.py
index f7a5e28..eff8128 100644
--- a/TCT/TCT_pathfinder.py
+++ b/TCT/TCT_pathfinder.py
@@ -103,10 +103,50 @@ def build_query_graph(start_node_id, end_node_id, start_node_categories=None, en
     return q
 
 
+def generate_score_results(results, method='infores'):
+    """
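+    Scores every auxiliary graph and returns (dict of graph id -> score, list of "analyses" entries).
+    method can be 'infores' (count distinct source resource_ids) or anything else, e.g. 'edges' (count edges); scores are divided by the max score.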
+    """
+    graph_scores = {}
+    max_score = 0
+    auxiliary_graphs = results['auxiliary_graphs']
+    for k, graph in auxiliary_graphs.items():
+        if method == 'infores':
+            sources = set()
+            for edge_index in graph:
+                edge = results['knowledge_graph']['edges'][edge_index]
+                for resource in edge['sources']:
+                    sources.add(resource['resource_id'])
+            score = len(sources)
+            if score > max_score:
+                max_score = score
+        else:
+            score = len(graph)
+            if score > max_score:
+                max_score = score
+        graph_scores[k] = score
+    graph_scores_formatted = []
+    for k in graph_scores.keys():
+        graph_scores[k] = graph_scores[k]/max_score
+        graph_scores_formatted.append({
+            'attributes': None,
+            'path_bindings': {
+                'p0': [{'id': k}]},
+            'resource_id': 'infores:tct',
+            'score': graph_scores[k],
+            'scoring_method': None,
+            'support_graphs': None
+            })
+    return graph_scores, graph_scores_formatted
+
+
 def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dict, result2:dict,
-        start_node_categories=None, end_node_categories=None):
+        start_node_categories=None, end_node_categories=None,
+        scoring_method='infores'):
     """
     Converts the results of two TRAPI queries into the same general json format as the other pathfinder APIs.
+    scoring_method is how the node scores are generated, and could be 'infores' or 'edges'.
     """
     # TODO: parse results...
     # nodes
@@ -155,6 +195,7 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
         keys = [x[0] for x in e1s] + [x[0] for x in e2s]
         all_auxiliary_graphs[f'aux_{i}_{i1}'] = keys
         i += 1
+    # generate output json
     output = {
         'query_graph': build_query_graph(start_node_id, end_node_id, start_node_categories, end_node_categories),
         'knowledge_graph': {'nodes': {x: {} for x in connection_counts.keys()},
@@ -163,11 +204,17 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
         'results': [{'analyses': []}],
         'auxiliary_graphs': all_auxiliary_graphs
     }
+    graph_scores, graph_scores_formatted = generate_score_results(output, method=scoring_method)
+    output['results'][0]['analyses'] = graph_scores_formatted
     return output
 
 
 def pathfinder(input_node1_id:str, input_node2_id:str,
-        intermediate_categories:list, APInames, metaKG, API_predicates):
+        intermediate_categories:list, APInames, metaKG, API_predicates,
+        scoring_method='infores'):
+    """
+    Returns a Pathfinder output for the given pair of nodes. scoring_method could be 'infores' or 'edges'.
+    """
     # get categories for input nodes
     normalized_node_dict = node_normalizer.get_normalized_nodes([input_node1_id, input_node2_id])
     input_node1_info = normalized_node_dict[input_node1_id]
@@ -227,7 +274,8 @@ def pathfinder(input_node1_id:str, input_node2_id:str,
 
     output = parse_results_for_pathfinder(input_node1_id, input_node2_id, result1, result2,
             start_node_categories=input_node1_category,
-            end_node_categories=input_node2_category)
+            end_node_categories=input_node2_category,
+            scoring_method=scoring_method)
 
     return result1, result2, output, paths
 
@@ -278,4 +326,4 @@ def query_arax_pathfinder(node1_id, node1_category, node2_id, node2_category):
     ARAX_endpoint = 'https://arax.ci.transltr.io/api/arax/v1.4/query'
     query_current = format_pathfinder_query(node1_id, node1_category, node2_id, node2_category)
     response = requests.post(ARAX_endpoint, json=query_current)
-    return response
\ No newline at end of file
+    return response
diff --git a/notebooks/Pathfinder_new.ipynb b/notebooks/Pathfinder_new.ipynb
index 67512d1..6e33b5f 100644
--- a/notebooks/Pathfinder_new.ipynb
+++ b/notebooks/Pathfinder_new.ipynb
@@ -108,14 +108,15 @@
      "output_type": "stream",
      "text": [
       "MONDO:0100096\n",
-      "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n",
       "CATRAX BigGIM DrugResponse Performance Phase KP - TRAPI 1.5.0: Success!\n",
+      "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n",
       "Automat-genome-alliance(Trapi v1.5.0): Success!\n",
       "RTX KG2 - TRAPI 1.5.0: Success!\n",
       "Automat-cam-kp(Trapi v1.5.0): Success!\n",
       "Automat-hetionet(Trapi v1.5.0): Success!\n",
-      "Automat-robokop(Trapi v1.5.0): Success!\n",
+      "Service Provider TRAPI: Success!\n",
       "BioThings Explorer (BTE) TRAPI: Success!\n",
+      "Automat-robokop(Trapi v1.5.0): Success!\n",
       "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n",
       "Clinical Trials KP - TRAPI 1.5.0: Success!\n",
       "Microbiome KP - TRAPI 1.5.0: Success!\n",
@@ -141,6 +142,14 @@
     "        API_predicates=API_predicates)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "bf0506c3-6615-421b-bc45-ab4e124e3477",
+   "metadata": {},
+   "source": [
+    "### Examining pathfinder outputs"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -185,36 +194,36 @@
        "      <td>PDCD1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>TNF</th>\n",
-       "      <td>0.133333</td>\n",
-       "      <td>NCBIGene:7124</td>\n",
-       "      <td>biolink:genetically_interacts_with; biolink:in...</td>\n",
-       "      <td>biolink:gene_associated_with_condition; biolin...</td>\n",
-       "      <td>TNF</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>IL13</th>\n",
+       "      <th>IL17A</th>\n",
        "      <td>0.133333</td>\n",
-       "      <td>NCBIGene:3596</td>\n",
+       "      <td>NCBIGene:3605</td>\n",
        "      <td>biolink:interacts_with; biolink:affects</td>\n",
        "      <td>biolink:related_to; biolink:gene_associated_wi...</td>\n",
-       "      <td>IL13</td>\n",
+       "      <td>IL17A</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>NLRP3</th>\n",
        "      <td>0.133333</td>\n",
        "      <td>NCBIGene:114548</td>\n",
-       "      <td>biolink:affects; biolink:physically_interacts_...</td>\n",
+       "      <td>biolink:physically_interacts_with; biolink:aff...</td>\n",
        "      <td>biolink:related_to; biolink:gene_associated_wi...</td>\n",
        "      <td>NLRP3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>IL17A</th>\n",
+       "      <th>IL13</th>\n",
        "      <td>0.133333</td>\n",
-       "      <td>NCBIGene:3605</td>\n",
+       "      <td>NCBIGene:3596</td>\n",
        "      <td>biolink:interacts_with; biolink:affects</td>\n",
        "      <td>biolink:related_to; biolink:gene_associated_wi...</td>\n",
-       "      <td>IL17A</td>\n",
+       "      <td>IL13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TNF</th>\n",
+       "      <td>0.133333</td>\n",
+       "      <td>NCBIGene:7124</td>\n",
+       "      <td>biolink:genetically_interacts_with; biolink:in...</td>\n",
+       "      <td>biolink:gene_associated_with_condition; biolin...</td>\n",
+       "      <td>TNF</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -225,44 +234,44 @@
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>LIF</th>\n",
+       "      <th>IFIT2</th>\n",
        "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:3976</td>\n",
-       "      <td>biolink:affects</td>\n",
+       "      <td>NCBIGene:3433</td>\n",
+       "      <td>biolink:physically_interacts_with</td>\n",
        "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>LIF</td>\n",
+       "      <td>IFIT2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>FAS</th>\n",
+       "      <th>GPX3</th>\n",
        "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:355</td>\n",
-       "      <td>biolink:regulates; biolink:affects</td>\n",
+       "      <td>NCBIGene:2878</td>\n",
+       "      <td>biolink:physically_interacts_with</td>\n",
        "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>FAS</td>\n",
+       "      <td>GPX3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>MEFV</th>\n",
+       "      <th>RETNLB</th>\n",
        "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:4210</td>\n",
-       "      <td>biolink:affects</td>\n",
+       "      <td>NCBIGene:84666</td>\n",
+       "      <td>biolink:interacts_with; biolink:physically_int...</td>\n",
        "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>MEFV</td>\n",
+       "      <td>RETNLB</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>PXDNL</th>\n",
+       "      <th>NT5E</th>\n",
        "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:137902</td>\n",
-       "      <td>biolink:interacts_with</td>\n",
+       "      <td>NCBIGene:4907</td>\n",
+       "      <td>biolink:affects</td>\n",
        "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>PXDNL</td>\n",
+       "      <td>NT5E</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>THBD</th>\n",
+       "      <th>H2BC21</th>\n",
        "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:7056</td>\n",
-       "      <td>biolink:affects; biolink:physically_interacts_...</td>\n",
+       "      <td>NCBIGene:8349</td>\n",
+       "      <td>biolink:physically_interacts_with</td>\n",
        "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>THBD</td>\n",
+       "      <td>H2BC21</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -270,44 +279,44 @@
        "</div>"
       ],
       "text/plain": [
-       "          score      output_node  \\\n",
-       "PDCD1  0.133333    NCBIGene:5133   \n",
-       "TNF    0.133333    NCBIGene:7124   \n",
-       "IL13   0.133333    NCBIGene:3596   \n",
-       "NLRP3  0.133333  NCBIGene:114548   \n",
-       "IL17A  0.133333    NCBIGene:3605   \n",
-       "...         ...              ...   \n",
-       "LIF    0.033333    NCBIGene:3976   \n",
-       "FAS    0.033333     NCBIGene:355   \n",
-       "MEFV   0.033333    NCBIGene:4210   \n",
-       "PXDNL  0.033333  NCBIGene:137902   \n",
-       "THBD   0.033333    NCBIGene:7056   \n",
+       "           score      output_node  \\\n",
+       "PDCD1   0.133333    NCBIGene:5133   \n",
+       "IL17A   0.133333    NCBIGene:3605   \n",
+       "NLRP3   0.133333  NCBIGene:114548   \n",
+       "IL13    0.133333    NCBIGene:3596   \n",
+       "TNF     0.133333    NCBIGene:7124   \n",
+       "...          ...              ...   \n",
+       "IFIT2   0.033333    NCBIGene:3433   \n",
+       "GPX3    0.033333    NCBIGene:2878   \n",
+       "RETNLB  0.033333   NCBIGene:84666   \n",
+       "NT5E    0.033333    NCBIGene:4907   \n",
+       "H2BC21  0.033333    NCBIGene:8349   \n",
        "\n",
-       "                                             predicates1  \\\n",
-       "PDCD1            biolink:interacts_with; biolink:affects   \n",
-       "TNF    biolink:genetically_interacts_with; biolink:in...   \n",
-       "IL13             biolink:interacts_with; biolink:affects   \n",
-       "NLRP3  biolink:affects; biolink:physically_interacts_...   \n",
-       "IL17A            biolink:interacts_with; biolink:affects   \n",
-       "...                                                  ...   \n",
-       "LIF                                      biolink:affects   \n",
-       "FAS                   biolink:regulates; biolink:affects   \n",
-       "MEFV                                     biolink:affects   \n",
-       "PXDNL                             biolink:interacts_with   \n",
-       "THBD   biolink:affects; biolink:physically_interacts_...   \n",
+       "                                              predicates1  \\\n",
+       "PDCD1             biolink:interacts_with; biolink:affects   \n",
+       "IL17A             biolink:interacts_with; biolink:affects   \n",
+       "NLRP3   biolink:physically_interacts_with; biolink:aff...   \n",
+       "IL13              biolink:interacts_with; biolink:affects   \n",
+       "TNF     biolink:genetically_interacts_with; biolink:in...   \n",
+       "...                                                   ...   \n",
+       "IFIT2                   biolink:physically_interacts_with   \n",
+       "GPX3                    biolink:physically_interacts_with   \n",
+       "RETNLB  biolink:interacts_with; biolink:physically_int...   \n",
+       "NT5E                                      biolink:affects   \n",
+       "H2BC21                  biolink:physically_interacts_with   \n",
        "\n",
-       "                                             predicates2 output_node_name  \n",
-       "PDCD1  biolink:related_to; biolink:gene_associated_wi...            PDCD1  \n",
-       "TNF    biolink:gene_associated_with_condition; biolin...              TNF  \n",
-       "IL13   biolink:related_to; biolink:gene_associated_wi...             IL13  \n",
-       "NLRP3  biolink:related_to; biolink:gene_associated_wi...            NLRP3  \n",
-       "IL17A  biolink:related_to; biolink:gene_associated_wi...            IL17A  \n",
-       "...                                                  ...              ...  \n",
-       "LIF               biolink:gene_associated_with_condition              LIF  \n",
-       "FAS               biolink:gene_associated_with_condition              FAS  \n",
-       "MEFV              biolink:gene_associated_with_condition             MEFV  \n",
-       "PXDNL             biolink:gene_associated_with_condition            PXDNL  \n",
-       "THBD              biolink:gene_associated_with_condition             THBD  \n",
+       "                                              predicates2 output_node_name  \n",
+       "PDCD1   biolink:related_to; biolink:gene_associated_wi...            PDCD1  \n",
+       "IL17A   biolink:related_to; biolink:gene_associated_wi...            IL17A  \n",
+       "NLRP3   biolink:related_to; biolink:gene_associated_wi...            NLRP3  \n",
+       "IL13    biolink:related_to; biolink:gene_associated_wi...             IL13  \n",
+       "TNF     biolink:gene_associated_with_condition; biolin...              TNF  \n",
+       "...                                                   ...              ...  \n",
+       "IFIT2              biolink:gene_associated_with_condition            IFIT2  \n",
+       "GPX3               biolink:gene_associated_with_condition             GPX3  \n",
+       "RETNLB             biolink:gene_associated_with_condition           RETNLB  \n",
+       "NT5E               biolink:gene_associated_with_condition             NT5E  \n",
+       "H2BC21             biolink:gene_associated_with_condition           H2BC21  \n",
        "\n",
        "[680 rows x 5 columns]"
       ]
@@ -344,9 +353,180 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "9847dbb5-0d63-496b-8969-4dd330f3897c",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'attributes': [{'attribute_source': 'infores:rtx-kg2',\n",
+       "   'attribute_type_id': 'biolink:original_predicate',\n",
+       "   'description': 'The IDs of the original RTX-KG2pre edge(s) corresponding to this edge prior to any synonymization or remapping.',\n",
+       "   'value': ['UMLS:C0017337---SEMMEDDB:interacts_with---None---None---None---UMLS:C0021745---SEMMEDDB:',\n",
+       "    'UMLS:C0017337---SEMMEDDB:interacts_with---None---None---None---UMLS:C1334085---SEMMEDDB:'],\n",
+       "   'value_type_id': 'metatype:String'},\n",
+       "  {'attribute_source': 'infores:rtx-kg2',\n",
+       "   'attribute_type_id': 'biolink:knowledge_level',\n",
+       "   'value': 'prediction'},\n",
+       "  {'attribute_source': 'infores:rtx-kg2',\n",
+       "   'attribute_type_id': 'biolink:publications',\n",
+       "   'value': ['PMID:25676343',\n",
+       "    'PMID:3934559',\n",
+       "    'PMID:35453072',\n",
+       "    'PMID:36111345'],\n",
+       "   'value_type_id': 'biolink:Uriorcurie'},\n",
+       "  {'attribute_source': 'infores:rtx-kg2',\n",
+       "   'attribute_type_id': 'biolink:agent_type',\n",
+       "   'value': 'text_mining_agent'},\n",
+       "  {'attribute_source': 'infores:rtx-kg2',\n",
+       "   'attribute_type_id': 'biolink:supporting_text',\n",
+       "   'value': {'PMID:25676343': {'object score': '851',\n",
+       "     'publication date': '2015 Mar',\n",
+       "     'sentence': 'OBJECTIVES: Atypical familial mycobacteriosis (AFM, OMIM #209950) is caused by mutations in genes regulating IL12/IFNG pathway.',\n",
+       "     'subject score': '1000'},\n",
+       "    'PMID:35453072': {'object score': '1000',\n",
+       "     'publication date': '2022 Apr 18',\n",
+       "     'sentence': 'Innate immune responses were attenuated reflected by decreased expression of genes involved in interferon-gamma, leukocyte migration and neutrophil mediated immune response in convalescent COVID-19 patients.',\n",
+       "     'subject score': '1000'},\n",
+       "    'PMID:36111345': {'object score': '901',\n",
+       "     'publication date': '2022',\n",
+       "     'sentence': 'Differentially expressed genes (DEGs) between groups were enriched in allograft rejection, hypoxia, glycolysis, TNFalpha signaling    via     NF-kappaB, and interferon-gamma responses via Gene set enrichment analysis (GSEA).',\n",
+       "     'subject score': '790'},\n",
+       "    'PMID:3934559': {'object score': '1000',\n",
+       "     'publication date': '1985 Nov 21-27',\n",
+       "     'sentence': 'This gene controls a function or a product necessary for the action of gamma-interferon on class II genes.',\n",
+       "     'subject score': '1000'}}}],\n",
+       " 'object': 'NCBIGene:3458',\n",
+       " 'predicate': 'biolink:interacts_with',\n",
+       " 'sources': [{'resource_id': 'infores:semmeddb',\n",
+       "   'resource_role': 'primary_knowledge_source'},\n",
+       "  {'resource_id': 'infores:rtx-kg2',\n",
+       "   'resource_role': 'aggregator_knowledge_source',\n",
+       "   'upstream_resource_ids': ['infores:semmeddb']}],\n",
+       " 'subject': 'UMLS:C0017337'}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(formatted_output['knowledge_graph']['edges'].values())[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "c2ac8ad6-218f-4805-8c72-8cf21df4ac7c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['12026540',\n",
+       " '13008760',\n",
+       " '13008998',\n",
+       " '1451206',\n",
+       " '25948818',\n",
+       " '12006224',\n",
+       " '12009572',\n",
+       " '12011840',\n",
+       " '12014211',\n",
+       " '12034620']"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(formatted_output['auxiliary_graphs'].values())[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "94377dae-4f2a-4cc4-bdc0-8d7c380ed152",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'resource_id': 'infores:semmeddb',\n",
+       "  'resource_role': 'primary_knowledge_source'},\n",
+       " {'resource_id': 'infores:rtx-kg2',\n",
+       "  'resource_role': 'aggregator_knowledge_source',\n",
+       "  'upstream_resource_ids': ['infores:semmeddb']}]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "formatted_output['knowledge_graph']['edges']['12026540']['sources']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "56886736-9e0a-4e9a-9773-308447fcce3a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'resource_id': 'infores:semmeddb',\n",
+       "  'resource_role': 'primary_knowledge_source'},\n",
+       " {'resource_id': 'infores:rtx-kg2',\n",
+       "  'resource_role': 'aggregator_knowledge_source',\n",
+       "  'upstream_resource_ids': ['infores:semmeddb']}]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "formatted_output['knowledge_graph']['edges']['13008760']['sources']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "8540fc14-dd13-444f-b9d9-afae72b31b49",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'attributes': None,\n",
+       " 'path_bindings': {'p0': [{'id': 'aux_1_UMLS:C0017337'}]},\n",
+       " 'resource_id': 'infores:tct',\n",
+       " 'score': 0.14285714285714285,\n",
+       " 'scoring_method': None,\n",
+       " 'support_graphs': None}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "formatted_output['results'][0]['analyses'][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6ba8e09-c1aa-4010-86cf-722bc9c27058",
+   "metadata": {},
    "outputs": [],
    "source": []
   }

From 43df311158a789dc148f8ff6cf012ec716354ec3 Mon Sep 17 00:00:00 2001
From: yjzhang <yjzhang@cs.washington.edu>
Date: Thu, 30 Apr 2026 18:20:54 -0700
Subject: [PATCH 3/7] add nodes to TCT_pathfinder

---
 TCT/TCT_pathfinder.py          |  76 ++++--
 TCT/name_resolver.py           |  49 +++-
 notebooks/Pathfinder_new.ipynb | 441 +++++++++++++++++----------------
 3 files changed, 326 insertions(+), 240 deletions(-)

diff --git a/TCT/TCT_pathfinder.py b/TCT/TCT_pathfinder.py
index eff8128..435a125 100644
--- a/TCT/TCT_pathfinder.py
+++ b/TCT/TCT_pathfinder.py
@@ -143,36 +143,59 @@ def generate_score_results(results, method='infores'):
 
 def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dict, result2:dict,
         start_node_categories=None, end_node_categories=None,
+        get_node_info=True,
         scoring_method='infores'):
     """
     Converts the results of two TRAPI queries into the same general json format as the other pathfinder APIs.
     scoring_method is how the node scores are generated, and could be 'infores' or 'edges'.
     """
-    # TODO: parse results...
     # nodes
+    # TODO: get some node info? node attributes
+    node_info = {}
     # edges is a dict of intermediate nodes
     intermediate_node_edges = {}
     for k, v in result1.items():
         i1 = v['subject']
         i2 = v['object']
+        s_o = 'object'
         if i1 == start_node_id:
             intermediate_node_id = i2
+            s_o = 'object'
         elif i2 == start_node_id:
             intermediate_node_id = i1
+            s_o = 'subject'
         else:
             continue
         if (i1 == start_node_id or i2 == start_node_id) and intermediate_node_id in intermediate_node_edges:
             intermediate_node_edges[intermediate_node_id].append((k, v))
         else:
             intermediate_node_edges[intermediate_node_id] = [(k, v)]
+        # add node dict
+        if intermediate_node_id not in node_info:
+            node_dict = {
+            }
+            node_info[intermediate_node_id] = node_dict
+        else:
+            node_dict = node_info[intermediate_node_id]
+        for attribute in v['attributes']:
+            if attribute['attribute_type_id'] == f'{s_o}_category':
+                if 'categories' not in node_dict:
+                    node_dict['categories'] = set([attribute['value']])
+                else:
+                    node_dict['categories'].add(attribute['value'])
+            if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict:
+                node_dict['name'] = attribute['value']
+        node_info[intermediate_node_id] = node_dict
     connecting_intermediate_nodes = {}
     for k, v in result2.items():
         i1 = v['subject']
         i2 = v['object']
         if i1 == end_node_id:
             intermediate_node_id = i2
+            s_o = 'object'
         elif i2 == end_node_id:
             intermediate_node_id = i1
+            s_o = 'subject'
         else:
             continue
         if (i1 == end_node_id or i2 == end_node_id) and intermediate_node_id in intermediate_node_edges:
@@ -180,6 +203,24 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
                 connecting_intermediate_nodes[intermediate_node_id]['e2'].append((k, v))
             else:
                 connecting_intermediate_nodes[intermediate_node_id] = {'e1': intermediate_node_edges[intermediate_node_id], 'e2' : [(k, v)]}
+        if intermediate_node_id not in node_info:
+            node_dict = {
+            }
+            node_info[intermediate_node_id] = node_dict
+        else:
+            node_dict = node_info[intermediate_node_id]
+        for attribute in v['attributes']:
+            if attribute['attribute_type_id'] == f'{s_o}_category':
+                if 'categories' not in node_dict:
+                    node_dict['categories'] = set([attribute['value']])
+                else:
+                    node_dict['categories'].add(attribute['value'])
+            if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict:
+                node_dict['name'] = attribute['value']
+        node_info[intermediate_node_id] = node_dict
+    for k, v in node_info.items():
+        if 'categories' in v:
+            v['categories'] = list(v['categories'])
     all_edges = {}
     all_auxiliary_graphs = {}
     i = 1
@@ -198,7 +239,8 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
     # generate output json
     output = {
         'query_graph': build_query_graph(start_node_id, end_node_id, start_node_categories, end_node_categories),
-        'knowledge_graph': {'nodes': {x: {} for x in connection_counts.keys()},
+        # TODO: don't drop the nodes
+        'knowledge_graph': {'nodes': {x: node_info[x] for x in connection_counts.keys()},
                             'edges': all_edges,
                            },
         'results': [{'analyses': []}],
@@ -206,6 +248,16 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
     }
     graph_scores, graph_scores_formatted = generate_score_results(output, method=scoring_method)
     output['results'][0]['analyses'] = graph_scores_formatted
+    if get_node_info:
+        from .node_normalizer import get_normalized_nodes
+        nodes_to_add = []
+        for k, v in output['knowledge_graph']['nodes'].items():
+            if 'name' not in v or 'categories' not in v:
+                nodes_to_add.append(k)
+        normalized_nodes = get_normalized_nodes(nodes_to_add, mode='post')
+        for node_id in nodes_to_add:
+            nn = normalized_nodes[node_id]
+            output['knowledge_graph']['nodes'][node_id] = {'name': nn.label, 'categories': nn.types}
     return output
 
 
@@ -257,27 +309,13 @@ def pathfinder(input_node1_id:str, input_node2_id:str,
                                 APInames=APInames,
                                 API_predicates=API_predicates,
                                 max_workers=len(sele_APIs2))
-
-    result_parsed1 = parse_KG(result1)
-        # Step 7: Ranking the results. This ranking method is based on the number of unique
-        # primary infores. It can only be used to rank the results with one defined node.
-    result_ranked_by_primary_infores1 = rank_by_primary_infores(result_parsed1, input_node1_id)   # input_node1_id is the curie id of the
-
-    result_parsed2 = parse_KG(result2)
-    result_ranked_by_primary_infores2 = rank_by_primary_infores(result_parsed2, input_node2_id)   # input_node2_id is the curie id of the
-
-    possible_paths = len(set(result_ranked_by_primary_infores1['output_node']).intersection(set(result_ranked_by_primary_infores2['output_node'])))
-    print("Number of possible paths: ", possible_paths)
-
-    paths = merge_ranking_by_number_of_infores(result_ranked_by_primary_infores1, result_ranked_by_primary_infores2,
-            plot=False)
-
     output = parse_results_for_pathfinder(input_node1_id, input_node2_id, result1, result2,
             start_node_categories=input_node1_category,
             end_node_categories=input_node2_category,
-            scoring_method=scoring_method)
+            scoring_method=scoring_method,
+            get_node_info=True)
 
-    return result1, result2, output, paths
+    return result1, result2, output
 
 
 
diff --git a/TCT/name_resolver.py b/TCT/name_resolver.py
index 387698e..cf78e6a 100644
--- a/TCT/name_resolver.py
+++ b/TCT/name_resolver.py
@@ -72,14 +72,14 @@ def lookup(query: str, return_top_response:bool=True, return_synonyms:bool=False
         raise requests.RequestException('Response from server had error, code ' + str(response.status_code) + ' ' + str(response))
 
 
-def synonyms(query: str, **kwargs):
+def synonyms(query: str|list, **kwargs):
     """
-    A wrapper around the `synonyms` api endpoint. Given a list of CURIEs, this returns a dict of CURIE id : TranslatorNode for all synonyms for the given query.
+    A wrapper around the `synonyms` api endpoint. Given a CURIE or a list of CURIEs, this returns a dict of CURIE id : TranslatorNode for all synonyms for the given query.
 
     Parameters
     ----------
-    query : str
-        Query CURIE
+    query : str|list
+        Query CURIE or list of CURIEs
     **kwargs
         Other arguments to `synonyms`
 
@@ -93,7 +93,7 @@ def synonyms(query: str, **kwargs):
     if response.status_code == 200:
         result = response.json()
         if len(result) == 0:
-            raise LookupError('No matching CURIE found for the given string ' + query)
+            raise LookupError('No matching CURIE found for the given string ' + str(query))
         else:
             all_nodes = {}
             for k, node in result.items():
@@ -172,3 +172,42 @@ def batch_lookup(strings:list[str], size: int=25, return_top_response:bool=True,
         else:
             raise requests.RequestException('Response from server had error, code ' + str(response.status_code) + ' ' + str(response))
     return curies
+
+
+def batch_synonyms(strings:list[str], size:int=50, **kwargs) -> dict:
+    """
+    A wrapper around the `synonyms` API endpoint, using POST. Given a list of CURIEs, this returns a dict of CURIE:TranslatorNode, where each TranslatorNode contains all synonyms for the given CURIE.
+
+    Parameters
+    ----------
+    strings : list[str]
+        List of CURIEs.
+    size : int
+        Desired chunking size, default is 50.
+    **kwargs
+        Extra fields added to the JSON body of every POST request.
+
+    Returns
+    -------
+    Dict of CURIE : TranslatorNode (None where the server returned an empty entry for that CURIE)
+
+    Raises
+    ------
+    LookupError if the response for a chunk is empty; requests.RequestException if the status code is not 200.
+    """
+    chunks = chunk_list(strings, size)
+    path = urllib.parse.urljoin(URL, 'synonyms')
+    curies = {}
+    for chunk in chunks:
+        response = requests.post(path, json={'preferred_curies': chunk, **kwargs})
+        if response.status_code != 200:
+            raise requests.RequestException('Response from server had error, code ' + str(response.status_code) + ' ' + str(response))
+        result = response.json()
+        if len(result) == 0:
+            # Name only the chunk that came back empty, not every chunk of the input.
+            raise LookupError('No matching CURIE found for the given CURIEs ' + str(chunk))
+        for k, node in result.items():
+            curies[k] = TranslatorNode.from_dict(node, return_synonyms=True) if node else None
+    return curies
+
+
diff --git a/notebooks/Pathfinder_new.ipynb b/notebooks/Pathfinder_new.ipynb
index 6e33b5f..c4c1db1 100644
--- a/notebooks/Pathfinder_new.ipynb
+++ b/notebooks/Pathfinder_new.ipynb
@@ -108,33 +108,24 @@
      "output_type": "stream",
      "text": [
       "MONDO:0100096\n",
-      "CATRAX BigGIM DrugResponse Performance Phase KP - TRAPI 1.5.0: Success!\n",
-      "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n",
       "Automat-genome-alliance(Trapi v1.5.0): Success!\n",
+      "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n",
+      "CATRAX BigGIM DrugResponse Performance Phase KP - TRAPI 1.5.0: Success!\n",
       "RTX KG2 - TRAPI 1.5.0: Success!\n",
       "Automat-cam-kp(Trapi v1.5.0): Success!\n",
       "Automat-hetionet(Trapi v1.5.0): Success!\n",
+      "Automat-robokop(Trapi v1.5.0): Success!\n",
       "Service Provider TRAPI: Success!\n",
       "BioThings Explorer (BTE) TRAPI: Success!\n",
-      "Automat-robokop(Trapi v1.5.0): Success!\n",
       "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n",
       "Clinical Trials KP - TRAPI 1.5.0: Success!\n",
       "Microbiome KP - TRAPI 1.5.0: Success!\n",
-      "RTX KG2 - TRAPI 1.5.0: Success!\n",
-      "ENSEMBL:ENSP00000423463: no preferred name\n",
-      "ENSEMBL:ENSP00000423463: no preferred name\n",
-      "UniProtKB:P01308-1: no preferred name\n",
-      "UniProtKB:P01308-1: no preferred name\n",
-      "UniProtKB:P12544-1: no preferred name\n",
-      "UniProtKB:P12544-1: no preferred name\n",
-      "NodeNorm does not know about these identifiers: NCIT:C16612,UniProtKB:P05113-1,UniProtKB:P05113-1\n",
-      "NodeNorm does not know about these identifiers: UMLS:C5911035,UMLS:C5943250,UMLS:C5959582,UMLS:C5908975,UMLS:C5926549,UMLS:C5943245,CHEMBL.TARGET:CHEMBL1697664,CHEMBL.TARGET:CHEMBL6188,CHEMBL.TARGET:CHEMBL4295579,CHEMBL.TARGET:CHEMBL1075031,CHEMBL.TARGET:CHEMBL4295609,CHEMBL.TARGET:CHEMBL1287621,CHEMBL.TARGET:CHEMBL3499,CHEMBL.TARGET:CHEMBL4295625,CHEMBL.TARGET:CHEMBL4295542,CHEMBL.TARGET:CHEMBL1741198,CHEMBL.TARGET:CHEMBL3309030,CHEMBL.TARGET:CHEMBL4882,CHEMBL.TARGET:CHEMBL3309037\n",
-      "Number of possible paths:  680\n"
+      "RTX KG2 - TRAPI 1.5.0: Success!\n"
      ]
     }
    ],
    "source": [
-    "result1, result2, formatted_output, paths = TCT_pathfinder.pathfinder(input_node1_id='NCBIGene:3458', #IFNG\n",
+    "result1, result2, formatted_output = TCT_pathfinder.pathfinder(input_node1_id='NCBIGene:3458', #IFNG\n",
     "        input_node2_id= 'MONDO:0100096', #COVID-19\n",
     "        intermediate_categories=['biolink:Protein', 'biolink:Gene' ], \n",
     "        APInames=select_APIs, \n",
@@ -153,172 +144,13 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "c1e3f672-a900-4e45-b197-8874a8f44c32",
+   "id": "fcfb4342-ed1b-401d-8ffa-862978f0c275",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>score</th>\n",
-       "      <th>output_node</th>\n",
-       "      <th>predicates1</th>\n",
-       "      <th>predicates2</th>\n",
-       "      <th>output_node_name</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>PDCD1</th>\n",
-       "      <td>0.133333</td>\n",
-       "      <td>NCBIGene:5133</td>\n",
-       "      <td>biolink:interacts_with; biolink:affects</td>\n",
-       "      <td>biolink:related_to; biolink:gene_associated_wi...</td>\n",
-       "      <td>PDCD1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>IL17A</th>\n",
-       "      <td>0.133333</td>\n",
-       "      <td>NCBIGene:3605</td>\n",
-       "      <td>biolink:interacts_with; biolink:affects</td>\n",
-       "      <td>biolink:related_to; biolink:gene_associated_wi...</td>\n",
-       "      <td>IL17A</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>NLRP3</th>\n",
-       "      <td>0.133333</td>\n",
-       "      <td>NCBIGene:114548</td>\n",
-       "      <td>biolink:physically_interacts_with; biolink:aff...</td>\n",
-       "      <td>biolink:related_to; biolink:gene_associated_wi...</td>\n",
-       "      <td>NLRP3</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>IL13</th>\n",
-       "      <td>0.133333</td>\n",
-       "      <td>NCBIGene:3596</td>\n",
-       "      <td>biolink:interacts_with; biolink:affects</td>\n",
-       "      <td>biolink:related_to; biolink:gene_associated_wi...</td>\n",
-       "      <td>IL13</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>TNF</th>\n",
-       "      <td>0.133333</td>\n",
-       "      <td>NCBIGene:7124</td>\n",
-       "      <td>biolink:genetically_interacts_with; biolink:in...</td>\n",
-       "      <td>biolink:gene_associated_with_condition; biolin...</td>\n",
-       "      <td>TNF</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>IFIT2</th>\n",
-       "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:3433</td>\n",
-       "      <td>biolink:physically_interacts_with</td>\n",
-       "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>IFIT2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>GPX3</th>\n",
-       "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:2878</td>\n",
-       "      <td>biolink:physically_interacts_with</td>\n",
-       "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>GPX3</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>RETNLB</th>\n",
-       "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:84666</td>\n",
-       "      <td>biolink:interacts_with; biolink:physically_int...</td>\n",
-       "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>RETNLB</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>NT5E</th>\n",
-       "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:4907</td>\n",
-       "      <td>biolink:affects</td>\n",
-       "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>NT5E</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>H2BC21</th>\n",
-       "      <td>0.033333</td>\n",
-       "      <td>NCBIGene:8349</td>\n",
-       "      <td>biolink:physically_interacts_with</td>\n",
-       "      <td>biolink:gene_associated_with_condition</td>\n",
-       "      <td>H2BC21</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>680 rows × 5 columns</p>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "           score      output_node  \\\n",
-       "PDCD1   0.133333    NCBIGene:5133   \n",
-       "IL17A   0.133333    NCBIGene:3605   \n",
-       "NLRP3   0.133333  NCBIGene:114548   \n",
-       "IL13    0.133333    NCBIGene:3596   \n",
-       "TNF     0.133333    NCBIGene:7124   \n",
-       "...          ...              ...   \n",
-       "IFIT2   0.033333    NCBIGene:3433   \n",
-       "GPX3    0.033333    NCBIGene:2878   \n",
-       "RETNLB  0.033333   NCBIGene:84666   \n",
-       "NT5E    0.033333    NCBIGene:4907   \n",
-       "H2BC21  0.033333    NCBIGene:8349   \n",
-       "\n",
-       "                                              predicates1  \\\n",
-       "PDCD1             biolink:interacts_with; biolink:affects   \n",
-       "IL17A             biolink:interacts_with; biolink:affects   \n",
-       "NLRP3   biolink:physically_interacts_with; biolink:aff...   \n",
-       "IL13              biolink:interacts_with; biolink:affects   \n",
-       "TNF     biolink:genetically_interacts_with; biolink:in...   \n",
-       "...                                                   ...   \n",
-       "IFIT2                   biolink:physically_interacts_with   \n",
-       "GPX3                    biolink:physically_interacts_with   \n",
-       "RETNLB  biolink:interacts_with; biolink:physically_int...   \n",
-       "NT5E                                      biolink:affects   \n",
-       "H2BC21                  biolink:physically_interacts_with   \n",
-       "\n",
-       "                                              predicates2 output_node_name  \n",
-       "PDCD1   biolink:related_to; biolink:gene_associated_wi...            PDCD1  \n",
-       "IL17A   biolink:related_to; biolink:gene_associated_wi...            IL17A  \n",
-       "NLRP3   biolink:related_to; biolink:gene_associated_wi...            NLRP3  \n",
-       "IL13    biolink:related_to; biolink:gene_associated_wi...             IL13  \n",
-       "TNF     biolink:gene_associated_with_condition; biolin...              TNF  \n",
-       "...                                                   ...              ...  \n",
-       "IFIT2              biolink:gene_associated_with_condition            IFIT2  \n",
-       "GPX3               biolink:gene_associated_with_condition             GPX3  \n",
-       "RETNLB             biolink:gene_associated_with_condition           RETNLB  \n",
-       "NT5E               biolink:gene_associated_with_condition             NT5E  \n",
-       "H2BC21             biolink:gene_associated_with_condition           H2BC21  \n",
-       "\n",
-       "[680 rows x 5 columns]"
+       "dict_keys(['query_graph', 'knowledge_graph', 'results', 'auxiliary_graphs'])"
       ]
      },
      "execution_count": 5,
@@ -327,19 +159,59 @@
     }
    ],
    "source": [
-    "paths"
+    "formatted_output.keys()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "fcfb4342-ed1b-401d-8ffa-862978f0c275",
+   "id": "9847dbb5-0d63-496b-8969-4dd330f3897c",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "dict_keys(['query_graph', 'knowledge_graph', 'results', 'auxiliary_graphs'])"
+       "{'sources': [{'resource_id': 'infores:automat-cam-kp',\n",
+       "   'resource_role': 'aggregator_knowledge_source',\n",
+       "   'upstream_resource_ids': ['infores:cam-kp']},\n",
+       "  {'resource_id': 'infores:ctd',\n",
+       "   'resource_role': 'primary_knowledge_source',\n",
+       "   'upstream_resource_ids': []},\n",
+       "  {'resource_id': 'infores:cam-kp',\n",
+       "   'resource_role': 'aggregator_knowledge_source',\n",
+       "   'upstream_resource_ids': ['infores:ctd']}],\n",
+       " 'qualifiers': [],\n",
+       " 'predicate': 'biolink:affects',\n",
+       " 'attributes': [{'attribute_type_id': 'biolink:xref',\n",
+       "   'value': ['http://ctdbase.org/detail.go?type=relationship&ixnId=4118485',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=3478223',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=8453602',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=2683440',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=7638084',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=2746570',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=3269154',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=6583144',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=2763529',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=2691105',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=2749488',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=4935193',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=2763531',\n",
+       "    'http://ctdbase.org/detail.go?type=relationship&ixnId=8453590'],\n",
+       "   'original_attribute_name': 'xref'},\n",
+       "  {'attribute_type_id': 'biolink:knowledge_level',\n",
+       "   'value': 'knowledge_assertion',\n",
+       "   'original_attribute_name': 'knowledge_level'},\n",
+       "  {'attribute_type_id': 'biolink:agent_type',\n",
+       "   'value': 'manual_agent',\n",
+       "   'original_attribute_name': 'agent_type'},\n",
+       "  {'attribute_type_id': 'biolink:original_object',\n",
+       "   'value': 'NCBIGene:7124',\n",
+       "   'original_attribute_name': 'original_object'},\n",
+       "  {'attribute_type_id': 'biolink:original_subject',\n",
+       "   'value': 'NCBIGene:3458',\n",
+       "   'original_attribute_name': 'original_subject'}],\n",
+       " 'subject': 'NCBIGene:3458',\n",
+       " 'object': 'NCBIGene:7124'}"
       ]
      },
      "execution_count": 6,
@@ -348,14 +220,83 @@
     }
    ],
    "source": [
-    "formatted_output.keys()"
+    "list(formatted_output['knowledge_graph']['edges'].values())[0]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 7,
-   "id": "9847dbb5-0d63-496b-8969-4dd330f3897c",
+   "id": "c2ac8ad6-218f-4805-8c72-8cf21df4ac7c",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['19a467d8',\n",
+       " 'e_222',\n",
+       " '6ab7f02f',\n",
+       " '82968adc',\n",
+       " '978f6ce1',\n",
+       " '3571016d',\n",
+       " '119d5072',\n",
+       " '73f57667f20d4136d56ce1c56180ed4d',\n",
+       " '64df3ca7c1bb4570261ba9a485ab6ea2',\n",
+       " '5866d665014fe74cbd4e9d8b5744d6d0',\n",
+       " '9983f5c2918b0090337d2d597305d8c2',\n",
+       " 'd15478a27e98d66ab143989bd157dce5',\n",
+       " '946805402aa3b12a8a80e5c8ef4833cd',\n",
+       " 'a846d99d439b616b0e14a474f8254349',\n",
+       " '710009b429dc63bbd2c32329e1df16ec',\n",
+       " 'b5c61c0cce4e6273234ff58e111c06a2',\n",
+       " '26c765858975b8618f2184804a19ba05',\n",
+       " '3f01ede548609170f5828b6ffaf7d972',\n",
+       " 'e82906ca6f38a7ead24a067a3e435b38',\n",
+       " '0169afa0ab05defb81aafa9fce39e242',\n",
+       " '58d466d9f807a877150013e0691a25fc',\n",
+       " 'b21949aec9d249b6da58d3c67c8df73e',\n",
+       " 'd1e2ef5eec04efbc499889be6ce1eea7',\n",
+       " '4e5d8e79b1308e3ae15fb55298536387',\n",
+       " 'bf585df1c25702eb7449d34671891147',\n",
+       " '60975f91744b9f127a59a6322c2f7cb7',\n",
+       " '665990a36b9cba77e5e6d6645b19bbc3',\n",
+       " '78e8ccbbb83c35759418c71decafc356',\n",
+       " '109a1f3bfa18c2975ea82a36c31255e8',\n",
+       " '3c660b48a0849cc783ad03cbfded02ab',\n",
+       " '3f0b12128de418c82724fefb0452b2eb',\n",
+       " '3a5203d87d6f9e6f584b9843155aac05',\n",
+       " '28d7f20f51ddf666407ab90ae87bc55b',\n",
+       " 'e02741c698e72353f95c4a529ba01438',\n",
+       " 'f35a81cd97153cce97d982d7710b558c',\n",
+       " '712ff9a3be50edc0901d3e02757589f2',\n",
+       " 'fbe597bca092f6478afb3d2f8f31ee86',\n",
+       " '34d8d05a6d2bdd0541f72690b8f2f6e9',\n",
+       " 'd6f6af8c653dd95f918f99e15b4c0b0a',\n",
+       " '84e026bbdab961e56f2bdf6f30f9486b',\n",
+       " 'bb54be6d72593ab41809f3c7294a31a7',\n",
+       " '7b393fa1d78193a0e6e05fd1b67f6352',\n",
+       " '10e45932c3590ec64fd0dce9f7f69468',\n",
+       " '86de16e01a16264dcae011be9ba31b2e',\n",
+       " 'e97a340841d8f91890e6d10fb8def2d1',\n",
+       " '45319800',\n",
+       " '8155745']"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(formatted_output['auxiliary_graphs'].values())[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "e63554d9-5d04-4636-8c37-3e7aaf2ab641",
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "data": {
@@ -407,43 +348,13 @@
        " 'subject': 'UMLS:C0017337'}"
       ]
      },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "list(formatted_output['knowledge_graph']['edges'].values())[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "c2ac8ad6-218f-4805-8c72-8cf21df4ac7c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['12026540',\n",
-       " '13008760',\n",
-       " '13008998',\n",
-       " '1451206',\n",
-       " '25948818',\n",
-       " '12006224',\n",
-       " '12009572',\n",
-       " '12011840',\n",
-       " '12014211',\n",
-       " '12034620']"
-      ]
-     },
      "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "list(formatted_output['auxiliary_graphs'].values())[0]"
+    "formatted_output['knowledge_graph']['edges']['12026540']"
    ]
   },
   {
@@ -498,7 +409,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "id": "8540fc14-dd13-444f-b9d9-afae72b31b49",
    "metadata": {},
    "outputs": [
@@ -506,14 +417,14 @@
      "data": {
       "text/plain": [
        "{'attributes': None,\n",
-       " 'path_bindings': {'p0': [{'id': 'aux_1_UMLS:C0017337'}]},\n",
+       " 'path_bindings': {'p0': [{'id': 'aux_1_NCBIGene:7124'}]},\n",
        " 'resource_id': 'infores:tct',\n",
-       " 'score': 0.14285714285714285,\n",
+       " 'score': 0.7391304347826086,\n",
        " 'scoring_method': None,\n",
        " 'support_graphs': None}"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -524,9 +435,107 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "id": "a6ba8e09-c1aa-4010-86cf-722bc9c27058",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'attributes': [{'attribute_source': 'infores:catrax-pharmacogenomics',\n",
+       "   'attribute_type_id': 'biolink:publications',\n",
+       "   'value': 'PMID:36243968|PMID:9397972',\n",
+       "   'value_type_id': 'biolink:Uriorcurie'},\n",
+       "  {'attribute_type_id': 'knowledge_source', 'value': 'SIGNOR-267487'},\n",
+       "  {'attribute_type_id': 'subject_category', 'value': 'biolink:Gene'},\n",
+       "  {'attribute_source': 'infores:catrax-pharmacogenomics',\n",
+       "   'attribute_type_id': 'biolink:knowledge_level',\n",
+       "   'value': 'knowledge_assertion'},\n",
+       "  {'attribute_type_id': 'object_name', 'value': 'DIO1'},\n",
+       "  {'attribute_type_id': 'subject_name', 'value': 'IFNG'},\n",
+       "  {'attribute_source': 'infores:catrax-pharmacogenomics',\n",
+       "   'attribute_type_id': 'biolink:agent_type',\n",
+       "   'value': 'automated_agent'},\n",
+       "  {'attribute_type_id': 'object_category', 'value': 'biolink:Gene'},\n",
+       "  {'attribute_type_id': 'provided_by', 'value': 'SIGNOR'}],\n",
+       " 'object': 'NCBIGene:1733',\n",
+       " 'predicate': 'biolink:regulates',\n",
+       " 'qualifiers': [{'qualifier_type_id': 'biolink:object_direction_qualifier',\n",
+       "   'qualifier_value': 'downregulated'}],\n",
+       " 'sources': [{'resource_id': 'infores:catrax-pharmacogenomics',\n",
+       "   'resource_role': 'primary_knowledge_source'},\n",
+       "  {'resource_id': 'N/A', 'resource_role': 'supporting_data_source'}],\n",
+       " 'subject': 'NCBIGene:3458'}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result1['01a09566-2a29-5eae-9160-e44a18241c09']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "baa9a580-c3e8-4b27-834f-d7c5c1c297d8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "6822"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(result1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "ad5eaf3e-205d-43cb-8c53-d14b489ba5a5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2658"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(result2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "51fe55e0-5871-4714-ae46-8fe0647a6c99",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "with open('result.json', 'w') as f:\n",
+    "    json.dump(formatted_output, f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6460b38-8bab-4247-b693-ef90c5a4e3d7",
+   "metadata": {},
    "outputs": [],
    "source": []
   }

From 384c56a3e8ae39edafcbd2161f8cf8908a4c25fa Mon Sep 17 00:00:00 2001
From: yjzhang <yjzhang@cs.washington.edu>
Date: Thu, 30 Apr 2026 18:37:43 -0700
Subject: [PATCH 4/7] start on neighborhood finder

---
 TCT/neighborhood_finder.py | 324 +++++++++++++++++++++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100644 TCT/neighborhood_finder.py

diff --git a/TCT/neighborhood_finder.py b/TCT/neighborhood_finder.py
new file mode 100644
index 0000000..5310794
--- /dev/null
+++ b/TCT/neighborhood_finder.py
@@ -0,0 +1,324 @@
+
+
+def format_query_json_for_pathfinder(subject_ids, object_ids=None,
+        subject_categories=None,
+        object_categories=None,
+        predicates=None):
+    '''
+    Builds a query dict of the form {"message": {"query_graph": ...}} with one edge "e00" from node "n00" (subject) to node "n01" (object).
+    subject_ids and predicates are always placed in the graph; object_ids, subject_categories and object_categories are only added when non-empty ("n01" keeps "categories": [] otherwise).
+    Example input:
+    subject_ids = ["NCBIGene:3845"], object_ids = []
+    subject_categories = ["biolink:Gene"], object_categories = ["biolink:Gene"]
+    predicates = ["biolink:positively_correlated_with", "biolink:physically_interacts_with"]
+    '''
+    query_json_temp = {
+        "message": {
+            "query_graph": {
+
+                "edges": {
+                    "e00": {
+                        "subject": "n00",
+                        "object": "n01",
+                        "predicates": predicates
+                        }
+                    },
+                "nodes": {
+                    "n00": {
+                        "ids":subject_ids, # required
+                        #"categories":[] # optional; the key is only added below when subject_categories is non-empty
+                        },
+                    "n01": {
+                        #"ids":[],
+                        "categories":[] # required
+                        }
+                    }
+                }
+            }
+        }
+
+    if len(subject_ids) > 0:
+        query_json_temp["message"]["query_graph"]["nodes"]["n00"]["ids"] = subject_ids
+
+    if object_ids is not None and len(object_ids) > 0:
+        query_json_temp["message"]["query_graph"]["nodes"]["n01"]["ids"] = object_ids
+
+    if subject_categories is not None and len(subject_categories) > 0:
+        query_json_temp["message"]["query_graph"]["nodes"]["n00"]["categories"] = subject_categories
+
+    if object_categories is not None and len(object_categories) > 0:
+        query_json_temp["message"]["query_graph"]["nodes"]["n01"]["categories"] = object_categories
+
+    if predicates is not None and len(predicates) > 0:
+        query_json_temp["message"]["query_graph"]["edges"]["e00"]["predicates"] = predicates
+
+    return query_json_temp
+
+
+def build_query_graph(start_node_id, end_node_id, start_node_categories=None, end_node_categories=None):
+    """
+    Returns a query graph dict with nodes "sn" (ids=[start_node_id]) and "on" (ids=[end_node_id]) and one path "p0" from "sn" to "on"; start_node_categories and end_node_categories are lists of categories.
+    """
+    q = {
+            "nodes": {
+                "on": {
+                    "categories": end_node_categories,
+                    "constraints": [],
+                    "ids": [
+                        end_node_id
+                    ],
+                    "is_set": False,
+                    "option_group_id": None,
+                    "set_id": None,
+                    "set_interpretation": "BATCH"
+                },
+                "sn": {
+                    "categories": start_node_categories,
+                    "constraints": [],
+                    "ids": [
+                        start_node_id
+                    ],
+                    "is_set": False,
+                    "option_group_id": None,
+                    "set_id": None,
+                    "set_interpretation": "BATCH"
+                }
+            },
+            "paths": {
+                "p0": {
+                    "constraints": None,
+                    "object": "on",
+                    "predicates": None,
+                    "subject": "sn"
+                }
+            }
+        }
+    return q
+
+
+def generate_score_results(results, method='infores'):
+    """
+    Generates a score dict, and a list of "analyses".
+    method can be 'infores' or 'edges'
+
+    With 'infores', a graph's raw score is the number of distinct `resource_id`
+    values in the `sources` of its edges; otherwise it is the number of edge ids.
+    Raw scores are divided by the largest raw score; if that maximum is 0, every
+    normalized score is 0.0 (instead of failing with ZeroDivisionError).
+
+    Returns a tuple (dict of auxiliary graph id : normalized score, list of
+    analysis dicts, one per auxiliary graph, in the same order).
+    """
+    raw_scores = {}
+    for k, graph in results['auxiliary_graphs'].items():
+        if method == 'infores':
+            sources = set()
+            for edge_index in graph:
+                edge = results['knowledge_graph']['edges'][edge_index]
+                sources.update(resource['resource_id'] for resource in edge['sources'])
+            raw_scores[k] = len(sources)
+        else:
+            raw_scores[k] = len(graph)
+    # `or 1` keeps the division defined when there are no graphs or all raw scores are 0.
+    max_score = max(raw_scores.values(), default=0) or 1
+    graph_scores = {k: score/max_score for k, score in raw_scores.items()}
+    graph_scores_formatted = [{
+            'attributes': None,
+            'path_bindings': {
+                'p0': [{'id': k}]},
+            'resource_id': 'infores:tct',
+            'score': score,
+            'scoring_method': None,
+            'support_graphs': None
+            } for k, score in graph_scores.items()]
+    return graph_scores, graph_scores_formatted
+
+
+def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dict, result2:dict,
+        start_node_categories=None, end_node_categories=None,
+        get_node_info=True,
+        scoring_method='infores'):
+    """
+    Converts the results of two TRAPI queries into the same general json format as the other pathfinder APIs: result1 holds edges touching start_node_id, result2 edges touching end_node_id, and every node that appears opposite both becomes one auxiliary graph 'aux_{i}_{node id}' listing its edge ids.
+    scoring_method is how the node scores are generated, and could be 'infores' or 'edges'. If get_node_info is true, nodes still lacking a name or categories are replaced with the label and types returned by node_normalizer.get_normalized_nodes.
+    """
+    # node_info: node id -> {'name', 'categories'} gathered from '<subject|object>_name' / '<subject|object>_category' edge attributes
+    # TODO: get some node info? node attributes
+    node_info = {}
+    # maps each neighbor of start_node_id to its list of (edge id, edge) tuples from result1
+    intermediate_node_edges = {}
+    for k, v in result1.items():
+        i1 = v['subject']
+        i2 = v['object']
+        s_o = 'object'
+        if i1 == start_node_id:
+            intermediate_node_id = i2
+            s_o = 'object'
+        elif i2 == start_node_id:
+            intermediate_node_id = i1
+            s_o = 'subject'
+        else:
+            continue
+        if (i1 == start_node_id or i2 == start_node_id) and intermediate_node_id in intermediate_node_edges:
+            intermediate_node_edges[intermediate_node_id].append((k, v))
+        else:
+            intermediate_node_edges[intermediate_node_id] = [(k, v)]
+        # fetch or create the info dict for this neighbor
+        if intermediate_node_id not in node_info:
+            node_dict = {
+            }
+            node_info[intermediate_node_id] = node_dict
+        else:
+            node_dict = node_info[intermediate_node_id]
+        for attribute in v['attributes']:
+            if attribute['attribute_type_id'] == f'{s_o}_category':
+                if 'categories' not in node_dict:
+                    node_dict['categories'] = set([attribute['value']])
+                else:
+                    node_dict['categories'].add(attribute['value'])
+            if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict:
+                node_dict['name'] = attribute['value']
+        node_info[intermediate_node_id] = node_dict
+    connecting_intermediate_nodes = {}
+    for k, v in result2.items():
+        i1 = v['subject']
+        i2 = v['object']
+        if i1 == end_node_id:
+            intermediate_node_id = i2
+            s_o = 'object'
+        elif i2 == end_node_id:
+            intermediate_node_id = i1
+            s_o = 'subject'
+        else:
+            continue
+        if (i1 == end_node_id or i2 == end_node_id) and intermediate_node_id in intermediate_node_edges:
+            if intermediate_node_id in connecting_intermediate_nodes:
+                connecting_intermediate_nodes[intermediate_node_id]['e2'].append((k, v))
+            else:
+                connecting_intermediate_nodes[intermediate_node_id] = {'e1': intermediate_node_edges[intermediate_node_id], 'e2' : [(k, v)]}
+        if intermediate_node_id not in node_info:
+            node_dict = {
+            }
+            node_info[intermediate_node_id] = node_dict
+        else:
+            node_dict = node_info[intermediate_node_id]
+        for attribute in v['attributes']:
+            if attribute['attribute_type_id'] == f'{s_o}_category':
+                if 'categories' not in node_dict:
+                    node_dict['categories'] = set([attribute['value']])
+                else:
+                    node_dict['categories'].add(attribute['value'])
+            if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict:
+                node_dict['name'] = attribute['value']
+        node_info[intermediate_node_id] = node_dict
+    for k, v in node_info.items():
+        if 'categories' in v:
+            v['categories'] = list(v['categories'])
+    all_edges = {}
+    all_auxiliary_graphs = {}
+    i = 1
+    # sort connecting_intermediate_nodes by total number of connections (NOTE(review): no `Counter` import is visible at the top of this new file - confirm one exists)
+    connection_counts = Counter({k: len(v['e1'])*len(v['e2']) for k, v in connecting_intermediate_nodes.items()})
+    for i1, count in connection_counts.most_common():
+        kv = connecting_intermediate_nodes[i1]
+        e1s = kv['e1']
+        e2s = kv['e2']
+        edges = {k: v for k, v in e1s}
+        edges.update({k: v for k, v in e2s})
+        all_edges.update(edges)
+        keys = [x[0] for x in e1s] + [x[0] for x in e2s]
+        all_auxiliary_graphs[f'aux_{i}_{i1}'] = keys
+        i += 1
+    # generate output json
+    output = {
+        'query_graph': build_query_graph(start_node_id, end_node_id, start_node_categories, end_node_categories),
+        # TODO: don't drop the nodes (only the connecting intermediate nodes are emitted here)
+        'knowledge_graph': {'nodes': {x: node_info[x] for x in connection_counts.keys()},
+                            'edges': all_edges,
+                           },
+        'results': [{'analyses': []}],
+        'auxiliary_graphs': all_auxiliary_graphs
+    }
+    graph_scores, graph_scores_formatted = generate_score_results(output, method=scoring_method)
+    output['results'][0]['analyses'] = graph_scores_formatted
+    if get_node_info:
+        from .node_normalizer import get_normalized_nodes
+        nodes_to_add = []
+        for k, v in output['knowledge_graph']['nodes'].items():
+            if 'name' not in v or 'categories' not in v:
+                nodes_to_add.append(k)
+        normalized_nodes = get_normalized_nodes(nodes_to_add, mode='post')
+        for node_id in nodes_to_add:
+            nn = normalized_nodes[node_id]
+            output['knowledge_graph']['nodes'][node_id] = {'name': nn.label, 'categories': nn.types}
+    return output
+
+def neighborhood_finder(input_node, node2_categories, APInames, metaKG, API_predicates, input_node_category = []):
+    """
+    This function is used to find the neighborhood of a given input node with intermediate categories.
+
+    --------------
+    Parameters:
+    input_node (str): The input node - should be a CURIE id.
+    node2_categories (list): A list of intermediate categories to be used in the neighborhood finding process.
+    APInames (dict): A dictionary containing the names of the APIs to be used.
+    metaKG (DataFrame): The metadata knowledge graph containing information about the APIs and their predicates.
+    API_predicates (dict): A dictionary containing the predicates for each API.
+    input_node_category (list): Optional. A list of categories for the input node. If empty, it will be derived from the input node's types.
+
+    --------------
+    Returns:
+    input_node_id (str): The curie id of the input node.
+    result (dict): The result of the query for the input node.
+    result_parsed (DataFrame): The parsed results for the input node.
+    result_ranked_by_primary_infores (DataFrame): The ranked results based on primary infores.
+
+    --------------
+    Example:
+    >>> input_node_id, result, result_parsed, result_ranked_by_primary_infores1 = neighborhood_finder('MONDO:0008170', #Ovarian Cancer
+                                                                                            node2_categories = ['biolink:SmallMolecule', 'biolink:Drug', 'biolink:ChemicalEntity'],
+                                                                                            APInames = APInames,
+                                                                                            metaKG = metaKG,
+                                                                                            API_predicates = API_predicates)
+    --------------
+
+    """
+    from . import node_normalizer
+    from . import translator_query
+
+    input_node_id = input_node
+    # Step 1: Resolve the input node to get its curie id and categories
+    input_node_info = node_normalizer.get_normalized_nodes(input_node_id)
+    print(input_node_id)
+
+    if len(input_node_category) == 0:
+        input_node_category = input_node_info.types
+    else:
+        input_node_category = list(set(input_node_category).intersection(set(input_node_info.types)))
+        if len(input_node_category) == 0:
+            input_node_category = input_node_info.types
+
+    # Step 2: Select predicates and APIs based on the intermediate categories
+    sele_predicates, sele_APIs, API_URLs = sele_predicates_API(input_node_category,
+                                                                node2_categories,
+                                                                metaKG, APInames)
+
+    # Step 3: Format the query JSON for the input node
+    query_json = format_query_json([input_node_id], [],
+                                   [input_node_category],
+                                   node2_categories,
+                                   sele_predicates)
+
+    # Step 4: Query the APIs in parallel
+    result = translator_query.parallel_api_query(query_json=query_json,
+                             select_APIs= sele_APIs,
+                             APInames=APInames,
+                             API_predicates=API_predicates,
+                             max_workers=len(sele_APIs))
+    result_parsed = parse_KG(result)
+        # Step 7: Ranking the results. This ranking method is based on the number of unique
+        # primary infores. It can only be used to rank the results with one defined node.
+    result_ranked_by_primary_infores1 = rank_by_primary_infores(result_parsed, input_node_id)   # input_node1_id is the curie id of the
+    return input_node_id, result, result_parsed, result_ranked_by_primary_infores1
+
+

From d712fe3c1bce366c2c58a66db0ca214417fb31a7 Mon Sep 17 00:00:00 2001
From: yjzhang <yjzhang@cs.washington.edu>
Date: Thu, 30 Apr 2026 19:47:10 -0700
Subject: [PATCH 5/7] neighborhood finder - new output type

---
 TCT/neighborhood_finder.py | 72 +++++++++++---------------------------
 1 file changed, 20 insertions(+), 52 deletions(-)

diff --git a/TCT/neighborhood_finder.py b/TCT/neighborhood_finder.py
index 5310794..03ca8c9 100644
--- a/TCT/neighborhood_finder.py
+++ b/TCT/neighborhood_finder.py
@@ -1,6 +1,9 @@
+from collections import Counter
 
+from .TCT import sele_predicates_API, format_query_json, parse_KG, rank_by_primary_infores
 
-def format_query_json_for_pathfinder(subject_ids, object_ids=None,
+
+def format_query_json_for_neighborhood_finder(subject_ids, object_ids=None,
         subject_categories=None,
         object_categories=None,
         predicates=None):
@@ -134,7 +137,7 @@ def generate_score_results(results, method='infores'):
     return graph_scores, graph_scores_formatted
 
 
-def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dict, result2:dict,
+def parse_results_for_neighborhood_finder(start_node_id:str, results:dict,
         start_node_categories=None, end_node_categories=None,
         get_node_info=True,
         scoring_method='infores'):
@@ -143,11 +146,10 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
     scoring_method is how the node scores are generated, and could be 'infores' or 'edges'.
     """
     # nodes
-    # TODO: get some node info? node attributes
     node_info = {}
     # edges is a dict of intermediate nodes
-    intermediate_node_edges = {}
-    for k, v in result1.items():
+    node_edges = {}
+    for k, v in results.items():
         i1 = v['subject']
         i2 = v['object']
         s_o = 'object'
@@ -159,10 +161,10 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
             s_o = 'subject'
         else:
             continue
-        if (i1 == start_node_id or i2 == start_node_id) and intermediate_node_id in intermediate_node_edges:
-            intermediate_node_edges[intermediate_node_id].append((k, v))
+        if (i1 == start_node_id or i2 == start_node_id) and intermediate_node_id in node_edges:
+            node_edges[intermediate_node_id].append((k, v))
         else:
-            intermediate_node_edges[intermediate_node_id] = [(k, v)]
+            node_edges[intermediate_node_id] = [(k, v)]
         # add node dict
         if intermediate_node_id not in node_info:
             node_dict = {
@@ -179,38 +181,6 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
             if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict:
                 node_dict['name'] = attribute['value']
         node_info[intermediate_node_id] = node_dict
-    connecting_intermediate_nodes = {}
-    for k, v in result2.items():
-        i1 = v['subject']
-        i2 = v['object']
-        if i1 == end_node_id:
-            intermediate_node_id = i2
-            s_o = 'object'
-        elif i2 == end_node_id:
-            intermediate_node_id = i1
-            s_o = 'subject'
-        else:
-            continue
-        if (i1 == end_node_id or i2 == end_node_id) and intermediate_node_id in intermediate_node_edges:
-            if intermediate_node_id in connecting_intermediate_nodes:
-                connecting_intermediate_nodes[intermediate_node_id]['e2'].append((k, v))
-            else:
-                connecting_intermediate_nodes[intermediate_node_id] = {'e1': intermediate_node_edges[intermediate_node_id], 'e2' : [(k, v)]}
-        if intermediate_node_id not in node_info:
-            node_dict = {
-            }
-            node_info[intermediate_node_id] = node_dict
-        else:
-            node_dict = node_info[intermediate_node_id]
-        for attribute in v['attributes']:
-            if attribute['attribute_type_id'] == f'{s_o}_category':
-                if 'categories' not in node_dict:
-                    node_dict['categories'] = set([attribute['value']])
-                else:
-                    node_dict['categories'].add(attribute['value'])
-            if attribute['attribute_type_id'] == f'{s_o}_name' and 'name' not in node_dict:
-                node_dict['name'] = attribute['value']
-        node_info[intermediate_node_id] = node_dict
     for k, v in node_info.items():
         if 'categories' in v:
             v['categories'] = list(v['categories'])
@@ -218,21 +188,16 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
     all_auxiliary_graphs = {}
     i = 1
     # sort connecting_intermediate_nodes by total number of connections
-    connection_counts = Counter({k: len(v['e1'])*len(v['e2']) for k, v in connecting_intermediate_nodes.items()})
+    connection_counts = Counter({k: len(v) for k, v in node_edges.items()})
     for i1, count in connection_counts.most_common():
-        kv = connecting_intermediate_nodes[i1]
-        e1s = kv['e1']
-        e2s = kv['e2']
-        edges = {k: v for k, v in e1s}
-        edges.update({k: v for k, v in e2s})
-        all_edges.update(edges)
-        keys = [x[0] for x in e1s] + [x[0] for x in e2s]
+        edges = node_edges[i1]
+        all_edges.update({k: v for k, v in edges})
+        keys = [x[0] for x in edges]
         all_auxiliary_graphs[f'aux_{i}_{i1}'] = keys
         i += 1
     # generate output json
     output = {
-        'query_graph': build_query_graph(start_node_id, end_node_id, start_node_categories, end_node_categories),
-        # TODO: don't drop the nodes
+        'query_graph': build_query_graph(start_node_id, '', start_node_categories, end_node_categories),
         'knowledge_graph': {'nodes': {x: node_info[x] for x in connection_counts.keys()},
                             'edges': all_edges,
                            },
@@ -250,9 +215,11 @@ def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dic
         normalized_nodes = get_normalized_nodes(nodes_to_add, mode='post')
         for node_id in nodes_to_add:
             nn = normalized_nodes[node_id]
-            output['knowledge_graph']['nodes'][node_id] = {'name': nn.label, 'categories': nn.types}
+            if nn is not None:
+                output['knowledge_graph']['nodes'][node_id] = {'name': nn.label, 'categories': nn.types}
     return output
 
+
 def neighborhood_finder(input_node, node2_categories, APInames, metaKG, API_predicates, input_node_category = []):
     """
     This function is used to find the neighborhood of a given input node with intermediate categories.
@@ -319,6 +286,7 @@ def neighborhood_finder(input_node, node2_categories, APInames, metaKG, API_pred
         # Step 7: Ranking the results. This ranking method is based on the number of unique
         # primary infores. It can only be used to rank the results with one defined node.
     result_ranked_by_primary_infores1 = rank_by_primary_infores(result_parsed, input_node_id)   # input_node1_id is the curie id of the
-    return input_node_id, result, result_parsed, result_ranked_by_primary_infores1
+    parsed_results = parse_results_for_neighborhood_finder(input_node_id, result, input_node_category, node2_categories)
+    return input_node_id, result, parsed_results, result_ranked_by_primary_infores1
 
 

From 3b097a9a7a1dd1f3cf9ae3ae9f64b99fa04ae7d5 Mon Sep 17 00:00:00 2001
From: yjzhang <yjzhang@cs.washington.edu>
Date: Thu, 30 Apr 2026 19:48:24 -0700
Subject: [PATCH 6/7] removed extraneous function

---
 TCT/neighborhood_finder.py | 55 --------------------------------------
 1 file changed, 55 deletions(-)

diff --git a/TCT/neighborhood_finder.py b/TCT/neighborhood_finder.py
index 03ca8c9..41f822a 100644
--- a/TCT/neighborhood_finder.py
+++ b/TCT/neighborhood_finder.py
@@ -3,61 +3,6 @@
 from .TCT import sele_predicates_API, format_query_json, parse_KG, rank_by_primary_infores
 
 
-def format_query_json_for_neighborhood_finder(subject_ids, object_ids=None,
-        subject_categories=None,
-        object_categories=None,
-        predicates=None):
-    '''
-    Example input:
-    subject_ids = ["NCBIGene:3845"]
-    object_ids = []
-    subject_categories = ["biolink:Gene"]
-    object_categories = ["biolink:Gene"]
-    predicates = ["biolink:positively_correlated_with", "biolink:physically_interacts_with"]
-    '''
-    query_json_temp = {
-        "message": {
-            "query_graph": {
-
-                "edges": {
-                    "e00": {
-                        "subject": "n00",
-                        "object": "n01",
-                        "predicates": predicates
-                        }
-                    },
-                "nodes": {
-                    "n00": {
-                        "ids":subject_ids, # required
-                        #"categories":[] # optional, if not provided, it will be empty
-                        },
-                    "n01": {
-                        #"ids":[],
-                        "categories":[] # required
-                        }
-                    }
-                }
-            }
-        }
-
-    if len(subject_ids) > 0:
-        query_json_temp["message"]["query_graph"]["nodes"]["n00"]["ids"] = subject_ids
-
-    if object_ids is not None and len(object_ids) > 0:
-        query_json_temp["message"]["query_graph"]["nodes"]["n01"]["ids"] = object_ids
-
-    if subject_categories is not None and len(subject_categories) > 0:
-        query_json_temp["message"]["query_graph"]["nodes"]["n00"]["categories"] = subject_categories
-
-    if object_categories is not None and len(object_categories) > 0:
-        query_json_temp["message"]["query_graph"]["nodes"]["n01"]["categories"] = object_categories
-
-    if predicates is not None and len(predicates) > 0:
-        query_json_temp["message"]["query_graph"]["edges"]["e00"]["predicates"] = predicates
-
-    return query_json_temp
-
-
 def build_query_graph(start_node_id, end_node_id, start_node_categories=None, end_node_categories=None):
     """
     start_node_categories and end_node_categories are lists of categories.

From 2ffe87d7dc0bad03e84f08549197dc15560762ce Mon Sep 17 00:00:00 2001
From: yjzhang <yjzhang@cs.washington.edu>
Date: Thu, 30 Apr 2026 19:49:36 -0700
Subject: [PATCH 7/7] consolidated some duplicate functions

---
 TCT/neighborhood_finder.py | 81 +-------------------------------------
 1 file changed, 1 insertion(+), 80 deletions(-)

diff --git a/TCT/neighborhood_finder.py b/TCT/neighborhood_finder.py
index 41f822a..4c8018a 100644
--- a/TCT/neighborhood_finder.py
+++ b/TCT/neighborhood_finder.py
@@ -1,86 +1,7 @@
 from collections import Counter
 
 from .TCT import sele_predicates_API, format_query_json, parse_KG, rank_by_primary_infores
-
-
-def build_query_graph(start_node_id, end_node_id, start_node_categories=None, end_node_categories=None):
-    """
-    start_node_categories and end_node_categories are lists of categories.
-    """
-    q = {
-            "nodes": {
-                "on": {
-                    "categories": end_node_categories,
-                    "constraints": [],
-                    "ids": [
-                        end_node_id
-                    ],
-                    "is_set": False,
-                    "option_group_id": None,
-                    "set_id": None,
-                    "set_interpretation": "BATCH"
-                },
-                "sn": {
-                    "categories": start_node_categories,
-                    "constraints": [],
-                    "ids": [
-                        start_node_id
-                    ],
-                    "is_set": False,
-                    "option_group_id": None,
-                    "set_id": None,
-                    "set_interpretation": "BATCH"
-                }
-            },
-            "paths": {
-                "p0": {
-                    "constraints": None,
-                    "object": "on",
-                    "predicates": None,
-                    "subject": "sn"
-                }
-            }
-        }
-    return q
-
-
-def generate_score_results(results, method='infores'):
-    """
-    Generates a score dict, and a list of "analyses".
-    method can be 'infores' or 'edges'
-    """
-    graph_scores = {}
-    max_score = 0
-    auxiliary_graphs = results['auxiliary_graphs']
-    for k, graph in auxiliary_graphs.items():
-        if method == 'infores':
-            sources = set()
-            for edge_index in graph:
-                edge = results['knowledge_graph']['edges'][edge_index]
-                for resource in edge['sources']:
-                    sources.add(resource['resource_id'])
-            score = len(sources)
-            if score > max_score:
-                max_score = score
-        else:
-            score = len(graph)
-            if score > max_score:
-                max_score = score
-        graph_scores[k] = score
-    graph_scores_formatted = []
-    for k in graph_scores.keys():
-        graph_scores[k] = graph_scores[k]/max_score
-        graph_scores_formatted.append({
-            'attributes': None,
-            'path_bindings': {
-                'p0': [{'id': k}]},
-            'resource_id': 'infores:tct',
-            'score': graph_scores[k],
-            'scoring_method': None,
-            'support_graphs': None
-            })
-    return graph_scores, graph_scores_formatted
-
+from .TCT_pathfinder import generate_score_results, build_query_graph
 
 def parse_results_for_neighborhood_finder(start_node_id:str, results:dict,
         start_node_categories=None, end_node_categories=None,