diff --git a/TCT/TCT.py b/TCT/TCT.py index 7576bca..c327d62 100644 --- a/TCT/TCT.py +++ b/TCT/TCT.py @@ -1558,6 +1558,7 @@ def merge_by_ranking_index(result_ranked_by_primary_infores, def merge_ranking_by_number_of_infores(result_ranked_by_primary_infores, result_ranked_by_primary_infores1, + plot=True, top_n = 30, fontsize = 12, title_fontsize = 12, @@ -1590,8 +1591,8 @@ def merge_ranking_by_number_of_infores(result_ranked_by_primary_infores, #result_xy["output_node_name"] = new_colnames - result_xy['predictes1'] = predicts_list1 - result_xy['predictes2'] = predicts_list2 + result_xy['predicates1'] = predicts_list1 + result_xy['predicates2'] = predicts_list2 result_xy_sorted = result_xy.sort_values(by=['score'], ascending=False) @@ -1613,7 +1614,8 @@ def merge_ranking_by_number_of_infores(result_ranked_by_primary_infores, x = result_xy_sorted.iloc[0:top_n].index y = result_xy_sorted.iloc[0:top_n]['score'] - plot_path_bar(x,y,fontsize, title_fontsize, output_png=output_png) + if plot: + plot_path_bar(x,y,fontsize, title_fontsize, output_png=output_png) return result_xy_sorted diff --git a/TCT/TCT_pathfinder.py b/TCT/TCT_pathfinder.py new file mode 100644 index 0000000..ddd6601 --- /dev/null +++ b/TCT/TCT_pathfinder.py @@ -0,0 +1,232 @@ +# TCT Pathfinder... + +from collections import Counter + +from . import node_normalizer +from . import translator_query +from .TCT import sele_predicates_API, parse_KG, rank_by_primary_infores, merge_ranking_by_number_of_infores + +def format_query_json_for_pathfinder(subject_ids, object_ids=None, + subject_categories=None, + object_categories=None, + predicates=None): + ''' + Example input: + subject_ids = ["NCBIGene:3845"] + object_ids = [] + subject_categories = ["biolink:Gene"] + object_categories = ["biolink:Gene"] + predicates = ["biolink:positively_correlated_with", "biolink:physically_interacts_with"] + ''' + query_json_temp = { + "message": { + "query_graph": { + + "edges": { + "e00": { + "subject": "n00", + "object": "n01", + "predicates": predicates + } + }, + "nodes": { + "n00": { + "ids":subject_ids, # required + #"categories":[] # optional, if not provided, it will be empty + }, + "n01": { + #"ids":[], + "categories":[] # required + } + } + } + } + } + + if len(subject_ids) > 0: + query_json_temp["message"]["query_graph"]["nodes"]["n00"]["ids"] = subject_ids + + if object_ids is not None and len(object_ids) > 0: + query_json_temp["message"]["query_graph"]["nodes"]["n01"]["ids"] = object_ids + + if subject_categories is not None and len(subject_categories) > 0: + query_json_temp["message"]["query_graph"]["nodes"]["n00"]["categories"] = subject_categories + + if object_categories is not None and len(object_categories) > 0: + query_json_temp["message"]["query_graph"]["nodes"]["n01"]["categories"] = object_categories + + if predicates is not None and len(predicates) > 0: + query_json_temp["message"]["query_graph"]["edges"]["e00"]["predicates"] = predicates + + return query_json_temp + + +def build_query_graph(start_node_id, end_node_id, start_node_categories=None, end_node_categories=None): + """ + start_node_categories and end_node_categories are lists of categories. 
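+    Returns a TRAPI query_graph dict with a start node "sn", an end node "on", and a single path "p0" from "sn" to "on",
+    e.g. build_query_graph("NCBIGene:3458", "MONDO:0100096", ["biolink:Gene"], ["biolink:Disease"]).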
+    """
+    q = {
+        "nodes": {
+            "on": {
+                "categories": end_node_categories,
+                "constraints": [],
+                "ids": [
+                    end_node_id
+                ],
+                "is_set": False,
+                "option_group_id": None,
+                "set_id": None,
+                "set_interpretation": "BATCH"
+            },
+            "sn": {
+                "categories": start_node_categories,
+                "constraints": [],
+                "ids": [
+                    start_node_id
+                ],
+                "is_set": False,
+                "option_group_id": None,
+                "set_id": None,
+                "set_interpretation": "BATCH"
+            }
+        },
+        "paths": {
+            "p0": {
+                "constraints": None,
+                "object": "on",
+                "predicates": None,
+                "subject": "sn"
+            }
+        }
+    }
+    return q
+
+
+def parse_results_for_pathfinder(start_node_id:str, end_node_id:str, result1:dict, result2:dict,
+                                 start_node_categories=None, end_node_categories=None):
+    """
+    Converts the results of two TRAPI queries into the same general json format as the other pathfinder APIs.
+    """
+    # TODO: parse results...
+    # intermediate_node_edges maps each intermediate node id to the list of
+    # (edge_id, edge) pairs that connect it to the start node.
+    intermediate_node_edges = {}
+    for k, v in result1.items():
+        i1 = v['subject']
+        i2 = v['object']
+        if i1 == start_node_id:
+            intermediate_node_id = i2
+        elif i2 == start_node_id:
+            intermediate_node_id = i1
+        else:
+            continue
+        if (i1 == start_node_id or i2 == start_node_id) and intermediate_node_id in intermediate_node_edges:
+            intermediate_node_edges[intermediate_node_id].append((k, v))
+        else:
+            intermediate_node_edges[intermediate_node_id] = [(k, v)]
+    # connecting_intermediate_nodes keeps only the intermediate nodes that also reach
+    # the end node, with their edges to the start node ('e1') and end node ('e2').
+    connecting_intermediate_nodes = {}
+    for k, v in result2.items():
+        i1 = v['subject']
+        i2 = v['object']
+        if i1 == end_node_id:
+            intermediate_node_id = i2
+        elif i2 == end_node_id:
+            intermediate_node_id = i1
+        else:
+            continue
+        if (i1 == end_node_id or i2 == end_node_id) and intermediate_node_id in intermediate_node_edges:
+            if intermediate_node_id in connecting_intermediate_nodes:
+                connecting_intermediate_nodes[intermediate_node_id]['e2'].append((k, v))
+            else:
+                connecting_intermediate_nodes[intermediate_node_id] = {'e1': intermediate_node_edges[intermediate_node_id], 'e2': [(k, v)]}
+    all_edges = {}
+    all_auxiliary_graphs = {}
+    i = 1
+    # sort connecting_intermediate_nodes by the number of distinct start-intermediate-end paths (|e1| * |e2|)
+    connection_counts = Counter({k: len(v['e1'])*len(v['e2']) for k, v in connecting_intermediate_nodes.items()})
+    for i1, count in connection_counts.most_common():
+        kv = connecting_intermediate_nodes[i1]
+        e1s = kv['e1']
+        e2s = kv['e2']
+        edges = {k: v for k, v in e1s}
+        edges.update({k: v for k, v in e2s})
+        all_edges.update(edges)
+        keys = [x[0] for x in e1s] + [x[0] for x in e2s]
+        all_auxiliary_graphs[f'aux_{i}_{i1}'] = keys
+        i += 1
+    output = {
+        'query_graph': build_query_graph(start_node_id, end_node_id, start_node_categories, end_node_categories),
+        'knowledge_graph': {'nodes': {x: {} for x in connection_counts.keys()},
+                            'edges': all_edges,
+                            },
+        'results': [{'analyses': []}],
+        'auxiliary_graphs': all_auxiliary_graphs
+    }
+    return output
+
+
+def pathfinder(input_node1_id:str, input_node2_id:str,
+               intermediate_categories:list, APInames, metaKG, API_predicates):
+    # get categories for input nodes
+    normalized_node_dict = node_normalizer.get_normalized_nodes([input_node1_id, input_node2_id])
+    input_node1_info = normalized_node_dict[input_node1_id]
+    input_node1_list = [input_node1_id]
+    input_node1_category = input_node1_info.types
+
+    input_node2_info = normalized_node_dict[input_node2_id]
+    print(input_node2_id)
+    input_node2_list = [input_node2_id]
+
+    input_node2_category = input_node2_info.types
+
+    # Select predicates and APIs based on the intermediate categories
+    sele_predicates1, sele_APIs1, API_URLs1 = sele_predicates_API(input_node1_category,
+                                                                  intermediate_categories,
+                                                                  metaKG, APInames)
+    sele_predicates2, sele_APIs2, API_URLs2 = sele_predicates_API(intermediate_categories,
+                                                                  input_node2_category,
+                                                                  metaKG, APInames)
+    # first hop: node1 -> intermediate node
+    query_json1 = format_query_json_for_pathfinder(input_node1_list, # a list of identifiers for input node1
+                                       [], # id list for the intermediate node; empty so only node1 is pinned to specific ids
+                                       input_node1_category, # a list of categories of input node1
+                                       intermediate_categories, # a list of categories of the intermediate node
+                                       sele_predicates1) # a list of predicates
+
+    # second hop: intermediate node -> node2, so the predicates run from the
+    # intermediate categories to node2's categories
+    query_json2 = format_query_json_for_pathfinder([],
+                                       input_node2_list,
+                                       intermediate_categories, # a list of categories of the intermediate node
+                                       input_node2_category, # a list of categories of input node2
+                                       sele_predicates2) # a list of predicates
+
+    result1 = translator_query.parallel_api_query(query_json=query_json1,
+                            select_APIs = sele_APIs1,
+                            APInames=APInames,
+                            API_predicates=API_predicates,
+                            max_workers=len(sele_APIs1))
+    result2 = translator_query.parallel_api_query(query_json=query_json2,
+                            select_APIs = sele_APIs2,
+                            APInames=APInames,
+                            API_predicates=API_predicates,
+                            max_workers=len(sele_APIs2))
+
+    result_parsed1 = parse_KG(result1)
+    # Rank the results. This ranking method is based on the number of unique
+    # primary infores. It can only be used to rank the results with one defined node.
+    result_ranked_by_primary_infores1 = rank_by_primary_infores(result_parsed1, input_node1_id) # input_node1_id is the curie id of the defined node in query 1
+
+    result_parsed2 = parse_KG(result2)
+    result_ranked_by_primary_infores2 = rank_by_primary_infores(result_parsed2, input_node2_id) # input_node2_id is the curie id of the defined node in query 2
+
+    possible_paths = len(set(result_ranked_by_primary_infores1['output_node']).intersection(set(result_ranked_by_primary_infores2['output_node'])))
+    print("Number of possible paths: ", possible_paths)
+
+    # merge the two rankings; plot=False because only the merged table is needed here
+    paths = merge_ranking_by_number_of_infores(result_ranked_by_primary_infores1, result_ranked_by_primary_infores2,
+                                               plot=False)
+
+    output = parse_results_for_pathfinder(input_node1_id, input_node2_id, result1, result2,
+                                          start_node_categories=input_node1_category,
+                                          end_node_categories=input_node2_category)
+
+    return result1, result2, output, paths
+
diff --git a/TCT/graph_downloader.py b/TCT/graph_downloader.py
new file mode 100644
index 0000000..85db2d1
--- /dev/null
+++ b/TCT/graph_downloader.py
@@ -0,0 +1,67 @@
+# Download graphs to local caches...
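+# Each graph is downloaded as a .tar.zst archive into ~/.cache/TCT/<graph_name>/,
+# decompressed with zstandard, and extracted there so later loads reuse the cache.
+# Example (assuming the 'signor' release is available at the URL below):
+#     from TCT import graph_downloader
+#     g = graph_downloader.load_graph('signor')   # returns an igraph.Graph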
+
+import os
+from pathlib import Path
+import tarfile
+import tempfile
+
+import requests
+from zstandard import ZstdDecompressor
+
+GRAPHS = {
+    'signor': {
+        'download': 'https://kgx-storage.rtx.ai/releases/signor/latest/signor.tar.zst',
+        'metadata': 'https://kgx-storage.rtx.ai/releases/signor/latest/graph-metadata.json'
+    },
+}
+
+CACHE_DIR = Path.home() / '.cache' / 'TCT'
+
+os.makedirs(CACHE_DIR, exist_ok=True)
+
+def download_graph(graph_name: str):
+    graph_path = CACHE_DIR / graph_name
+    # make sure the per-graph cache directory exists before writing into it
+    os.makedirs(graph_path, exist_ok=True)
+    download_path = GRAPHS[graph_name]['download']
+    save_path = graph_path / (graph_name + '.tar.zst')
+    request = requests.get(download_path, stream=True)
+    with open(save_path, 'wb') as f:
+        for chunk in request.iter_content(chunk_size=16*1024):
+            f.write(chunk)
+    # Extract file with zstandard
+    dctx = ZstdDecompressor()
+    # source: https://gist.github.com/scivision/ad241e9cf0474e267240e196d7545eca
+    with tempfile.TemporaryFile(suffix=".tar") as ofh:
+        with save_path.open("rb") as ifh:
+            dctx.copy_stream(ifh, ofh)
+        ofh.seek(0)
+        with tarfile.open(fileobj=ofh) as z:
+            z.extractall(graph_path)
+
+def load_graph(graph_name: str, output='igraph'):
+    """
+    Loads a Translator graph into igraph, downloading it into the cache first if needed.
+
+    Params
+    ------
+    graph_name : str
+        The name of the graph - it should be in graph_downloader.GRAPHS.
+    """
+    if graph_name not in GRAPHS:
+        raise ValueError('graph_name not found')
+    graph_path = CACHE_DIR / graph_name
+    metadata_path = graph_path / 'graph-metadata.json'
+    nodes_path = graph_path / 'nodes.jsonl'
+    edges_path = graph_path / 'edges.jsonl'
+    os.makedirs(graph_path, exist_ok=True)
+    # download metadata and main download
+    if not os.path.exists(metadata_path) or not os.path.exists(nodes_path) or not os.path.exists(edges_path):
+        # Download the .tar.zst file
+        download_graph(graph_name)
+    # load graph
+    from . import kg_loader
+    nodes, edges, node_types, edge_types = kg_loader.import_kg2_jsonl(nodes_path, edges_path)
+    if output == 'igraph':
+        return kg_loader.load_kg2_igraph_from_data(nodes, edges, node_types, edge_types)
+    #else:
+    # TODO: not implemented yet
+    #    return kg_loader.load_kg2_networkx_from_data(nodes, edges, node_types, edge_types)
diff --git a/TCT/kg_loader.py b/TCT/kg_loader.py
new file mode 100644
index 0000000..aca9909
--- /dev/null
+++ b/TCT/kg_loader.py
@@ -0,0 +1,379 @@
+# Import graph from kg2 csv dump, tsv dump, or jsonl dump.
+
+import csv
+import gzip
+import json
+import os
+
+from scipy import sparse, io
+
+# TODO: multiple edges between two nodes?
+def import_kg2_csv(node_filename, edge_filename, edges_to_include=None, remove_unused_nodes=False, verbose=True, reindex_edges=True, use_node_types=True, use_edge_types=True, use_edge_properties=False):
+    """
+    Args:
+        node_filename: name of the node file (csv or tsv, optionally .gz-compressed)
+        edge_filename: name of the edge file (csv or tsv, optionally .gz-compressed)
+        edges_to_include: set of edge types to keep; None keeps all edge types
+        remove_unused_nodes: True if nodes with no in- or out-edges are to be removed.
+        reindex_edges: if True, key the edge dict by node indices; if False, by the original node IDs.
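+        use_node_types: if True, store an int type id per node; if False, store True as a placeholder.
+        use_edge_types: if True, store an int type id per edge; if False, store True as a placeholder.
+        use_edge_properties: if True, store a dict of edge properties (primary_knowledge_source, id) as the edge value instead of a type id.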
+
+    Returns:
+        nodes: list of (_id, _name, _labels_id, _identifier, _source) where _labels_id corresponds to a key in node_types
+        edges: dict of (node1, node2): _type_id where node1 and node2 index into nodes, and _type_id corresponds to a key in edge_types
+        node_types: dict of int: str (_labels)
+        edge_types: dict of int: str (_type)
+    """
+    nodes = []
+    n_nodes = 0
+    # mapping of _id to index in nodes
+    node_index = {}
+    # node_types maps each category string (_labels) to an int id (inverted to int -> str before returning)
+    node_types = {}
+    edges = {}
+    # edge_types maps each predicate string (_type) to an int id (inverted to int -> str before returning)
+    edge_types = {}
+    # sets of nodes that have in-edges or out-edges (to use when deciding whether to remove nodes)
+    node_has_edge = set()
+    csv.field_size_limit(99999999)
+    if node_filename.endswith('.gz'):
+        # handle gzip
+        f = gzip.open(node_filename, 'rt')
+    else:
+        f = open(node_filename)
+    delimiter = ','
+    if 'tsv' in node_filename:
+        delimiter = '\t'
+    dr = csv.DictReader(f, dialect='unix', delimiter=delimiter)
+    for i, row in enumerate(dr):
+        if verbose and i % 100000 == 0:
+            print(i, 'nodes: ', len(node_index), 'edges: ', len(edges))
+        # if this is a node
+        row_name = ''
+        row_identifier = row['id']
+        if 'name' not in row:
+            row_name = row_identifier
+        else:
+            row_name = row['name']
+        row_source = row['category']
+        row_label = row['category']
+        if use_node_types:
+            if row_label in node_types:
+                nodes.append((row['id'], row_name, node_types[row_label], row_identifier, row_source))
+            else:
+                nodes.append((row['id'], row_name, len(node_types) + 1, row_identifier, row_source))
+                node_types[row_label] = len(node_types) + 1
+        else:
+            nodes.append((row['id'], row_name, True, row_identifier, row_source))
+        node_index[row['id']] = n_nodes
+        n_nodes += 1
+    if edge_filename.endswith('.gz'):
+        # handle gzip
+        f = gzip.open(edge_filename, 'rt')
+    else:
+        f = open(edge_filename)
+    delimiter = ','
+    if 'tsv' in edge_filename:
+        delimiter = '\t'
+    dr = csv.DictReader(f, dialect='unix', delimiter=delimiter)
+    for i, row in enumerate(dr):
+        # if this row is an edge
+        edge_type = row['predicate']
+        if edges_to_include is None or edge_type in edges_to_include:
+            node1 = row['subject']
+            node2 = row['object']
+            node_has_edge.add(node1)
+            node_has_edge.add(node2)
+            if use_edge_types:
+                if edge_type in edge_types:
+                    edges[(node1, node2)] = edge_types[edge_type]
+                else:
+                    edges[(node1, node2)] = len(edge_types) + 1
+                    edge_types[edge_type] = len(edge_types) + 1
+            else:
+                edges[(node1, node2)] = True
+            if use_edge_properties:
+                if 'properties' in row:
+                    edge_properties = row['properties']
+                else:
+                    edge_properties = {}
+                if 'primary_knowledge_source' in row:
+                    edge_properties['primary_knowledge_source'] = row['primary_knowledge_source']
+                edge_properties['id'] = int(row['id'])
+                edges[(node1, node2)] = edge_properties
+    if remove_unused_nodes:
+        # remove all nodes that don't have edges
+        to_remove = set(node_index.keys()).difference(node_has_edge)
+        nodes = [n for n in nodes if n[0] not in to_remove]
+        # rebuild node_index
+        node_index = {n[0]: i for i, n in enumerate(nodes)}
+    # convert edge indices
+    if reindex_edges:
+        new_edges = {}
+        for k, e in edges.items():
+            node1, node2 = k
+            node1 = node_index[node1]
+            node2 = node_index[node2]
+            new_edges[(node1, node2)] = e
+        edges = new_edges
+    node_types = {v: k for k, v in node_types.items()}
+    edge_types = {v: k for k, v in edge_types.items()}
+    return nodes, edges, node_types, edge_types
+
+
+def import_kg2_jsonl(node_filename, edge_filename, edges_to_include=None, remove_unused_nodes=True, use_edge_types=True, use_node_types=True, verbose=True, reindex_edges=True, use_edge_properties=False):
+    """
+    Imports a jsonl file that contains nodes and edges.
+
+    Args:
+        node_filename: name of jsonl file containing nodes, or both nodes and edges.
+        edge_filename: name of jsonl file containing edges. Can be None if all nodes and edges are in node_filename.
+        edges_to_include: set of edge types to keep; None keeps all edge types
+        remove_unused_nodes: True if nodes with no in- or out-edges are to be removed.
+        reindex_edges: if True, key the edge dict by node indices; if False, by the original node IDs.
+        use_node_types, use_edge_types, use_edge_properties: see import_kg2_csv.
+
+    Returns:
+        nodes: list of (_id, _name, _labels_id, _identifier, _source) where _labels_id corresponds to a key in node_types
+        edges: dict of (node1, node2): _type_id where node1 and node2 index into nodes, and _type_id corresponds to a key in edge_types
+        node_types: dict of int: str (_labels)
+        edge_types: dict of int: str (_type)
+    """
+    nodes = []
+    n_nodes = 0
+    # mapping of _id to index in nodes
+    node_index = {}
+    # node_types maps each category string (_labels) to an int id (inverted to int -> str before returning)
+    node_types = {}
+    edges = {}
+    # edge_types maps each predicate string (_type) to an int id (inverted to int -> str before returning)
+    edge_types = {}
+    # sets of nodes that have in-edges or out-edges (to use when deciding whether to remove nodes)
+    node_has_edge = set()
+    if str(node_filename).endswith('.gz'):
+        # handle gzip
+        f = gzip.open(node_filename, 'rt')
+    else:
+        f = open(node_filename)
+    line = f.readline()
+    i = 0
+    using_edge_file = False
+    while line:
+        row = json.loads(line)
+        if verbose and i % 100000 == 0:
+            print(i, 'nodes: ', len(node_index), 'edges: ', len(edges))
+            print(row)
+        # if this is a node
+        if 'id' in row and 'category' in row and 'subject' not in row and 'object' not in row:
+            row_name = ''
+            row_identifier = row['id']
+            if 'name' not in row:
+                row_name = row_identifier
+            else:
+                row_name = row['name']
+            row_source = row['category']
+            # arbitrarily pick the first category to use as the node's type label
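+            # (in the jsonl dump 'category' is a list of biolink categories; the csv dump has a single string)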
+ row_label = row['category'][0] + #print('row_label:', row_label) + if use_node_types: + if row_label in node_types: + nodes.append((row['id'], row_name, node_types[row_label], row_identifier, row_source)) + else: + nodes.append((row['id'], row_name, len(node_types) + 1, row_identifier, row_source)) + node_types[row_label] = len(node_types) + 1 + else: + nodes.append((row['id'], row_name, True, row_identifier, row_source)) + node_index[row['id']] = n_nodes + n_nodes += 1 + # if this row is an edge + else: + edge_type = row['predicate'] + if edges_to_include is None or edge_type in edges_to_include: + node1 = row['subject'] + node2 = row['object'] + node_has_edge.add(node1) + node_has_edge.add(node2) + if use_edge_types: + if edge_type in edge_types: + edges[(node1, node2)] = edge_types[edge_type] + else: + edges[(node1, node2)] = len(edge_types) + 1 + edge_types[edge_type] = len(edge_types) + 1 + else: + edges[(node1, node2)] = True + if use_edge_properties: + if 'properties' in row: + edge_properties = row['properties'] + else: + edge_properties = {} + if 'primary_knowledge_source' in row: + edge_properties['primary_knowledge_source'] = row['primary_knowledge_source'] + edge_properties['id'] = int(row['id']) + edges[(node1, node2)] = edge_properties + line = f.readline() + if not line and not using_edge_file and edge_filename is not None: + f.close() + print('Opening edge file', edge_filename) + if str(edge_filename).endswith('.gz'): + # handle gzip + f = gzip.open(edge_filename, 'rt') + else: + f = open(edge_filename) + line = f.readline() + using_edge_file = True + i += 1 + if remove_unused_nodes: + # remove all nodes that don't have edges + to_remove = set(node_index.keys()).difference(node_has_edge) + nodes = [n for n in nodes if n[0] not in to_remove] + # rebuild node_index + node_index = {n[0]: i for i, n in enumerate(nodes)} + # convert edge indices + if reindex_edges: + new_edges = {} + for k, e in edges.items(): + node1, node2 = k + node1 = node_index[node1] + node2 = node_index[node2] + new_edges[(node1, node2)] = e + edges = new_edges + node_types = {v: k for k, v in node_types.items()} + edge_types = {v: k for k, v in edge_types.items()} + return nodes, edges, node_types, edge_types + + + + +def to_sparse(nodes, edges): + """ + Returns a DOK matrix from the edges... 
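+
+    Assumes the edge keys are integer index pairs with int values, i.e. the importers were run with reindex_edges=True and use_edge_properties=False.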
+ """ + n_nodes = len(nodes) + edge_matrix = sparse.dok_array((n_nodes, n_nodes), dtype=int) + for k, v in sorted(edges.items()): + n1, n2 = k + edge_matrix[n1, n2] = v + return edge_matrix + + +def load_kg2(filename='kg2.csv', edges_to_include=None, remove_unused_nodes=False, mtx_filename='spoke.mtx', **kwargs): + if filename.endswith('.csv') or filename.endswith('.csv.gz') or filename.endswith('.tsv') or filename.endswith('.tsv.gz'): + nodes, edges, node_types, edge_types = import_kg2_csv(filename, edges_to_include, remove_unused_nodes, **kwargs) + elif filename.endswith('.json') or filename.endswith('.json.gz') or filename.endswith('.jsonl') or filename.endswith('.jsonl.gz'): + nodes, edges, node_types, edge_types = import_kg2_jsonl(filename, edges_to_include, remove_unused_nodes, **kwargs) + else: + raise Exception('Filename should be a csv, tsv, json, or jsonl.') + if not os.path.exists(mtx_filename): + edge_matrix = to_sparse(nodes, edges) + io.mmwrite(mtx_filename, edge_matrix) + else: + edge_matrix = io.mmread(mtx_filename) + return nodes, edges, node_types, edge_types, edge_matrix + + +def load_kg2_igraph_from_data(nodes, edges, node_types, edge_types, remove_unused_nodes=True, directed=False, verbose=False, low_memory=False, **kwargs): + """ + Uses the output of import_kg2_csv or import_kg2_jsonl + """ + import igraph as ig + if low_memory: + edge_list = ({'s': str(v[0]), 't': str(v[1])} for v in edges.keys()) + del edges + else: + if 'use_edge_properties' in kwargs and kwargs['use_edge_properties'] == True: + # igraph doesn't allow lists as edge properties, so we are converting them to a string. + for v, e in edges.items(): + for key, value in e.copy().items(): + if isinstance(value, list) or isinstance(value, dict): + e[key] = str(value) + edge_list = ({'source': str(v[0]), 'target': str(v[1]), **e} for v, e in edges.items()) + else: + edge_list = ({'source': str(v[0]), 'target': str(v[1]), 'type': edge_types[e]} for v, e in edges.items()) + if verbose: + print('creating node list') + # set node attributes + # convert the node id to a string, bc + if low_memory: + node_list = ({ + 'name': str(n[0]), + 'feature_name': n[1], + 'category': node_types[n[2]], + 'identifier': n[3], + } for n in nodes) + del nodes + else: + node_list = ({ + 'name': str(n[0]), + 'feature_name': n[1], + 'category': node_types[n[2]], + 'identifier': n[3], + 'source': n[4], + } for n in nodes) + if verbose: + print('calling igraph.Graph.DictList') + if low_memory: + graph = ig.Graph.DictList(node_list, edge_list, directed=directed, + edge_foreign_keys=('s', 't'), + iterative=False) + else: + graph = ig.Graph.DictList(node_list, edge_list, directed=directed, + edge_foreign_keys=('source', 'target'), + iterative=False) + return graph + + +def load_kg2_networkx(filename='spoke.csv', edges_to_include=None, remove_unused_nodes=True, directed=False, **kwargs): + import networkx as nx + if filename.endswith('.csv') or filename.endswith('.csv.gz') or filename.endswith('.tsv') or filename.endswith('.tsv.gz'): + nodes, edges, node_types, edge_types = import_kg2_csv(filename, edges_to_include, remove_unused_nodes, reindex_edges=False, **kwargs) + elif filename.endswith('.json') or filename.endswith('.json.gz') or filename.endswith('.jsonl') or filename.endswith('.jsonl.gz'): + nodes, edges, node_types, edge_types = import_kg2_jsonl(filename, edges_to_include, remove_unused_nodes, reindex_edges=False, **kwargs) + else: + raise Exception('Filename should be a csv, tsv, json, or jsonl.') + edge_list = edges.keys() + if 
+        graph = nx.from_edgelist(edge_list, nx.DiGraph)
+    else:
+        graph = nx.from_edgelist(edge_list)
+    # set node attributes
+    node_attributes = {}
+    # TODO: get all IDs, not just names
+    for n in nodes:
+        node_attributes[n[0]] = {
+            'name': n[1],
+            'category': node_types[n[2]],
+            'identifier': n[3],
+            'source': n[4],
+        }
+    nx.set_node_attributes(graph, node_attributes)
+    # set edge attributes
+    edge_attributes = {k: {'type': edge_types[v]} for k, v in edges.items()}
+    nx.set_edge_attributes(graph, edge_attributes)
+    return graph
+
+
+def load_kg2_igraph(filename='graph.jsonl.gz', edge_filename=None, edges_to_include=None, remove_unused_nodes=True, directed=False, verbose=False, low_memory=False, **kwargs):
+    """
+    Imports the file as an igraph. The file can be a json/jsonl export from neo4j, and it can be gzipped. The node IDs are converted to strings because igraph is very slow if the ids are ints.
+    """
+    if low_memory:
+        kwargs['use_edge_properties'] = False
+    if filename.endswith('.csv') or filename.endswith('.csv.gz') or filename.endswith('.tsv') or filename.endswith('.tsv.gz'):
+        nodes, edges, node_types, edge_types = import_kg2_csv(filename, edge_filename, edges_to_include, remove_unused_nodes, reindex_edges=False, **kwargs)
+    elif filename.endswith('.json') or filename.endswith('.json.gz') or filename.endswith('.jsonl') or filename.endswith('.jsonl.gz'):
+        nodes, edges, node_types, edge_types = import_kg2_jsonl(filename, edge_filename, edges_to_include, remove_unused_nodes, reindex_edges=False, **kwargs)
+    else:
+        raise Exception('Filename should be a csv, tsv, json, or jsonl.')
+    if verbose:
+        print('Done loading data, creating edge list')
+    # uses igraph.Graph.DictList under the hood
+    return load_kg2_igraph_from_data(nodes, edges, node_types, edge_types, remove_unused_nodes, directed, verbose, low_memory, **kwargs)
+
+
+def symmetrize_matrix(matrix):
+    """
+    Symmetrizes an adjacency matrix.
+
+    Warning: this completely destroys any meaning applied to edge values. Nonzero = edge exists, zero = edge doesn't exist.
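+
+    The result is nonzero at (i, j) iff the input has an edge i->j or j->i; self-loop values are multiply counted.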
+ """ + lower_triangle = sparse.tril(matrix) + upper_triangle = sparse.triu(matrix) + return lower_triangle + lower_triangle.T + upper_triangle + upper_triangle.T + diff --git a/notebooks/Pathfinder_new.ipynb b/notebooks/Pathfinder_new.ipynb new file mode 100644 index 0000000..67512d1 --- /dev/null +++ b/notebooks/Pathfinder_new.ipynb @@ -0,0 +1,375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "34f3d34d-afbe-4d33-aecf-92ff0a7e4d34", + "metadata": {}, + "source": [ + "# Pathfinder\n", + "\n", + "2026-04-23" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4ded6547-7067-4763-b352-46eea2905833", + "metadata": {}, + "outputs": [], + "source": [ + "from TCT import TCT\n", + "\n", + "from TCT import name_resolver\n", + "from TCT import translator_metakg\n", + "from TCT import translator_kpinfo\n", + "from TCT import translator_query\n", + "from TCT import TCT_pathfinder" + ] + }, + { + "cell_type": "markdown", + "id": "14844e8e-c4f2-49a3-a5a1-2b03388d4550", + "metadata": {}, + "source": [ + "## Get Meta-KGs" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5f0dab0c-6376-41da-80df-450a07c263b2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping server without x-maturity: {'url': '/sipr'}\n" + ] + } + ], + "source": [ + "APInames, metaKG, Translator_KP_info= translator_metakg.load_translator_resources()\n", + "\n", + "All_predicates = list(set(metaKG['Predicate']))\n", + "All_categories = list((set(list(set(metaKG['Subject']))+list(set(metaKG['Object'])))))\n", + "API_withMetaKG = list(set(metaKG['API']))\n", + "\n", + " # generate a dictionary of API and its predicates\n", + "API_predicates = {}\n", + "for api in API_withMetaKG:\n", + " API_predicates[api] = list(set(metaKG[metaKG['API'] == api]['Predicate']))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7dea58cb-ec97-4f76-b62b-824e22940ac8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(22568, 5)\n" + ] + } + ], + "source": [ + "# select a list of APIs to use and a list of predicates to use\n", + "selected_APIlist = []\n", + "\n", + "if len(selected_APIlist) == 0:\n", + " select_APIs = APInames\n", + "else:\n", + " select_APIs = {k: APInames[k] for k in selected_APIlist if k in APInames}\n", + "\n", + "selected_metaKG = metaKG[metaKG['API'].isin(select_APIs.keys())]\n", + "#print(select_APIs)\n", + "print(selected_metaKG.shape)\n" + ] + }, + { + "cell_type": "markdown", + "id": "980e05c9-d0ff-46bb-8cf7-28b19491de00", + "metadata": {}, + "source": [ + "## Run New Pathfinder" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0396121a-c78f-4cd1-aba5-34248eefa5c7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MONDO:0100096\n", + "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n", + "CATRAX BigGIM DrugResponse Performance Phase KP - TRAPI 1.5.0: Success!\n", + "Automat-genome-alliance(Trapi v1.5.0): Success!\n", + "RTX KG2 - TRAPI 1.5.0: Success!\n", + "Automat-cam-kp(Trapi v1.5.0): Success!\n", + "Automat-hetionet(Trapi v1.5.0): Success!\n", + "Automat-robokop(Trapi v1.5.0): Success!\n", + "BioThings Explorer (BTE) TRAPI: Success!\n", + "CATRAX Pharmacogenomics KP - TRAPI 1.5.0: Success!\n", + "Clinical Trials KP - TRAPI 1.5.0: Success!\n", + "Microbiome KP - TRAPI 1.5.0: Success!\n", + "RTX KG2 - TRAPI 1.5.0: Success!\n", + "ENSEMBL:ENSP00000423463: no preferred name\n", + 
"ENSEMBL:ENSP00000423463: no preferred name\n", + "UniProtKB:P01308-1: no preferred name\n", + "UniProtKB:P01308-1: no preferred name\n", + "UniProtKB:P12544-1: no preferred name\n", + "UniProtKB:P12544-1: no preferred name\n", + "NodeNorm does not know about these identifiers: NCIT:C16612,UniProtKB:P05113-1,UniProtKB:P05113-1\n", + "NodeNorm does not know about these identifiers: UMLS:C5911035,UMLS:C5943250,UMLS:C5959582,UMLS:C5908975,UMLS:C5926549,UMLS:C5943245,CHEMBL.TARGET:CHEMBL1697664,CHEMBL.TARGET:CHEMBL6188,CHEMBL.TARGET:CHEMBL4295579,CHEMBL.TARGET:CHEMBL1075031,CHEMBL.TARGET:CHEMBL4295609,CHEMBL.TARGET:CHEMBL1287621,CHEMBL.TARGET:CHEMBL3499,CHEMBL.TARGET:CHEMBL4295625,CHEMBL.TARGET:CHEMBL4295542,CHEMBL.TARGET:CHEMBL1741198,CHEMBL.TARGET:CHEMBL3309030,CHEMBL.TARGET:CHEMBL4882,CHEMBL.TARGET:CHEMBL3309037\n", + "Number of possible paths: 680\n" + ] + } + ], + "source": [ + "result1, result2, formatted_output, paths = TCT_pathfinder.pathfinder(input_node1_id='NCBIGene:3458', #IFNG\n", + " input_node2_id= 'MONDO:0100096', #COVID-19\n", + " intermediate_categories=['biolink:Protein', 'biolink:Gene' ], \n", + " APInames=select_APIs, \n", + " metaKG=selected_metaKG, \n", + " API_predicates=API_predicates)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c1e3f672-a900-4e45-b197-8874a8f44c32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
scoreoutput_nodepredicates1predicates2output_node_name
PDCD10.133333NCBIGene:5133biolink:interacts_with; biolink:affectsbiolink:related_to; biolink:gene_associated_wi...PDCD1
TNF0.133333NCBIGene:7124biolink:genetically_interacts_with; biolink:in...biolink:gene_associated_with_condition; biolin...TNF
IL130.133333NCBIGene:3596biolink:interacts_with; biolink:affectsbiolink:related_to; biolink:gene_associated_wi...IL13
NLRP30.133333NCBIGene:114548biolink:affects; biolink:physically_interacts_...biolink:related_to; biolink:gene_associated_wi...NLRP3
IL17A0.133333NCBIGene:3605biolink:interacts_with; biolink:affectsbiolink:related_to; biolink:gene_associated_wi...IL17A
..................
LIF0.033333NCBIGene:3976biolink:affectsbiolink:gene_associated_with_conditionLIF
FAS0.033333NCBIGene:355biolink:regulates; biolink:affectsbiolink:gene_associated_with_conditionFAS
MEFV0.033333NCBIGene:4210biolink:affectsbiolink:gene_associated_with_conditionMEFV
PXDNL0.033333NCBIGene:137902biolink:interacts_withbiolink:gene_associated_with_conditionPXDNL
THBD0.033333NCBIGene:7056biolink:affects; biolink:physically_interacts_...biolink:gene_associated_with_conditionTHBD
\n", + "

680 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " score output_node \\\n", + "PDCD1 0.133333 NCBIGene:5133 \n", + "TNF 0.133333 NCBIGene:7124 \n", + "IL13 0.133333 NCBIGene:3596 \n", + "NLRP3 0.133333 NCBIGene:114548 \n", + "IL17A 0.133333 NCBIGene:3605 \n", + "... ... ... \n", + "LIF 0.033333 NCBIGene:3976 \n", + "FAS 0.033333 NCBIGene:355 \n", + "MEFV 0.033333 NCBIGene:4210 \n", + "PXDNL 0.033333 NCBIGene:137902 \n", + "THBD 0.033333 NCBIGene:7056 \n", + "\n", + " predicates1 \\\n", + "PDCD1 biolink:interacts_with; biolink:affects \n", + "TNF biolink:genetically_interacts_with; biolink:in... \n", + "IL13 biolink:interacts_with; biolink:affects \n", + "NLRP3 biolink:affects; biolink:physically_interacts_... \n", + "IL17A biolink:interacts_with; biolink:affects \n", + "... ... \n", + "LIF biolink:affects \n", + "FAS biolink:regulates; biolink:affects \n", + "MEFV biolink:affects \n", + "PXDNL biolink:interacts_with \n", + "THBD biolink:affects; biolink:physically_interacts_... \n", + "\n", + " predicates2 output_node_name \n", + "PDCD1 biolink:related_to; biolink:gene_associated_wi... PDCD1 \n", + "TNF biolink:gene_associated_with_condition; biolin... TNF \n", + "IL13 biolink:related_to; biolink:gene_associated_wi... IL13 \n", + "NLRP3 biolink:related_to; biolink:gene_associated_wi... NLRP3 \n", + "IL17A biolink:related_to; biolink:gene_associated_wi... IL17A \n", + "... ... ... \n", + "LIF biolink:gene_associated_with_condition LIF \n", + "FAS biolink:gene_associated_with_condition FAS \n", + "MEFV biolink:gene_associated_with_condition MEFV \n", + "PXDNL biolink:gene_associated_with_condition PXDNL \n", + "THBD biolink:gene_associated_with_condition THBD \n", + "\n", + "[680 rows x 5 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paths" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fcfb4342-ed1b-401d-8ffa-862978f0c275", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['query_graph', 'knowledge_graph', 'results', 'auxiliary_graphs'])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "formatted_output.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9847dbb5-0d63-496b-8969-4dd330f3897c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index a141bef..63498a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,9 @@ dependencies = [ "fastmcp>=2.12.2", "click>=8.2.1", 'networkx', - 'pyvis' + 'igraph', + 'pyvis', + 'zstandard', ] [project.scripts] diff --git a/requirements.txt b/requirements.txt index fcae53b..bab518f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,5 @@ openai ipykernel PyYAML ipywidgets +zstandard +igraph