diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py index 5f09ea68..771b359a 100644 --- a/importers/djornl/parser.py +++ b/importers/djornl/parser.py @@ -14,11 +14,14 @@ class DJORNL_Parser(object): - def config(self): + def config(self, value): if not hasattr(self, '_config'): - return self._configure() + self._configure() - return self._config + if value not in self._config: + raise KeyError(f'No such config value: {value}') + + return self._config[value] def _configure(self): @@ -43,15 +46,15 @@ def _configure(self): _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data') configuration['_CLUSTER_PATHS'] = { - 'cluster_I2': os.path.join( + 'markov_i2': os.path.join( _CLUSTER_BASE, 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv' ), - 'cluster_I4': os.path.join( + 'markov_i4': os.path.join( _CLUSTER_BASE, 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv' ), - 'cluster_I6': os.path.join( + 'markov_i6': os.path.join( _CLUSTER_BASE, 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv' ), @@ -74,10 +77,10 @@ def load_edges(self): # dict of nodes, indexed by node ID (node1 and node2 from the file) node_ix = {} edges = [] - node_name = self.config()['_NODE_NAME'] - expected_col_count = self.config()['_EDGE_FILE_COL_COUNT'] + node_name = self.config('_NODE_NAME') + expected_col_count = self.config('_EDGE_FILE_COL_COUNT') - with open(self.config()['_EDGE_PATH']) as fd: + with open(self.config('_EDGE_PATH')) as fd: csv_reader = csv.reader(fd, delimiter='\t') next(csv_reader, None) # skip headers line_no = 1 @@ -102,6 +105,7 @@ def load_edges(self): 'score': float(cols[2]), 'edge_type': edge_remap[edge_type], }) + return { 'nodes': [{'_key': n} for n in node_ix.keys()], 'edges': edges, @@ -111,8 +115,9 @@ def load_node_metadata(self): """Load node metadata""" nodes = [] - expected_col_count = self.config()['_NODE_FILE_COL_COUNT'] - with open(self.config()['_NODE_PATH']) as fd: + valid_node_types = ['gene', 'pheno'] + expected_col_count = self.config('_NODE_FILE_COL_COUNT') + with open(self.config('_NODE_PATH')) as fd: csv_reader = csv.reader(fd, delimiter=',') next(csv_reader, None) # skip headers line_no = 1 @@ -126,7 +131,7 @@ def load_node_metadata(self): _key = cols[0] node_type = cols[1] - if node_type != 'gene' and node_type != 'pheno': + if node_type not in valid_node_types: raise RuntimeError(f"line {line_no}: invalid node type: {node_type}") go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else [] @@ -154,40 +159,53 @@ def load_node_metadata(self): 'user_notes': cols[19], } nodes.append(doc) + return {'nodes': nodes} def load_cluster_data(self): """Annotate genes with cluster ID fields.""" - nodes = [] - cluster_paths = self.config()['_CLUSTER_PATHS'] + + # index of nodes + node_ix = {} + + cluster_paths = self.config('_CLUSTER_PATHS') for (cluster_label, path) in cluster_paths.items(): with open(path) as fd: csv_reader = csv.reader(fd, delimiter='\t') for row in csv_reader: if len(row) > 1: - # remove the 'Cluster' text - cluster_id = row[0].replace('Cluster', '') - gene_keys = row[1:] - nodes += [ - {'_key': key, cluster_label: int(cluster_id)} - for key in gene_keys - ] + # remove the 'Cluster' text and replace it with cluster_label + cluster_id = cluster_label + ':' + row[0].replace('Cluster', '') + + node_keys = row[1:] + for key in node_keys: + if key not in node_ix: + node_ix[key] = [cluster_id] + elif cluster_id not in node_ix[key]: + node_ix[key].append(cluster_id) + + # gather a list of cluster IDs for each node + nodes = [{ + '_key': key, + 'clusters': cluster_data + } for (key, cluster_data) in node_ix.items()] + return {'nodes': nodes} def save_dataset(self, dataset): if 'nodes' in dataset and len(dataset['nodes']) > 0: - self.save_docs(self.config()['_NODE_NAME'], dataset['nodes']) + self.save_docs(self.config('_NODE_NAME'), dataset['nodes']) if 'edges' in dataset and len(dataset['edges']) > 0: - self.save_docs(self.config()['_EDGE_NAME'], dataset['edges']) + self.save_docs(self.config('_EDGE_NAME'), dataset['edges']) def save_docs(self, coll_name, docs, on_dupe='update'): resp = requests.put( - self.config()['API_URL'] + '/api/v1/documents', + self.config('API_URL') + '/api/v1/documents', params={'collection': coll_name, 'on_duplicate': on_dupe}, - headers={'Authorization': self.config()['AUTH_TOKEN']}, + headers={'Authorization': self.config('AUTH_TOKEN')}, data='\n'.join(json.dumps(d) for d in docs) ) if not resp.ok: @@ -202,3 +220,32 @@ def load_data(self): self.save_dataset(self.load_edges()) self.save_dataset(self.load_node_metadata()) self.save_dataset(self.load_cluster_data()) + + def check_data_delta(self): + edge_data = self.load_edges() + node_metadata = self.load_node_metadata() + clusters = self.load_cluster_data() + + self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters) + + def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}): + + edge_nodes = set([e['_key'] for e in edge_data['nodes']]) + node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']]) + cluster_nodes = set([e['_key'] for e in cluster_data['nodes']]) + all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes) + + # check all nodes in cluster_data have node_metadata + clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes) + if clstr_no_node_md_set: + print({'clusters with no node metadata': clstr_no_node_md_set}) + + # check all nodes in the edge_data have node_metadata + edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes) + if edge_no_node_md_set: + print({'edges with no node metadata': edge_no_node_md_set}) + + # count all edges + print("Dataset contains " + str(len(edge_data['edges'])) + " edges") + # count all nodes + print("Dataset contains " + str(len(all_nodes)) + " nodes") diff --git a/importers/test/test_djornl_parser.py b/importers/test/test_djornl_parser.py index 71d43dbc..2ad9184e 100644 --- a/importers/test/test_djornl_parser.py +++ b/importers/test/test_djornl_parser.py @@ -9,7 +9,6 @@ import os from importers.djornl.parser import DJORNL_Parser - from spec.test.helpers import modified_environ _TEST_DIR = '/app/spec/test' @@ -24,14 +23,29 @@ def setUpClass(cls): with open(results_file) as fh: cls.json_data = json.load(fh) + cls.maxDiff = None + def init_parser_with_path(self, root_path): with modified_environ(RES_ROOT_DATA_PATH=root_path): parser = DJORNL_Parser() # ensure that the configuration has been set - parser.config() + parser._configure() return parser + def test_load_invalid_file(self): + """ test loading when what is supposed to be a file is actually a directory """ + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file') + + # edges: directory, not a file + edges_file_path = os.path.join(RES_ROOT_DATA_PATH, "merged_edges-AMW-060820_AF.tsv") + err_str = f"Is a directory: '{edges_file_path}'" + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + with self.assertRaisesRegex(IsADirectoryError, err_str): + parser.load_edges() + def test_load_empty_files(self): """ test loading files containing no data """ @@ -100,33 +114,38 @@ def test_load_valid_edge_data(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - self.maxDiff = None - edge_data = parser.load_edges() - self.assertEqual( - edge_data, - self.json_data["load_edges"] - ) + expected = self.json_data["load_edges"] + + for data_structure in [edge_data, expected]: + for k in data_structure.keys(): + data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key']) + + self.assertEqual(edge_data, expected) def test_load_valid_node_metadata(self): - self.maxDiff = None RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) node_metadata = parser.load_node_metadata() - self.assertEqual( - node_metadata, - self.json_data["load_node_metadata"] - ) + expected = self.json_data["load_node_metadata"] + + for data_structure in [node_metadata, expected]: + for k in data_structure.keys(): + data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key']) + data_structure[k] = [n['_key'] for n in data_structure[k]] + + self.assertEqual(node_metadata, expected) def test_load_valid_cluster_data(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - cluster_data = parser.load_cluster_data() self.assertEqual( cluster_data, self.json_data["load_cluster_data"] ) + + parser.check_data_delta() diff --git a/spec/collections/djornl/djornl_node.yaml b/spec/collections/djornl/djornl_node.yaml index 9248f1c1..13b14541 100644 --- a/spec/collections/djornl/djornl_node.yaml +++ b/spec/collections/djornl/djornl_node.yaml @@ -2,6 +2,10 @@ name: djornl_node type: vertex delta: false +indexes: + - type: hash + fields: ["clusters[*]"] + schema: "$schema": http://json-schema.org/draft-07/schema# title: Gene and Phenotype Vertices @@ -13,21 +17,15 @@ schema: type: string title: Key examples: ["AT1G01010"] - cluster_I2: - type: integer - title: Cluster 2 ID - description: Iterative random forest cluster group ID - examples: [1] - cluster_I4: - type: integer - title: Cluster 4 ID - description: Iterative random forest cluster group ID - examples: [13] - cluster_I6: - type: integer - title: Cluster 6 ID - description: Iterative random forest cluster group ID - examples: [27] + clusters: + type: array + title: Clusters + description: Clusters to which the node has been assigned + items: + type: string + format: regex + pattern: ^\w+:\d+$ + examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]] node_type: type: string title: Node type diff --git a/spec/stored_queries/djornl/djornl_fetch_clusters.yaml b/spec/stored_queries/djornl/djornl_fetch_clusters.yaml index 4c6b8c50..1fadca36 100644 --- a/spec/stored_queries/djornl/djornl_fetch_clusters.yaml +++ b/spec/stored_queries/djornl/djornl_fetch_clusters.yaml @@ -2,25 +2,13 @@ name: djornl_fetch_clusters description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes. params: type: object + required: [cluster_ids] properties: - cluster_i2_ids: - title: Cluster I2 IDs - description: Cluster I2 IDs to locate - items: {type: integer} - default: [] - examples: [[1], [3, 5]] - cluster_i4_ids: - title: Cluster I4 IDs - description: Cluster I4 IDs to locate - items: {type: integer} - examples: [[2], [4, 6]] - default: [] - cluster_i6_ids: - title: Cluster I6 IDs - description: Cluster I6 IDs to locate - items: {type: integer} - examples: [[666], [999, 333]] - default: [] + cluster_ids: + title: Cluster IDs + description: Cluster IDs, in the form "clustering_system_name:cluster_id" + items: {type: string} + examples: [['markov_i2:5', 'markov_i6:2'],['markov_i6:1']] distance: type: integer title: Traversal Distance @@ -31,7 +19,7 @@ params: query: | LET node_ids = ( FOR n IN djornl_node - FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids + FILTER n.clusters ANY IN @cluster_ids FOR node IN 0..@distance ANY n djornl_edge OPTIONS {bfs: true, uniqueVertices: "global"} RETURN DISTINCT node._id diff --git a/spec/test/djornl/invalid_file/merged_edges-AMW-060820_AF.tsv/empty b/spec/test/djornl/invalid_file/merged_edges-AMW-060820_AF.tsv/empty new file mode 100644 index 00000000..e69de29b diff --git a/spec/test/djornl/results.json b/spec/test/djornl/results.json index a844c2c2..7fd3a4d5 100644 --- a/spec/test/djornl/results.json +++ b/spec/test/djornl/results.json @@ -27,21 +27,15 @@ }, "load_cluster_data": { "nodes": [ - {"_key": "AT1G01010", "cluster_I2": 1}, - {"_key": "AT1G01030", "cluster_I2": 1}, - {"_key": "AT1G01040", "cluster_I2": 1}, - {"_key": "AT1G01050", "cluster_I2": 2}, - {"_key": "AT1G01060", "cluster_I2": 2}, - {"_key": "AT1G01070", "cluster_I2": 2}, - {"_key": "AT1G01080", "cluster_I2": 3}, - {"_key": "AT1G01090", "cluster_I2": 3}, - {"_key": "AT1G01020", "cluster_I2": 5}, - {"_key": "AT1G01040", "cluster_I6": 1}, - {"_key": "AT1G01090", "cluster_I6": 1}, - {"_key": "AT1G01070", "cluster_I6": 2}, - {"_key": "AT1G01010", "cluster_I6": 3}, - {"_key": "AT1G01020", "cluster_I6": 3}, - {"_key": "AT1G01030", "cluster_I6": 3} + {"_key": "AT1G01010", "clusters": ["markov_i2:1", "markov_i6:3"]}, + {"_key": "AT1G01030", "clusters": ["markov_i2:1", "markov_i6:3"]}, + {"_key": "AT1G01040", "clusters": ["markov_i2:1", "markov_i6:1"]}, + {"_key": "AT1G01050", "clusters": ["markov_i2:2"]}, + {"_key": "AT1G01060", "clusters": ["markov_i2:2"]}, + {"_key": "AT1G01070", "clusters": ["markov_i2:2", "markov_i6:2"]}, + {"_key": "AT1G01080", "clusters": ["markov_i2:3"]}, + {"_key": "AT1G01090", "clusters": ["markov_i2:3", "markov_i6:1"]}, + {"_key": "AT1G01020", "clusters": ["markov_i2:5", "markov_i6:3"]} ] }, "load_node_metadata": { @@ -93,220 +87,265 @@ ] }, "fetch_genes": { - "AT1G01010": { - "0": { - "nodes": ["AT1G01010"], - "edges": [] + "keys": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "1": { - "nodes": [ - "AT1G01010", - "AT1G01020", - "AT1G01030", - "AT1G01040" - ], - "edges": [ - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5" - ] + "AT1G01010": { + "distance": { + "0": { + "nodes": ["AT1G01010"], + "edges": [] + }, + "1": { + "nodes": [ + "AT1G01010", + "AT1G01020", + "AT1G01030", + "AT1G01040" + ], + "edges": [ + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] - } - }, - "AT1G01020__AT1G01070": { - "0": { - "nodes": ["AT1G01020", "AT1G01070"], - "edges": [] - }, - "1": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3" - ] - }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] + "AT1G01020__AT1G01070": { + "distance": { + "0": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + }, + "1": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } } } }, "fetch_phenotypes": { - "As2": { - "0": { - "nodes": ["As2"], - "edges": [] - }, - "1": { - "nodes": ["As2", "AT1G01020", "AT1G01040"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4" - ] - }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] - } - }, - "As2__Na23": { - "0": { - "nodes": ["As2", "Na23"], - "edges": [] + "keys": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "1": { - "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4" - ] + "As2": { + "distance": { + "0": { + "nodes": ["As2"], + "edges": [] + }, + "1": { + "nodes": ["As2", "AT1G01020", "AT1G01040"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] + "As2__Na23": { + "distance": { + "0": { + "nodes": ["As2", "Na23"], + "edges": [] + }, + "1": { + "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } } } }, "search_nodes": { - "Mary Poppins": { - "0": {"nodes": [], "edges": []}, - "1": {"nodes": [], "edges": []}, - "5": {"nodes": [], "edges": []} - }, - "GO:0005515": { - "0": { - "nodes": ["AT1G01040", "AT1G01090"], - "edges": [] - }, - "1": { - "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01040__pheno_assn__5.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] + "search_text": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] + "GO:0005515": { + "distance": { + "0": { + "nodes": ["AT1G01040", "AT1G01090"], + "edges": [] + }, + "1": { + "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01040__pheno_assn__5.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + } + } } } }, "fetch_clusters": { - "i6-1": { - "0": { - "nodes": ["AT1G01040", "AT1G01090"], - "edges": [] - }, - "1": { - "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01040__pheno_assn__5.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] - }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] - } - }, - "i2-5__i6-2": { - "0": { - "nodes": ["AT1G01020", "AT1G01070"], - "edges": [] + "cluster_ids": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "1": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3" - ] + "markov_i6:1": { + "distance": { + "0": { + "nodes": ["AT1G01040", "AT1G01090"], + "edges": [] + }, + "1": { + "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01040__pheno_assn__5.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + } + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] + "markov_i2:5__markov_i6:2": { + "distance": { + "0": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + }, + "1": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } } } } diff --git a/spec/test/stored_queries/test_djornl.py b/spec/test/stored_queries/test_djornl.py index 98250993..8e492fbe 100644 --- a/spec/test/stored_queries/test_djornl.py +++ b/spec/test/stored_queries/test_djornl.py @@ -34,26 +34,28 @@ def setUpClass(cls): cls.json_data = json.load(fh) cls.no_results = {'nodes': [], 'edges': []} + cls.maxDiff = None # load the DB root_path = os.path.join(_TEST_DIR, 'djornl', 'test_data') with modified_environ(RES_ROOT_DATA_PATH=root_path): parser = DJORNL_Parser() - config = parser.config() + node_name = parser.config('_NODE_NAME') + edge_name = parser.config('_EDGE_NAME') edge_data = parser.load_edges() - r = create_test_docs(config['_NODE_NAME'], edge_data['nodes']) - print_db_update(r, config['_NODE_NAME']) - r = create_test_docs(config['_EDGE_NAME'], edge_data['edges']) - print_db_update(r, config['_EDGE_NAME']) + r = create_test_docs(node_name, edge_data['nodes']) + print_db_update(r, node_name) + r = create_test_docs(edge_name, edge_data['edges']) + print_db_update(r, edge_name) node_metadata = parser.load_node_metadata() - r = create_test_docs(config['_NODE_NAME'], node_metadata['nodes'], True) - print_db_update(r, config['_NODE_NAME']) + r = create_test_docs(node_name, node_metadata['nodes'], True) + print_db_update(r, node_name) cluster_data = parser.load_cluster_data() - r = create_test_docs(config['_NODE_NAME'], cluster_data['nodes'], True) - print_db_update(r, config['_NODE_NAME']) + r = create_test_docs(node_name, cluster_data['nodes'], True) + print_db_update(r, node_name) def submit_query(self, query_name, query_data={}): """submit a database query""" @@ -68,6 +70,7 @@ def check_expected_results(self, description, response, expected): if _VERBOSE: print("Running test " + description) + results = response['results'][0] self.assertEqual( set([n["_key"] for n in results['nodes']]), @@ -81,31 +84,45 @@ def check_expected_results(self, description, response, expected): def test_fetch_all(self): + response = self.submit_query('djornl_fetch_all') self.check_expected_results( "djornl_fetch_all", - self.submit_query('djornl_fetch_all'), + response, self.json_data['fetch_all'] ) + # ensure that all the cluster data is returned OK + node_data = response['results'][0]['nodes'] + expected_node_data = self.json_data['load_cluster_data']['nodes'] + self.assertEqual( + {n['_key']: n['clusters'] for n in node_data if 'clusters' in n}, + {n['_key']: n['clusters'] for n in expected_node_data if 'clusters' in n}, + ) + # indexing schema in results.json - # self.json_data[query][primary_param][distance_param] - # if primary_param is an array, join the array entities with "__" + # self.json_data[query_name][param_name][param_value]["distance"][distance_param] + # e.g. for fetch_clusters data: + # "fetch_clusters": { + # "cluster_ids": { + # "markov_i2:6__markov_i4:3": { + # "distance": { + # 1: { + # "nodes": [ node IDs ], + # "edges": [ edge data ], + # } + # } + # } + # } + # } + # if param_value is an array, join the array entities with "__" # results are in the form {"nodes": [...], "edges": [...]} # nodes are represented as a list of node[_key] # edges are objects with keys _to, _from, edge_type and score - def test_fetch_phenotypes_no_results(self): - - resp = self.submit_query('djornl_fetch_phenotypes', { - # gene node - "keys": ["AT1G01010"], - }) - self.assertEqual(resp['results'][0], self.no_results) - def test_fetch_phenotypes(self): - for fetch_args in self.json_data['fetch_phenotypes'].keys(): - for distance in self.json_data['fetch_phenotypes'][fetch_args].keys(): + for (fetch_args, key_data) in self.json_data['fetch_phenotypes']['keys'].items(): + for (distance, distance_data) in key_data['distance'].items(): resp = self.submit_query('djornl_fetch_phenotypes', { "keys": fetch_args.split('__'), "distance": int(distance), @@ -113,20 +130,13 @@ def test_fetch_phenotypes(self): self.check_expected_results( "fetch phenotypes with args " + fetch_args + " and distance " + distance, resp, - self.json_data['fetch_phenotypes'][fetch_args][distance] + distance_data ) - def test_fetch_genes_no_results(self): - resp = self.submit_query('djornl_fetch_genes', { - # phenotype node - "keys": ["As2"], - }) - self.assertEqual(resp['results'][0], self.no_results) - def test_fetch_genes(self): - for fetch_args in self.json_data['fetch_genes'].keys(): - for distance in self.json_data['fetch_genes'][fetch_args].keys(): + for (fetch_args, key_data) in self.json_data['fetch_genes']['keys'].items(): + for (distance, distance_data) in key_data['distance'].items(): resp = self.submit_query('djornl_fetch_genes', { "keys": fetch_args.split('__'), "distance": int(distance), @@ -134,49 +144,27 @@ def test_fetch_genes(self): self.check_expected_results( "fetch genes with args " + fetch_args + " and distance " + distance, resp, - self.json_data['fetch_genes'][fetch_args][distance] + distance_data ) - def test_fetch_clusters_no_results(self): - - resp = self.submit_query('djornl_fetch_clusters', { - 'cluster_i2_ids': [666], - 'cluster_i4_ids': [666], - 'cluster_i6_ids': [666], - }) - self.assertEqual(resp['results'][0], self.no_results) - def test_fetch_clusters(self): - for fetch_args in self.json_data['fetch_clusters'].keys(): - cluster_args = {} - for arg in fetch_args.split('__'): - [c_name, c_id] = arg.split('-', maxsplit=1) - if "cluster_" + c_name + "_ids" in cluster_args: - cluster_args["cluster_" + c_name + "_ids"] += int(c_id) - else: - cluster_args["cluster_" + c_name + "_ids"] = [int(c_id)] - - for distance in self.json_data['fetch_clusters'][fetch_args].keys(): - cluster_args['distance'] = int(distance) - resp = self.submit_query('djornl_fetch_clusters', cluster_args) + for (fetch_args, cluster_data) in self.json_data['fetch_clusters']['cluster_ids'].items(): + for (distance, distance_data) in cluster_data['distance'].items(): + resp = self.submit_query('djornl_fetch_clusters', { + "cluster_ids": fetch_args.split('__'), + "distance": int(distance), + }) self.check_expected_results( "fetch clusters with args " + fetch_args + " and distance " + distance, resp, - self.json_data['fetch_clusters'][fetch_args][distance] + distance_data ) - def test_search_nodes_no_results(self): - - resp = self.submit_query('djornl_search_nodes', { - "search_text": "Mary Poppins", - }) - self.assertEqual(resp['results'][0], self.no_results) - def test_search_nodes(self): - for search_text in self.json_data['search_nodes'].keys(): - for distance in self.json_data['search_nodes'][search_text].keys(): + for (search_text, search_data) in self.json_data['search_nodes']['search_text'].items(): + for (distance, distance_data) in search_data['distance'].items(): resp = self.submit_query('djornl_search_nodes', { "search_text": search_text, "distance": int(distance), @@ -184,5 +172,5 @@ def test_search_nodes(self): self.check_expected_results( "search nodes with args " + search_text + " and distance " + distance, resp, - self.json_data['search_nodes'][search_text][distance] + distance_data )