Cluster the cluster data #18
Changes from all commits

importers/djornl/parser.py
```diff
@@ -14,11 +14,14 @@
 class DJORNL_Parser(object):

-    def config(self):
+    def config(self, value):
         if not hasattr(self, '_config'):
-            return self._configure()
+            self._configure()

-        return self._config
+        if value not in self._config:
+            raise KeyError(f'No such config value: {value}')
+
+        return self._config[value]

     def _configure(self):
```
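`config` is now a checked accessor for a single value rather than a getter for the whole dict. A minimal usage sketch of the new signature (assuming the environment is configured as in the test suite; `_EDGE_PATH` comes from this diff, while `_NO_SUCH_KEY` is a hypothetical key for illustration):

```python
parser = DJORNL_Parser()

# _configure() still runs lazily on first access
edge_path = parser.config('_EDGE_PATH')

# unknown keys now fail fast with a clear message
try:
    parser.config('_NO_SUCH_KEY')
except KeyError as err:
    print(err)  # 'No such config value: _NO_SUCH_KEY'
```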
```diff
@@ -43,15 +46,15 @@ def _configure(self):
         _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
         configuration['_CLUSTER_PATHS'] = {
-            'cluster_I2': os.path.join(
+            'markov_i2': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
             ),
-            'cluster_I4': os.path.join(
+            'markov_i4': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
             ),
-            'cluster_I6': os.path.join(
+            'markov_i6': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
             ),
@@ -74,10 +77,10 @@ def load_edges(self):
         # dict of nodes, indexed by node ID (node1 and node2 from the file)
         node_ix = {}
         edges = []
-        node_name = self.config()['_NODE_NAME']
-        expected_col_count = self.config()['_EDGE_FILE_COL_COUNT']
+        node_name = self.config('_NODE_NAME')
+        expected_col_count = self.config('_EDGE_FILE_COL_COUNT')

-        with open(self.config()['_EDGE_PATH']) as fd:
+        with open(self.config('_EDGE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter='\t')
             next(csv_reader, None)  # skip headers
             line_no = 1
@@ -102,6 +105,7 @@ def load_edges(self):
                     'score': float(cols[2]),
                     'edge_type': edge_remap[edge_type],
                 })
+
         return {
             'nodes': [{'_key': n} for n in node_ix.keys()],
             'edges': edges,
@@ -111,8 +115,9 @@ def load_node_metadata(self):
         """Load node metadata"""

         nodes = []
-        expected_col_count = self.config()['_NODE_FILE_COL_COUNT']
-        with open(self.config()['_NODE_PATH']) as fd:
+        valid_node_types = ['gene', 'pheno']
+        expected_col_count = self.config('_NODE_FILE_COL_COUNT')
+        with open(self.config('_NODE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter=',')
             next(csv_reader, None)  # skip headers
             line_no = 1
@@ -126,7 +131,7 @@ def load_node_metadata(self):
                 _key = cols[0]
                 node_type = cols[1]
-                if node_type != 'gene' and node_type != 'pheno':
+                if node_type not in valid_node_types:
                     raise RuntimeError(f"line {line_no}: invalid node type: {node_type}")

                 go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else []
```
```diff
@@ -154,40 +159,53 @@ def load_node_metadata(self):
                     'user_notes': cols[19],
                 }
                 nodes.append(doc)

         return {'nodes': nodes}

     def load_cluster_data(self):
         """Annotate genes with cluster ID fields."""
-        nodes = []
-        cluster_paths = self.config()['_CLUSTER_PATHS']
+        # index of nodes
+        node_ix = {}

+        cluster_paths = self.config('_CLUSTER_PATHS')
         for (cluster_label, path) in cluster_paths.items():
             with open(path) as fd:
                 csv_reader = csv.reader(fd, delimiter='\t')
                 for row in csv_reader:
                     if len(row) > 1:
-                        # remove the 'Cluster' text
-                        cluster_id = row[0].replace('Cluster', '')
-                        gene_keys = row[1:]
-                        nodes += [
-                            {'_key': key, cluster_label: int(cluster_id)}
-                            for key in gene_keys
-                        ]
+                        # remove the 'Cluster' text and replace it with cluster_label
+                        cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
+
+                        node_keys = row[1:]
+                        for key in node_keys:
+                            if key not in node_ix:
+                                node_ix[key] = [cluster_id]
+                            elif cluster_id not in node_ix[key]:
+                                node_ix[key].append(cluster_id)

+        # gather a list of cluster IDs for each node
+        nodes = [{
+            '_key': key,
+            'clusters': cluster_data
+        } for (key, cluster_data) in node_ix.items()]

         return {'nodes': nodes}
```
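To make the new shape concrete, here is a minimal sketch of the accumulation logic on invented input rows (the `ClusterN` row format is taken from the parser; the gene keys and cluster numbers are made up):

```python
node_ix = {}
cluster_files = {  # invented stand-ins for the parsed TSV rows
    'markov_i2': [['Cluster5', 'AT1G01010', 'AT1G01020']],
    'markov_i6': [['Cluster3', 'AT1G01010']],
}
for cluster_label, rows in cluster_files.items():
    for row in rows:
        # 'Cluster5' under 'markov_i2' becomes the namespaced ID 'markov_i2:5'
        cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
        for key in row[1:]:
            node_ix.setdefault(key, [])
            if cluster_id not in node_ix[key]:
                node_ix[key].append(cluster_id)

print(node_ix)
# {'AT1G01010': ['markov_i2:5', 'markov_i6:3'], 'AT1G01020': ['markov_i2:5']}
```

Each node now carries a single deduplicated `clusters` array instead of one integer field per clustering system, so a node that appears in several clusterings is merged into one document.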
```diff
     def save_dataset(self, dataset):

         if 'nodes' in dataset and len(dataset['nodes']) > 0:
-            self.save_docs(self.config()['_NODE_NAME'], dataset['nodes'])
+            self.save_docs(self.config('_NODE_NAME'), dataset['nodes'])

         if 'edges' in dataset and len(dataset['edges']) > 0:
-            self.save_docs(self.config()['_EDGE_NAME'], dataset['edges'])
+            self.save_docs(self.config('_EDGE_NAME'), dataset['edges'])

     def save_docs(self, coll_name, docs, on_dupe='update'):

         resp = requests.put(
-            self.config()['API_URL'] + '/api/v1/documents',
+            self.config('API_URL') + '/api/v1/documents',
             params={'collection': coll_name, 'on_duplicate': on_dupe},
-            headers={'Authorization': self.config()['AUTH_TOKEN']},
+            headers={'Authorization': self.config('AUTH_TOKEN')},
             data='\n'.join(json.dumps(d) for d in docs)
         )
         if not resp.ok:
```
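For reference, `save_docs` sends newline-delimited JSON, one document per line. A rough sketch of the equivalent request (the URL, token, and documents are placeholders, not values from this repo; only the `djornl_node` collection name and the endpoint come from this diff):

```python
import json
import requests

docs = [
    {'_key': 'AT1G01010', 'clusters': ['markov_i2:5']},
    {'_key': 'AT1G01020', 'clusters': ['markov_i2:5']},
]

resp = requests.put(
    'http://localhost:5000/api/v1/documents',  # placeholder for config('API_URL')
    params={'collection': 'djornl_node', 'on_duplicate': 'update'},
    headers={'Authorization': 'admin_token'},  # placeholder for config('AUTH_TOKEN')
    data='\n'.join(json.dumps(d) for d in docs),
)
```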
```diff
@@ -202,3 +220,32 @@ def load_data(self):
         self.save_dataset(self.load_edges())
         self.save_dataset(self.load_node_metadata())
         self.save_dataset(self.load_cluster_data())
+
+    def check_data_delta(self):
+        edge_data = self.load_edges()
+        node_metadata = self.load_node_metadata()
+        clusters = self.load_cluster_data()
+
+        self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters)
+
+    def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}):
```
Author (collaborator) comment: brief dataset summary for sanity checking
```diff
+        edge_nodes = set([e['_key'] for e in edge_data['nodes']])
+        node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']])
+        cluster_nodes = set([e['_key'] for e in cluster_data['nodes']])
+        all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes)
+
+        # check all nodes in cluster_data have node_metadata
+        clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes)
+        if clstr_no_node_md_set:
+            print({'clusters with no node metadata': clstr_no_node_md_set})
+
+        # check all nodes in the edge_data have node_metadata
+        edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes)
+        if edge_no_node_md_set:
+            print({'edges with no node metadata': edge_no_node_md_set})
+
+        # count all edges
+        print("Dataset contains " + str(len(edge_data['edges'])) + " edges")
+        # count all nodes
+        print("Dataset contains " + str(len(all_nodes)) + " nodes")
```
DJORNL parser tests (spec/test)

```diff
@@ -9,7 +9,6 @@
 import os

 from importers.djornl.parser import DJORNL_Parser

 from spec.test.helpers import modified_environ

 _TEST_DIR = '/app/spec/test'
@@ -24,14 +23,29 @@ def setUpClass(cls):
         with open(results_file) as fh:
             cls.json_data = json.load(fh)

         cls.maxDiff = None

     def init_parser_with_path(self, root_path):

         with modified_environ(RES_ROOT_DATA_PATH=root_path):
             parser = DJORNL_Parser()
             # ensure that the configuration has been set
-            parser.config()
+            parser._configure()
         return parser

+    def test_load_invalid_file(self):
+        """ test loading when what is supposed to be a file is actually a directory """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file')
+
+        # edges: directory, not a file
+        edges_file_path = os.path.join(RES_ROOT_DATA_PATH, "merged_edges-AMW-060820_AF.tsv")
+        err_str = f"Is a directory: '{edges_file_path}'"
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        with self.assertRaisesRegex(IsADirectoryError, err_str):
+            parser.load_edges()

     def test_load_empty_files(self):
         """ test loading files containing no data """
```
```diff
@@ -100,33 +114,38 @@ def test_load_valid_edge_data(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)

         self.maxDiff = None

         edge_data = parser.load_edges()
-        self.assertEqual(
-            edge_data,
-            self.json_data["load_edges"]
-        )
+        expected = self.json_data["load_edges"]
+
+        for data_structure in [edge_data, expected]:
+            for k in data_structure.keys():
+                data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key'])
+
+        self.assertEqual(edge_data, expected)
```

Author (collaborator) comment on lines +120 to +122: order data as it won't necessarily be sorted when coming out of the parser
```diff
     def test_load_valid_node_metadata(self):

         self.maxDiff = None
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)

         node_metadata = parser.load_node_metadata()
-        self.assertEqual(
-            node_metadata,
-            self.json_data["load_node_metadata"]
-        )
+        expected = self.json_data["load_node_metadata"]
+
+        for data_structure in [node_metadata, expected]:
+            for k in data_structure.keys():
+                data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key'])
+                data_structure[k] = [n['_key'] for n in data_structure[k]]
+
+        self.assertEqual(node_metadata, expected)

     def test_load_valid_cluster_data(self):

         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)

         cluster_data = parser.load_cluster_data()
         self.assertEqual(
             cluster_data,
             self.json_data["load_cluster_data"]
         )
+
+        parser.check_data_delta()
```
djornl_node collection schema

```diff
@@ -2,6 +2,10 @@ name: djornl_node
 type: vertex
 delta: false

+indexes:
+  - type: hash
+    fields: ["clusters[*]"]

 schema:
   "$schema": http://json-schema.org/draft-07/schema#
   title: Gene and Phenotype Vertices
@@ -13,21 +17,15 @@ schema:
       type: string
       title: Key
       examples: ["AT1G01010"]
-    cluster_I2:
-      type: integer
-      title: Cluster 2 ID
-      description: Iterative random forest cluster group ID
-      examples: [1]
-    cluster_I4:
-      type: integer
-      title: Cluster 4 ID
-      description: Iterative random forest cluster group ID
-      examples: [13]
-    cluster_I6:
-      type: integer
-      title: Cluster 6 ID
-      description: Iterative random forest cluster group ID
-      examples: [27]
+    clusters:
+      type: array
+      title: Clusters
+      description: Clusters to which the node has been assigned
+      items:
+        type: string
+        format: regex
+        pattern: ^\w+:\d+$
+      examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]]
```
Author (collaborator) comment on lines +20 to +28: The important bit
```diff
     node_type:
       type: string
       title: Node type
```
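A node document under the new schema looks like the sketch below; the values are invented, and the check mirrors the schema's `^\w+:\d+$` pattern:

```python
import re

node = {
    '_key': 'AT1G01010',
    'node_type': 'gene',
    'clusters': ['markov_i2:1', 'markov_i4:5'],
}

# each entry must be 'clustering_system_name:cluster_id', per the schema pattern
assert all(re.fullmatch(r'\w+:\d+', c) for c in node['clusters'])
```

The new `indexes` entry uses ArangoDB's array-index syntax (`clusters[*]`), so filters against individual `clusters` entries can be served by the hash index rather than a collection scan.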
djornl_fetch_clusters stored query

```diff
@@ -2,25 +2,13 @@ name: djornl_fetch_clusters
 description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes.
 params:
   type: object
+  required: [cluster_ids]
   properties:
-    cluster_i2_ids:
-      title: Cluster I2 IDs
-      description: Cluster I2 IDs to locate
-      items: {type: integer}
-      default: []
-      examples: [[1], [3, 5]]
-    cluster_i4_ids:
-      title: Cluster I4 IDs
-      description: Cluster I4 IDs to locate
-      items: {type: integer}
-      examples: [[2], [4, 6]]
-      default: []
-    cluster_i6_ids:
-      title: Cluster I6 IDs
-      description: Cluster I6 IDs to locate
-      items: {type: integer}
-      examples: [[666], [999, 333]]
-      default: []
+    cluster_ids:
+      title: Cluster IDs
+      description: Cluster IDs, in the form "clustering_system_name:cluster_id"
+      items: {type: string}
+      examples: [['markov_i2:5', 'markov_i6:2'], ['markov_i6:1']]
```
Contributor comment: Should this be an object so we don't have to parse these entries?

Contributor comment: I guess if the client is using string parameters like "markov_i2:5" then it doesn't matter
```diff
     distance:
       type: integer
       title: Traversal Distance
@@ -31,7 +19,7 @@ params:
 query: |
   LET node_ids = (
     FOR n IN djornl_node
-      FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids
+      FILTER n.clusters ANY IN @cluster_ids
       FOR node IN 0..@distance ANY n djornl_edge
         OPTIONS {bfs: true, uniqueVertices: "global"}
         RETURN DISTINCT node._id
```
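In AQL, `ANY IN` is an array comparison operator: it is true when at least one element of the left-hand array appears in the right-hand array, so this single filter replaces the three per-clustering clauses. A rough Python equivalent (node and bind values invented for illustration):

```python
cluster_ids = ['markov_i2:5', 'markov_i6:2']  # stands in for @cluster_ids

node = {'_key': 'AT1G01010', 'clusters': ['markov_i2:5', 'markov_i4:1']}

# FILTER n.clusters ANY IN @cluster_ids
matches = any(c in cluster_ids for c in node.get('clusters', []))
print(matches)  # True: 'markov_i2:5' appears in both lists
```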
Reviewer comment: rename these to something more informative