Cluster the cluster data #18
Changes from all commits

importers/djornl/parser.py
```diff
@@ -14,11 +14,14 @@
 class DJORNL_Parser(object):

-    def config(self):
+    def config(self, value):
         if not hasattr(self, '_config'):
-            return self._configure()
+            self._configure()

-        return self._config
+        if value not in self._config:
+            raise KeyError(f'No such config value: {value}')
+
+        return self._config[value]

     def _configure(self):
```
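`config` is now a checked accessor for a single value rather than a getter for the whole dict. A minimal usage sketch of the new signature (assuming the environment is configured as in the test suite; `_EDGE_PATH` comes from this diff, while `_NO_SUCH_KEY` is a hypothetical key for illustration):

```python
parser = DJORNL_Parser()

# _configure() still runs lazily on first access
edge_path = parser.config('_EDGE_PATH')

# unknown keys now fail fast with a clear message
try:
    parser.config('_NO_SUCH_KEY')
except KeyError as err:
    print(err)  # 'No such config value: _NO_SUCH_KEY'
```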
```diff
@@ -43,15 +46,15 @@ def _configure(self):
         _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
         configuration['_CLUSTER_PATHS'] = {
-            'cluster_I2': os.path.join(
+            'markov_i2': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
             ),
-            'cluster_I4': os.path.join(
+            'markov_i4': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
             ),
-            'cluster_I6': os.path.join(
+            'markov_i6': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
             ),
@@ -74,10 +77,10 @@ def load_edges(self):
         # dict of nodes, indexed by node ID (node1 and node2 from the file)
         node_ix = {}
         edges = []
-        node_name = self.config()['_NODE_NAME']
-        expected_col_count = self.config()['_EDGE_FILE_COL_COUNT']
+        node_name = self.config('_NODE_NAME')
+        expected_col_count = self.config('_EDGE_FILE_COL_COUNT')

-        with open(self.config()['_EDGE_PATH']) as fd:
+        with open(self.config('_EDGE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter='\t')
             next(csv_reader, None)  # skip headers
             line_no = 1
@@ -102,6 +105,7 @@ def load_edges(self):
                     'score': float(cols[2]),
                     'edge_type': edge_remap[edge_type],
                 })
+
         return {
             'nodes': [{'_key': n} for n in node_ix.keys()],
             'edges': edges,
@@ -111,8 +115,9 @@ def load_node_metadata(self):
         """Load node metadata"""

         nodes = []
-        expected_col_count = self.config()['_NODE_FILE_COL_COUNT']
-        with open(self.config()['_NODE_PATH']) as fd:
+        valid_node_types = ['gene', 'pheno']
+        expected_col_count = self.config('_NODE_FILE_COL_COUNT')
+        with open(self.config('_NODE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter=',')
             next(csv_reader, None)  # skip headers
             line_no = 1
@@ -126,7 +131,7 @@ def load_node_metadata(self):
                 _key = cols[0]
                 node_type = cols[1]
-                if node_type != 'gene' and node_type != 'pheno':
+                if node_type not in valid_node_types:
                     raise RuntimeError(f"line {line_no}: invalid node type: {node_type}")

                 go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else []
```
```diff
@@ -154,40 +159,53 @@ def load_node_metadata(self):
                     'user_notes': cols[19],
                 }
                 nodes.append(doc)

         return {'nodes': nodes}

     def load_cluster_data(self):
         """Annotate genes with cluster ID fields."""
-        nodes = []
-        cluster_paths = self.config()['_CLUSTER_PATHS']
+        # index of nodes
+        node_ix = {}

+        cluster_paths = self.config('_CLUSTER_PATHS')
         for (cluster_label, path) in cluster_paths.items():
             with open(path) as fd:
                 csv_reader = csv.reader(fd, delimiter='\t')
                 for row in csv_reader:
                     if len(row) > 1:
-                        # remove the 'Cluster' text
-                        cluster_id = row[0].replace('Cluster', '')
-                        gene_keys = row[1:]
-                        nodes += [
-                            {'_key': key, cluster_label: int(cluster_id)}
-                            for key in gene_keys
-                        ]
+                        # remove the 'Cluster' text and replace it with cluster_label
+                        cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
+
+                        node_keys = row[1:]
+                        for key in node_keys:
+                            if key not in node_ix:
+                                node_ix[key] = [cluster_id]
+                            elif cluster_id not in node_ix[key]:
+                                node_ix[key].append(cluster_id)

+        # gather a list of cluster IDs for each node
+        nodes = [{
+            '_key': key,
+            'clusters': cluster_data
+        } for (key, cluster_data) in node_ix.items()]

         return {'nodes': nodes}
```
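To make the new shape concrete, here is a minimal sketch of the accumulation logic on invented input rows (the `ClusterN` row format is taken from the parser; the gene keys and cluster numbers are made up):

```python
node_ix = {}
cluster_files = {  # invented stand-ins for the parsed TSV rows
    'markov_i2': [['Cluster5', 'AT1G01010', 'AT1G01020']],
    'markov_i6': [['Cluster3', 'AT1G01010']],
}
for cluster_label, rows in cluster_files.items():
    for row in rows:
        # 'Cluster5' under 'markov_i2' becomes the namespaced ID 'markov_i2:5'
        cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
        for key in row[1:]:
            node_ix.setdefault(key, [])
            if cluster_id not in node_ix[key]:
                node_ix[key].append(cluster_id)

print(node_ix)
# {'AT1G01010': ['markov_i2:5', 'markov_i6:3'], 'AT1G01020': ['markov_i2:5']}
```

Each node now carries a single deduplicated `clusters` array instead of one integer field per clustering system, so a node that appears in several clusterings is merged into one document.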
```diff
     def save_dataset(self, dataset):

         if 'nodes' in dataset and len(dataset['nodes']) > 0:
-            self.save_docs(self.config()['_NODE_NAME'], dataset['nodes'])
+            self.save_docs(self.config('_NODE_NAME'), dataset['nodes'])

         if 'edges' in dataset and len(dataset['edges']) > 0:
-            self.save_docs(self.config()['_EDGE_NAME'], dataset['edges'])
+            self.save_docs(self.config('_EDGE_NAME'), dataset['edges'])

     def save_docs(self, coll_name, docs, on_dupe='update'):

         resp = requests.put(
-            self.config()['API_URL'] + '/api/v1/documents',
+            self.config('API_URL') + '/api/v1/documents',
             params={'collection': coll_name, 'on_duplicate': on_dupe},
-            headers={'Authorization': self.config()['AUTH_TOKEN']},
+            headers={'Authorization': self.config('AUTH_TOKEN')},
             data='\n'.join(json.dumps(d) for d in docs)
         )
         if not resp.ok:
```
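For reference, `save_docs` sends newline-delimited JSON, one document per line. A rough sketch of the equivalent request (the URL, token, and documents are placeholders, not values from this repo; only the `djornl_node` collection name and the endpoint come from this diff):

```python
import json
import requests

docs = [
    {'_key': 'AT1G01010', 'clusters': ['markov_i2:5']},
    {'_key': 'AT1G01020', 'clusters': ['markov_i2:5']},
]

resp = requests.put(
    'http://localhost:5000/api/v1/documents',  # placeholder for config('API_URL')
    params={'collection': 'djornl_node', 'on_duplicate': 'update'},
    headers={'Authorization': 'admin_token'},  # placeholder for config('AUTH_TOKEN')
    data='\n'.join(json.dumps(d) for d in docs),
)
```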
```diff
@@ -202,3 +220,32 @@ def load_data(self):
         self.save_dataset(self.load_edges())
         self.save_dataset(self.load_node_metadata())
         self.save_dataset(self.load_cluster_data())
+
+    def check_data_delta(self):
+        edge_data = self.load_edges()
+        node_metadata = self.load_node_metadata()
+        clusters = self.load_cluster_data()
+
+        self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters)
+
+    def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}):
```
Author (collaborator) comment: brief dataset summary for sanity checking
```diff
+        edge_nodes = set([e['_key'] for e in edge_data['nodes']])
+        node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']])
+        cluster_nodes = set([e['_key'] for e in cluster_data['nodes']])
+        all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes)
+
+        # check all nodes in cluster_data have node_metadata
+        clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes)
+        if clstr_no_node_md_set:
+            print({'clusters with no node metadata': clstr_no_node_md_set})
+
+        # check all nodes in the edge_data have node_metadata
+        edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes)
+        if edge_no_node_md_set:
+            print({'edges with no node metadata': edge_no_node_md_set})
+
+        # count all edges
+        print("Dataset contains " + str(len(edge_data['edges'])) + " edges")
+        # count all nodes
+        print("Dataset contains " + str(len(all_nodes)) + " nodes")
```
DJORNL parser tests (spec/test)

```diff
@@ -9,7 +9,6 @@
 import os

 from importers.djornl.parser import DJORNL_Parser

 from spec.test.helpers import modified_environ

 _TEST_DIR = '/app/spec/test'
@@ -24,14 +23,29 @@ def setUpClass(cls):
         with open(results_file) as fh:
             cls.json_data = json.load(fh)

         cls.maxDiff = None

     def init_parser_with_path(self, root_path):

         with modified_environ(RES_ROOT_DATA_PATH=root_path):
             parser = DJORNL_Parser()
             # ensure that the configuration has been set
-            parser.config()
+            parser._configure()
         return parser

+    def test_load_invalid_file(self):
+        """ test loading when what is supposed to be a file is actually a directory """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file')
+
+        # edges: directory, not a file
+        edges_file_path = os.path.join(RES_ROOT_DATA_PATH, "merged_edges-AMW-060820_AF.tsv")
+        err_str = f"Is a directory: '{edges_file_path}'"
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        with self.assertRaisesRegex(IsADirectoryError, err_str):
+            parser.load_edges()

     def test_load_empty_files(self):
         """ test loading files containing no data """
```
```diff
@@ -100,33 +114,38 @@ def test_load_valid_edge_data(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)

         self.maxDiff = None

         edge_data = parser.load_edges()
-        self.assertEqual(
-            edge_data,
-            self.json_data["load_edges"]
-        )
+        expected = self.json_data["load_edges"]
+
+        for data_structure in [edge_data, expected]:
+            for k in data_structure.keys():
+                data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key'])
+
+        self.assertEqual(edge_data, expected)
```

Author (collaborator) comment on lines +120 to +122: order data as it won't necessarily be sorted when coming out of the parser
```diff
     def test_load_valid_node_metadata(self):

         self.maxDiff = None
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)

         node_metadata = parser.load_node_metadata()
-        self.assertEqual(
-            node_metadata,
-            self.json_data["load_node_metadata"]
-        )
+        expected = self.json_data["load_node_metadata"]
+
+        for data_structure in [node_metadata, expected]:
+            for k in data_structure.keys():
+                data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key'])
+                data_structure[k] = [n['_key'] for n in data_structure[k]]
+
+        self.assertEqual(node_metadata, expected)

     def test_load_valid_cluster_data(self):

         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)

         cluster_data = parser.load_cluster_data()
         self.assertEqual(
             cluster_data,
             self.json_data["load_cluster_data"]
         )
+
+        parser.check_data_delta()
```
djornl_node collection schema

```diff
@@ -2,6 +2,10 @@ name: djornl_node
 type: vertex
 delta: false

+indexes:
+  - type: hash
+    fields: ["clusters[*]"]

 schema:
   "$schema": http://json-schema.org/draft-07/schema#
   title: Gene and Phenotype Vertices
@@ -13,21 +17,15 @@ schema:
       type: string
       title: Key
       examples: ["AT1G01010"]
-    cluster_I2:
-      type: integer
-      title: Cluster 2 ID
-      description: Iterative random forest cluster group ID
-      examples: [1]
-    cluster_I4:
-      type: integer
-      title: Cluster 4 ID
-      description: Iterative random forest cluster group ID
-      examples: [13]
-    cluster_I6:
-      type: integer
-      title: Cluster 6 ID
-      description: Iterative random forest cluster group ID
-      examples: [27]
+    clusters:
+      type: array
+      title: Clusters
+      description: Clusters to which the node has been assigned
+      items:
+        type: string
+        format: regex
+        pattern: ^\w+:\d+$
+      examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]]
```
Author (collaborator) comment on lines +20 to +28: The important bit
```diff
     node_type:
       type: string
       title: Node type
```
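A node document under the new schema looks like the sketch below; the values are invented, and the check mirrors the schema's `^\w+:\d+$` pattern:

```python
import re

node = {
    '_key': 'AT1G01010',
    'node_type': 'gene',
    'clusters': ['markov_i2:1', 'markov_i4:5'],
}

# each entry must be 'clustering_system_name:cluster_id', per the schema pattern
assert all(re.fullmatch(r'\w+:\d+', c) for c in node['clusters'])
```

The new `indexes` entry uses ArangoDB's array-index syntax (`clusters[*]`), so filters against individual `clusters` entries can be served by the hash index rather than a collection scan.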
djornl_fetch_clusters stored query

```diff
@@ -2,25 +2,13 @@ name: djornl_fetch_clusters
 description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes.
 params:
   type: object
+  required: [cluster_ids]
   properties:
-    cluster_i2_ids:
-      title: Cluster I2 IDs
-      description: Cluster I2 IDs to locate
-      items: {type: integer}
-      default: []
-      examples: [[1], [3, 5]]
-    cluster_i4_ids:
-      title: Cluster I4 IDs
-      description: Cluster I4 IDs to locate
-      items: {type: integer}
-      examples: [[2], [4, 6]]
-      default: []
-    cluster_i6_ids:
-      title: Cluster I6 IDs
-      description: Cluster I6 IDs to locate
-      items: {type: integer}
-      examples: [[666], [999, 333]]
-      default: []
+    cluster_ids:
+      title: Cluster IDs
+      description: Cluster IDs, in the form "clustering_system_name:cluster_id"
+      items: {type: string}
+      examples: [['markov_i2:5', 'markov_i6:2'], ['markov_i6:1']]
```
Contributor comment: Should this be an object so we don't have to parse these entries?

Contributor comment: I guess if the client is using string parameters like "markov_i2:5" then it doesn't matter
```diff
     distance:
       type: integer
       title: Traversal Distance
@@ -31,7 +19,7 @@ params:
 query: |
   LET node_ids = (
     FOR n IN djornl_node
-      FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids
+      FILTER n.clusters ANY IN @cluster_ids
       FOR node IN 0..@distance ANY n djornl_edge
         OPTIONS {bfs: true, uniqueVertices: "global"}
         RETURN DISTINCT node._id
```
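In AQL, `ANY IN` is an array comparison operator: it is true when at least one element of the left-hand array appears in the right-hand array, so this single filter replaces the three per-clustering clauses. A rough Python equivalent (node and bind values invented for illustration):

```python
cluster_ids = ['markov_i2:5', 'markov_i6:2']  # stands in for @cluster_ids

node = {'_key': 'AT1G01010', 'clusters': ['markov_i2:5', 'markov_i4:1']}

# FILTER n.clusters ANY IN @cluster_ids
matches = any(c in cluster_ids for c in node.get('clusters', []))
print(matches)  # True: 'markov_i2:5' appears in both lists
```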
Reviewer comment: rename these to something more informative