Merged
97 changes: 72 additions & 25 deletions importers/djornl/parser.py
@@ -14,11 +14,14 @@

 class DJORNL_Parser(object):

-    def config(self):
+    def config(self, value):
         if not hasattr(self, '_config'):
-            return self._configure()
+            self._configure()

-        return self._config
+        if value not in self._config:
+            raise KeyError(f'No such config value: {value}')
+
+        return self._config[value]

     def _configure(self):

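Note: with this change, callers fetch a single value via self.config('KEY') instead of indexing the dict returned by config(). A minimal runnable sketch of the new accessor contract (class name and config values hypothetical):

    # Sketch of the config(value) contract; _configure() is assumed to
    # populate self._config with a plain dict (values hypothetical).
    class ConfigExample:
        def _configure(self):
            self._config = {'_NODE_NAME': 'djornl_node'}

        def config(self, value):
            if not hasattr(self, '_config'):
                self._configure()
            if value not in self._config:
                raise KeyError(f'No such config value: {value}')
            return self._config[value]

    cfg = ConfigExample()
    print(cfg.config('_NODE_NAME'))  # djornl_node
    try:
        cfg.config('missing')
    except KeyError as err:
        print(err)  # 'No such config value: missing'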
@@ -43,15 +46,15 @@ def _configure(self):

         _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
         configuration['_CLUSTER_PATHS'] = {
-            'cluster_I2': os.path.join(
+            'markov_i2': os.path.join(

Comment (Collaborator Author): rename these to something more informative

                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
             ),
-            'cluster_I4': os.path.join(
+            'markov_i4': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
             ),
-            'cluster_I6': os.path.join(
+            'markov_i6': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
             ),
@@ -74,10 +77,10 @@ def load_edges(self):
         # dict of nodes, indexed by node ID (node1 and node2 from the file)
         node_ix = {}
         edges = []
-        node_name = self.config()['_NODE_NAME']
-        expected_col_count = self.config()['_EDGE_FILE_COL_COUNT']
+        node_name = self.config('_NODE_NAME')
+        expected_col_count = self.config('_EDGE_FILE_COL_COUNT')

-        with open(self.config()['_EDGE_PATH']) as fd:
+        with open(self.config('_EDGE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter='\t')
             next(csv_reader, None)  # skip headers
             line_no = 1
@@ -102,6 +105,7 @@ def load_edges(self):
                         'score': float(cols[2]),
                         'edge_type': edge_remap[edge_type],
                     })
+
         return {
             'nodes': [{'_key': n} for n in node_ix.keys()],
             'edges': edges,
@@ -111,8 +115,9 @@ def load_node_metadata(self):
         """Load node metadata"""

         nodes = []
-        expected_col_count = self.config()['_NODE_FILE_COL_COUNT']
-        with open(self.config()['_NODE_PATH']) as fd:
+        valid_node_types = ['gene', 'pheno']
+        expected_col_count = self.config('_NODE_FILE_COL_COUNT')
+        with open(self.config('_NODE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter=',')
             next(csv_reader, None)  # skip headers
             line_no = 1
@@ -126,7 +131,7 @@ def load_node_metadata(self):

                 _key = cols[0]
                 node_type = cols[1]
-                if node_type != 'gene' and node_type != 'pheno':
+                if node_type not in valid_node_types:
                     raise RuntimeError(f"line {line_no}: invalid node type: {node_type}")

                 go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else []
@@ -154,40 +159,53 @@ def load_node_metadata(self):
                     'user_notes': cols[19],
                 }
                 nodes.append(doc)
+
         return {'nodes': nodes}

     def load_cluster_data(self):
         """Annotate genes with cluster ID fields."""
-        nodes = []
-        cluster_paths = self.config()['_CLUSTER_PATHS']
+
+        # index of nodes
+        node_ix = {}
+
+        cluster_paths = self.config('_CLUSTER_PATHS')
         for (cluster_label, path) in cluster_paths.items():
             with open(path) as fd:
                 csv_reader = csv.reader(fd, delimiter='\t')
                 for row in csv_reader:
                     if len(row) > 1:
-                        # remove the 'Cluster' text
-                        cluster_id = row[0].replace('Cluster', '')
-                        gene_keys = row[1:]
-                        nodes += [
-                            {'_key': key, cluster_label: int(cluster_id)}
-                            for key in gene_keys
-                        ]
+                        # remove the 'Cluster' text and replace it with cluster_label
+                        cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
+
+                        node_keys = row[1:]
+                        for key in node_keys:
+                            if key not in node_ix:
+                                node_ix[key] = [cluster_id]
+                            elif cluster_id not in node_ix[key]:
+                                node_ix[key].append(cluster_id)
+
+        # gather a list of cluster IDs for each node
+        nodes = [{
+            '_key': key,
+            'clusters': cluster_data
+        } for (key, cluster_data) in node_ix.items()]
+
         return {'nodes': nodes}

     def save_dataset(self, dataset):

         if 'nodes' in dataset and len(dataset['nodes']) > 0:
-            self.save_docs(self.config()['_NODE_NAME'], dataset['nodes'])
+            self.save_docs(self.config('_NODE_NAME'), dataset['nodes'])

         if 'edges' in dataset and len(dataset['edges']) > 0:
-            self.save_docs(self.config()['_EDGE_NAME'], dataset['edges'])
+            self.save_docs(self.config('_EDGE_NAME'), dataset['edges'])

     def save_docs(self, coll_name, docs, on_dupe='update'):

         resp = requests.put(
-            self.config()['API_URL'] + '/api/v1/documents',
+            self.config('API_URL') + '/api/v1/documents',
             params={'collection': coll_name, 'on_duplicate': on_dupe},
-            headers={'Authorization': self.config()['AUTH_TOKEN']},
+            headers={'Authorization': self.config('AUTH_TOKEN')},
             data='\n'.join(json.dumps(d) for d in docs)
         )
         if not resp.ok:
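Note: a node that appears in more than one clustering now accumulates all of its namespaced cluster IDs in a single list. A small runnable sketch of the transformation (file contents hypothetical):

    import csv
    import io

    # One hypothetical row per cluster file: 'Cluster<id>' followed by node keys
    files = {
        'markov_i2': 'Cluster1\tAT1G01010\tAT1G01020\n',
        'markov_i6': 'Cluster3\tAT1G01010\n',
    }

    node_ix = {}
    for cluster_label, content in files.items():
        for row in csv.reader(io.StringIO(content), delimiter='\t'):
            if len(row) > 1:
                # 'Cluster1' under label 'markov_i2' becomes 'markov_i2:1'
                cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
                for key in row[1:]:
                    if key not in node_ix:
                        node_ix[key] = [cluster_id]
                    elif cluster_id not in node_ix[key]:
                        node_ix[key].append(cluster_id)

    nodes = [{'_key': key, 'clusters': ids} for key, ids in node_ix.items()]
    # nodes == [{'_key': 'AT1G01010', 'clusters': ['markov_i2:1', 'markov_i6:3']},
    #           {'_key': 'AT1G01020', 'clusters': ['markov_i2:1']}]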
@@ -202,3 +220,32 @@ def load_data(self):
         self.save_dataset(self.load_edges())
         self.save_dataset(self.load_node_metadata())
         self.save_dataset(self.load_cluster_data())
+
+    def check_data_delta(self):
+        edge_data = self.load_edges()
+        node_metadata = self.load_node_metadata()
+        clusters = self.load_cluster_data()
+
+        self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters)
+
+    def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}):

Comment (Collaborator Author): brief dataset summary for sanity checking

+
+        edge_nodes = set([e['_key'] for e in edge_data['nodes']])
+        node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']])
+        cluster_nodes = set([e['_key'] for e in cluster_data['nodes']])
+        all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes)
+
+        # check all nodes in cluster_data have node_metadata
+        clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes)
+        if clstr_no_node_md_set:
+            print({'clusters with no node metadata': clstr_no_node_md_set})
+
+        # check all nodes in the edge_data have node_metadata
+        edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes)
+        if edge_no_node_md_set:
+            print({'edges with no node metadata': edge_no_node_md_set})
+
+        # count all edges
+        print("Dataset contains " + str(len(edge_data['edges'])) + " edges")
+        # count all nodes
+        print("Dataset contains " + str(len(all_nodes)) + " nodes")
47 changes: 33 additions & 14 deletions importers/test/test_djornl_parser.py
@@ -9,7 +9,6 @@
 import os

 from importers.djornl.parser import DJORNL_Parser
-
 from spec.test.helpers import modified_environ

 _TEST_DIR = '/app/spec/test'
@@ -24,14 +23,29 @@ def setUpClass(cls):
         with open(results_file) as fh:
             cls.json_data = json.load(fh)

+        cls.maxDiff = None
+
     def init_parser_with_path(self, root_path):

         with modified_environ(RES_ROOT_DATA_PATH=root_path):
             parser = DJORNL_Parser()
-            # ensure that the configuration has been set
-            parser.config()
+            parser._configure()
         return parser

+    def test_load_invalid_file(self):
+        """ test loading when what is supposed to be a file is actually a directory """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file')
+
+        # edges: directory, not a file
+        edges_file_path = os.path.join(RES_ROOT_DATA_PATH, "merged_edges-AMW-060820_AF.tsv")
+        err_str = f"Is a directory: '{edges_file_path}'"
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        with self.assertRaisesRegex(IsADirectoryError, err_str):
+            parser.load_edges()
+
     def test_load_empty_files(self):
         """ test loading files containing no data """

@@ -100,33 +114,38 @@ def test_load_valid_edge_data(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)

-        self.maxDiff = None
-
         edge_data = parser.load_edges()
-        self.assertEqual(
-            edge_data,
-            self.json_data["load_edges"]
-        )
+        expected = self.json_data["load_edges"]
+
+        for data_structure in [edge_data, expected]:
+            for k in data_structure.keys():
+                data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key'])

Comment (Collaborator Author, on lines +120 to +122): order data as it won't necessarily be sorted when coming out of the parser

+        self.assertEqual(edge_data, expected)

     def test_load_valid_node_metadata(self):

-        self.maxDiff = None
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)

         node_metadata = parser.load_node_metadata()
-        self.assertEqual(
-            node_metadata,
-            self.json_data["load_node_metadata"]
-        )
+        expected = self.json_data["load_node_metadata"]
+
+        for data_structure in [node_metadata, expected]:
+            for k in data_structure.keys():
+                data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key'])
+                data_structure[k] = [n['_key'] for n in data_structure[k]]
+
+        self.assertEqual(node_metadata, expected)

     def test_load_valid_cluster_data(self):

         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)

         cluster_data = parser.load_cluster_data()
         self.assertEqual(
             cluster_data,
             self.json_data["load_cluster_data"]
         )
+
+        parser.check_data_delta()
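Note: sorting both actual and expected output by '_key' makes the comparison order-insensitive, since the parser gives no ordering guarantee. A minimal illustration of the pattern (data hypothetical):

    actual = {'nodes': [{'_key': 'B'}, {'_key': 'A'}]}
    expected = {'nodes': [{'_key': 'A'}, {'_key': 'B'}]}

    # put both sides into a canonical order before comparing
    for data_structure in [actual, expected]:
        for k in data_structure.keys():
            data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key'])

    assert actual == expected  # passes regardless of original ordering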
28 changes: 13 additions & 15 deletions spec/collections/djornl/djornl_node.yaml
@@ -2,6 +2,10 @@ name: djornl_node
 type: vertex
 delta: false

+indexes:
+  - type: hash
+    fields: ["clusters[*]"]
+
 schema:
   "$schema": http://json-schema.org/draft-07/schema#
   title: Gene and Phenotype Vertices
@@ -13,21 +17,15 @@ schema:
       type: string
       title: Key
      examples: ["AT1G01010"]
-    cluster_I2:
-      type: integer
-      title: Cluster 2 ID
-      description: Iterative random forest cluster group ID
-      examples: [1]
-    cluster_I4:
-      type: integer
-      title: Cluster 4 ID
-      description: Iterative random forest cluster group ID
-      examples: [13]
-    cluster_I6:
-      type: integer
-      title: Cluster 6 ID
-      description: Iterative random forest cluster group ID
-      examples: [27]
+    clusters:
+      type: array
+      title: Clusters
+      description: Clusters to which the node has been assigned
+      items:
+        type: string
+        format: regex
+        pattern: ^\w+:\d+$
+      examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]]

Comment (Collaborator Author, on lines +20 to +28): The important bit

     node_type:
       type: string
       title: Node type
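Note: the three per-clustering integer fields collapse into one string array, with the clustering system carried in the value itself. A sample document that should validate against the revised schema (values hypothetical):

    import re

    # Hypothetical djornl_node document under the new 'clusters' schema
    node_doc = {
        '_key': 'AT1G01010',
        'node_type': 'gene',
        'clusters': ['markov_i2:1', 'markov_i4:5'],
    }
    # each entry matches the schema pattern ^\w+:\d+$
    assert all(re.match(r'^\w+:\d+$', c) for c in node_doc['clusters'])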
26 changes: 7 additions & 19 deletions spec/stored_queries/djornl/djornl_fetch_clusters.yaml
@@ -2,25 +2,13 @@ name: djornl_fetch_clusters
 description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes.
 params:
   type: object
+  required: [cluster_ids]
   properties:
-    cluster_i2_ids:
-      title: Cluster I2 IDs
-      description: Cluster I2 IDs to locate
-      items: {type: integer}
-      default: []
-      examples: [[1], [3, 5]]
-    cluster_i4_ids:
-      title: Cluster I4 IDs
-      description: Cluster I4 IDs to locate
-      items: {type: integer}
-      examples: [[2], [4, 6]]
-      default: []
-    cluster_i6_ids:
-      title: Cluster I6 IDs
-      description: Cluster I6 IDs to locate
-      items: {type: integer}
-      examples: [[666], [999, 333]]
-      default: []
+    cluster_ids:
+      title: Cluster IDs
+      description: Cluster IDs, in the form "clustering_system_name:cluster_id"
+      items: {type: string}
+      examples: [['markov_i2:5', 'markov_i6:2'], ['markov_i6:1']]

Comment (Contributor): Should this be an object so we don't have to parse these entries?

Comment (Contributor): I guess if the client is using string parameters like "markov_i2:5" then it doesn't matter

     distance:
       type: integer
       title: Traversal Distance
@@ -31,7 +19,7 @@ params:
 query: |
   LET node_ids = (
     FOR n IN djornl_node
-      FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids
+      FILTER n.clusters ANY IN @cluster_ids
       FOR node IN 0..@distance ANY n djornl_edge
         OPTIONS {bfs: true, uniqueVertices: "global"}
         RETURN DISTINCT node._id
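Note: ANY IN matches a node when its clusters array shares at least one element with the bound @cluster_ids list; the same predicate expressed in Python (data hypothetical):

    cluster_ids = ['markov_i2:5', 'markov_i6:2']
    nodes = [
        {'_key': 'AT1G01010', 'clusters': ['markov_i2:5']},
        {'_key': 'AT1G01020', 'clusters': ['markov_i4:1']},
    ]

    # FILTER n.clusters ANY IN @cluster_ids, as a set intersection
    matches = [n['_key'] for n in nodes if set(n['clusters']) & set(cluster_ids)]
    print(matches)  # ['AT1G01010']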