
Commit b7780a0

Reformat clusters to be a single field in the djornl_node collection.
Update parser and tests accordingly
1 parent c49351d

7 files changed: 426 additions & 347 deletions


importers/djornl/parser.py

Lines changed: 72 additions & 25 deletions
@@ -14,11 +14,14 @@
 
 class DJORNL_Parser(object):
 
-    def config(self):
+    def config(self, value):
         if not hasattr(self, '_config'):
-            return self._configure()
+            self._configure()
 
-        return self._config
+        if value not in self._config:
+            raise KeyError(f'No such config value: {value}')
+
+        return self._config[value]
 
     def _configure(self):
 
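For reference, a minimal sketch (not part of the commit) of how callers use the reworked accessor, assuming the RES_* environment variables needed by _configure() are set; the key '_NO_SUCH_KEY' is made up for illustration:

from importers.djornl.parser import DJORNL_Parser

parser = DJORNL_Parser()

# before this commit: parser.config()['_NODE_PATH']
node_path = parser.config('_NODE_PATH')

# unknown keys now fail loudly instead of handing back the whole config dict
try:
    parser.config('_NO_SUCH_KEY')
except KeyError as err:
    print(err)  # 'No such config value: _NO_SUCH_KEY'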
@@ -43,15 +46,15 @@ def _configure(self):
 
         _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
         configuration['_CLUSTER_PATHS'] = {
-            'cluster_I2': os.path.join(
+            'markov_i2': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
             ),
-            'cluster_I4': os.path.join(
+            'markov_i4': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
             ),
-            'cluster_I6': os.path.join(
+            'markov_i6': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
             ),
@@ -74,10 +77,10 @@ def load_edges(self):
         # dict of nodes, indexed by node ID (node1 and node2 from the file)
         node_ix = {}
         edges = []
-        node_name = self.config()['_NODE_NAME']
-        expected_col_count = self.config()['_EDGE_FILE_COL_COUNT']
+        node_name = self.config('_NODE_NAME')
+        expected_col_count = self.config('_EDGE_FILE_COL_COUNT')
 
-        with open(self.config()['_EDGE_PATH']) as fd:
+        with open(self.config('_EDGE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter='\t')
             next(csv_reader, None) # skip headers
             line_no = 1
@@ -102,6 +105,7 @@ def load_edges(self):
                     'score': float(cols[2]),
                     'edge_type': edge_remap[edge_type],
                 })
+
         return {
             'nodes': [{'_key': n} for n in node_ix.keys()],
             'edges': edges,
@@ -111,8 +115,9 @@ def load_node_metadata(self):
         """Load node metadata"""
 
         nodes = []
-        expected_col_count = self.config()['_NODE_FILE_COL_COUNT']
-        with open(self.config()['_NODE_PATH']) as fd:
+        valid_node_types = ['gene', 'pheno']
+        expected_col_count = self.config('_NODE_FILE_COL_COUNT')
+        with open(self.config('_NODE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter=',')
             next(csv_reader, None) # skip headers
             line_no = 1
@@ -126,7 +131,7 @@ def load_node_metadata(self):
 
                 _key = cols[0]
                 node_type = cols[1]
-                if node_type != 'gene' and node_type != 'pheno':
+                if node_type not in valid_node_types:
                     raise RuntimeError(f"line {line_no}: invalid node type: {node_type}")
 
                 go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else []
@@ -154,40 +159,53 @@ def load_node_metadata(self):
                     'user_notes': cols[19],
                 }
                 nodes.append(doc)
+
         return {'nodes': nodes}
 
     def load_cluster_data(self):
         """Annotate genes with cluster ID fields."""
-        nodes = []
-        cluster_paths = self.config()['_CLUSTER_PATHS']
+
+        # index of nodes
+        node_ix = {}
+
+        cluster_paths = self.config('_CLUSTER_PATHS')
         for (cluster_label, path) in cluster_paths.items():
             with open(path) as fd:
                 csv_reader = csv.reader(fd, delimiter='\t')
                 for row in csv_reader:
                     if len(row) > 1:
-                        # remove the 'Cluster' text
-                        cluster_id = row[0].replace('Cluster', '')
-                        gene_keys = row[1:]
-                        nodes += [
-                            {'_key': key, cluster_label: int(cluster_id)}
-                            for key in gene_keys
-                        ]
+                        # remove the 'Cluster' text and replace it with cluster_label
+                        cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
+
+                        node_keys = row[1:]
+                        for key in node_keys:
+                            if key not in node_ix:
+                                node_ix[key] = [cluster_id]
+                            elif cluster_id not in node_ix[key]:
+                                node_ix[key].append(cluster_id)
+
+        # gather a list of cluster IDs for each node
+        nodes = [{
+            '_key': key,
+            'clusters': cluster_data
+        } for (key, cluster_data) in node_ix.items()]
+
         return {'nodes': nodes}
 
     def save_dataset(self, dataset):
 
         if 'nodes' in dataset and len(dataset['nodes']) > 0:
-            self.save_docs(self.config()['_NODE_NAME'], dataset['nodes'])
+            self.save_docs(self.config('_NODE_NAME'), dataset['nodes'])
 
         if 'edges' in dataset and len(dataset['edges']) > 0:
-            self.save_docs(self.config()['_EDGE_NAME'], dataset['edges'])
+            self.save_docs(self.config('_EDGE_NAME'), dataset['edges'])
 
     def save_docs(self, coll_name, docs, on_dupe='update'):
 
         resp = requests.put(
-            self.config()['API_URL'] + '/api/v1/documents',
+            self.config('API_URL') + '/api/v1/documents',
             params={'collection': coll_name, 'on_duplicate': on_dupe},
-            headers={'Authorization': self.config()['AUTH_TOKEN']},
+            headers={'Authorization': self.config('AUTH_TOKEN')},
             data='\n'.join(json.dumps(d) for d in docs)
         )
         if not resp.ok:
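To make the new cluster representation concrete, here is a small standalone sketch (hypothetical rows and gene keys, not repo code) of what load_cluster_data now produces: the 'Cluster' prefix from the file is swapped for the clustering label, and each node accumulates a list of "label:id" strings instead of one integer field per clustering.

# hypothetical cluster file rows, keyed by clustering label
node_ix = {}
rows_by_label = {
    'markov_i2': [['Cluster3', 'AT1G01010', 'AT1G01020']],
    'markov_i6': [['Cluster1', 'AT1G01010']],
}
for cluster_label, rows in rows_by_label.items():
    for row in rows:
        # 'Cluster3' under markov_i2 becomes the string 'markov_i2:3'
        cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
        for key in row[1:]:
            node_ix.setdefault(key, [])
            if cluster_id not in node_ix[key]:
                node_ix[key].append(cluster_id)

nodes = [{'_key': key, 'clusters': clusters} for key, clusters in node_ix.items()]
# [{'_key': 'AT1G01010', 'clusters': ['markov_i2:3', 'markov_i6:1']},
#  {'_key': 'AT1G01020', 'clusters': ['markov_i2:3']}]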
@@ -202,3 +220,32 @@ def load_data(self):
         self.save_dataset(self.load_edges())
         self.save_dataset(self.load_node_metadata())
         self.save_dataset(self.load_cluster_data())
+
+    def check_data_delta(self):
+        edge_data = self.load_edges()
+        node_metadata = self.load_node_metadata()
+        clusters = self.load_cluster_data()
+
+        self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters)
+
+    def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}):
+
+        edge_nodes = set([e['_key'] for e in edge_data['nodes']])
+        node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']])
+        cluster_nodes = set([e['_key'] for e in cluster_data['nodes']])
+        all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes)
+
+        # check all nodes in cluster_data have node_metadata
+        clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes)
+        if clstr_no_node_md_set:
+            print({'clusters with no node metadata': clstr_no_node_md_set})
+
+        # check all nodes in the edge_data have node_metadata
+        edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes)
+        if edge_no_node_md_set:
+            print({'edges with no node metadata': edge_no_node_md_set})
+
+        # count all edges
+        print("Dataset contains " + str(len(edge_data['edges'])) + " edges")
+        # count all nodes
+        print("Dataset contains " + str(len(all_nodes)) + " nodes")

importers/test/test_djornl_parser.py

Lines changed: 33 additions & 14 deletions
@@ -9,7 +9,6 @@
 import os
 
 from importers.djornl.parser import DJORNL_Parser
-
 from spec.test.helpers import modified_environ
 
 _TEST_DIR = '/app/spec/test'
@@ -24,14 +23,29 @@ def setUpClass(cls):
         with open(results_file) as fh:
             cls.json_data = json.load(fh)
 
+        cls.maxDiff = None
+
     def init_parser_with_path(self, root_path):
 
         with modified_environ(RES_ROOT_DATA_PATH=root_path):
             parser = DJORNL_Parser()
             # ensure that the configuration has been set
-            parser.config()
+            parser._configure()
         return parser
 
+    def test_load_invalid_file(self):
+        """ test loading when what is supposed to be a file is actually a directory """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file')
+
+        # edges: directory, not a file
+        edges_file_path = os.path.join(RES_ROOT_DATA_PATH, "merged_edges-AMW-060820_AF.tsv")
+        err_str = f"Is a directory: '{edges_file_path}'"
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        with self.assertRaisesRegex(IsADirectoryError, err_str):
+            parser.load_edges()
+
     def test_load_empty_files(self):
         """ test loading files containing no data """
 
@@ -100,33 +114,38 @@ def test_load_valid_edge_data(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        self.maxDiff = None
-
         edge_data = parser.load_edges()
-        self.assertEqual(
-            edge_data,
-            self.json_data["load_edges"]
-        )
+        expected = self.json_data["load_edges"]
+
+        for data_structure in [edge_data, expected]:
+            for k in data_structure.keys():
+                data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key'])
+
+        self.assertEqual(edge_data, expected)
 
     def test_load_valid_node_metadata(self):
 
-        self.maxDiff = None
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
         node_metadata = parser.load_node_metadata()
-        self.assertEqual(
-            node_metadata,
-            self.json_data["load_node_metadata"]
-        )
+        expected = self.json_data["load_node_metadata"]
+
+        for data_structure in [node_metadata, expected]:
+            for k in data_structure.keys():
+                data_structure[k] = sorted(data_structure[k], key=lambda n: n['_key'])
+                data_structure[k] = [n['_key'] for n in data_structure[k]]
+
+        self.assertEqual(node_metadata, expected)
 
     def test_load_valid_cluster_data(self):
 
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
-
         cluster_data = parser.load_cluster_data()
         self.assertEqual(
             cluster_data,
             self.json_data["load_cluster_data"]
         )
+
+        parser.check_data_delta()

spec/collections/djornl/djornl_node.yaml

Lines changed: 13 additions & 15 deletions
@@ -2,6 +2,10 @@ name: djornl_node
 type: vertex
 delta: false
 
+indexes:
+  - type: hash
+    fields: ["clusters[*]"]
+
 schema:
   "$schema": http://json-schema.org/draft-07/schema#
   title: Gene and Phenotype Vertices
@@ -13,21 +17,15 @@ schema:
       type: string
      title: Key
      examples: ["AT1G01010"]
-    cluster_I2:
-      type: integer
-      title: Cluster 2 ID
-      description: Iterative random forest cluster group ID
-      examples: [1]
-    cluster_I4:
-      type: integer
-      title: Cluster 4 ID
-      description: Iterative random forest cluster group ID
-      examples: [13]
-    cluster_I6:
-      type: integer
-      title: Cluster 6 ID
-      description: Iterative random forest cluster group ID
-      examples: [27]
+    clusters:
+      type: array
+      title: Clusters
+      description: Clusters to which the node has been assigned
+      items:
+        type: string
+        format: regex
+        pattern: ^\w+:\d+$
+      examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]]
     node_type:
       type: string
       title: Node type
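An example document (illustrative values taken from the schema's own examples) that fits the reshaped djornl_node schema; the new hash index on clusters[*] covers lookups against individual entries of this array.

node_doc = {
    "_key": "AT1G01010",
    "node_type": "gene",
    # one array of "clustering_system:cluster_id" strings replaces the
    # former cluster_I2 / cluster_I4 / cluster_I6 integer fields
    "clusters": ["markov_i2:1", "markov_i4:5"],
}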

spec/stored_queries/djornl/djornl_fetch_clusters.yaml

Lines changed: 7 additions & 19 deletions
@@ -2,25 +2,13 @@ name: djornl_fetch_clusters
 description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes.
 params:
   type: object
+  required: [cluster_ids]
   properties:
-    cluster_i2_ids:
-      title: Cluster I2 IDs
-      description: Cluster I2 IDs to locate
-      items: {type: integer}
-      default: []
-      examples: [[1], [3, 5]]
-    cluster_i4_ids:
-      title: Cluster I4 IDs
-      description: Cluster I4 IDs to locate
-      items: {type: integer}
-      examples: [[2], [4, 6]]
-      default: []
-    cluster_i6_ids:
-      title: Cluster I6 IDs
-      description: Cluster I6 IDs to locate
-      items: {type: integer}
-      examples: [[666], [999, 333]]
-      default: []
+    cluster_ids:
+      title: Cluster IDs
+      description: Cluster IDs, in the form "clustering_system_name:cluster_id"
+      items: {type: string}
+      examples: [['markov_i2:5', 'markov_i6:2'], ['markov_i6:1']]
     distance:
       type: integer
       title: Traversal Distance
@@ -31,7 +19,7 @@ params:
 query: |
   LET node_ids = (
     FOR n IN djornl_node
-      FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids
+      FILTER n.clusters ANY IN @cluster_ids
      FOR node IN 0..@distance ANY n djornl_edge
        OPTIONS {bfs: true, uniqueVertices: "global"}
        RETURN DISTINCT node._id
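Hypothetical bind parameters for the reworked query, matching its new single-parameter form; the cluster_ids values come from the spec's examples, and a distance of 1 is an arbitrary illustrative choice.

params = {
    # "clustering_system_name:cluster_id" strings replace the three
    # per-system integer lists used before this commit
    "cluster_ids": ["markov_i2:5", "markov_i6:2"],
    "distance": 1,
}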

spec/test/djornl/invalid_file/merged_edges-AMW-060820_AF.tsv/empty

Whitespace-only changes.
