@@ -14,11 +14,14 @@
 
 class DJORNL_Parser(object):
 
-    def config(self):
+    def config(self, value):
         if not hasattr(self, '_config'):
-            return self._configure()
+            self._configure()
 
-        return self._config
+        if value not in self._config:
+            raise KeyError(f'No such config value: {value}')
+
+        return self._config[value]
 
     def _configure(self):
 
@@ -43,15 +46,15 @@ def _configure(self):
 
         _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
         configuration['_CLUSTER_PATHS'] = {
-            'cluster_I2': os.path.join(
+            'markov_i2': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
             ),
-            'cluster_I4': os.path.join(
+            'markov_i4': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
             ),
-            'cluster_I6': os.path.join(
+            'markov_i6': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
             ),
@@ -74,10 +77,10 @@ def load_edges(self):
         # dict of nodes, indexed by node ID (node1 and node2 from the file)
         node_ix = {}
         edges = []
-        node_name = self.config()['_NODE_NAME']
-        expected_col_count = self.config()['_EDGE_FILE_COL_COUNT']
+        node_name = self.config('_NODE_NAME')
+        expected_col_count = self.config('_EDGE_FILE_COL_COUNT')
 
-        with open(self.config()['_EDGE_PATH']) as fd:
+        with open(self.config('_EDGE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter='\t')
             next(csv_reader, None)  # skip headers
             line_no = 1
@@ -102,6 +105,7 @@ def load_edges(self):
                     'score': float(cols[2]),
                     'edge_type': edge_remap[edge_type],
                 })
+
         return {
             'nodes': [{'_key': n} for n in node_ix.keys()],
             'edges': edges,
@@ -111,8 +115,9 @@ def load_node_metadata(self):
         """Load node metadata"""
 
         nodes = []
-        expected_col_count = self.config()['_NODE_FILE_COL_COUNT']
-        with open(self.config()['_NODE_PATH']) as fd:
+        valid_node_types = ['gene', 'pheno']
+        expected_col_count = self.config('_NODE_FILE_COL_COUNT')
+        with open(self.config('_NODE_PATH')) as fd:
             csv_reader = csv.reader(fd, delimiter=',')
             next(csv_reader, None)  # skip headers
             line_no = 1
@@ -126,7 +131,7 @@ def load_node_metadata(self):
 
                 _key = cols[0]
                 node_type = cols[1]
-                if node_type != 'gene' and node_type != 'pheno':
+                if node_type not in valid_node_types:
                     raise RuntimeError(f"line {line_no}: invalid node type: {node_type}")
 
                 go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else []
@@ -154,40 +159,53 @@ def load_node_metadata(self):
                     'user_notes': cols[19],
                 }
                 nodes.append(doc)
+
         return {'nodes': nodes}
 
     def load_cluster_data(self):
         """Annotate genes with cluster ID fields."""
-        nodes = []
-        cluster_paths = self.config()['_CLUSTER_PATHS']
+
+        # index of nodes
+        node_ix = {}
+
+        cluster_paths = self.config('_CLUSTER_PATHS')
         for (cluster_label, path) in cluster_paths.items():
             with open(path) as fd:
                 csv_reader = csv.reader(fd, delimiter='\t')
                 for row in csv_reader:
                     if len(row) > 1:
-                        # remove the 'Cluster' text
-                        cluster_id = row[0].replace('Cluster', '')
-                        gene_keys = row[1:]
-                        nodes += [
-                            {'_key': key, cluster_label: int(cluster_id)}
-                            for key in gene_keys
-                        ]
+                        # remove the 'Cluster' text and replace it with cluster_label
+                        cluster_id = cluster_label + ':' + row[0].replace('Cluster', '')
+
+                        node_keys = row[1:]
+                        for key in node_keys:
+                            if key not in node_ix:
+                                node_ix[key] = [cluster_id]
+                            elif cluster_id not in node_ix[key]:
+                                node_ix[key].append(cluster_id)
+
+        # gather a list of cluster IDs for each node
+        nodes = [{
+            '_key': key,
+            'clusters': cluster_data
+        } for (key, cluster_data) in node_ix.items()]
+
        return {'nodes': nodes}
 
     def save_dataset(self, dataset):
 
         if 'nodes' in dataset and len(dataset['nodes']) > 0:
-            self.save_docs(self.config()['_NODE_NAME'], dataset['nodes'])
+            self.save_docs(self.config('_NODE_NAME'), dataset['nodes'])
 
         if 'edges' in dataset and len(dataset['edges']) > 0:
-            self.save_docs(self.config()['_EDGE_NAME'], dataset['edges'])
+            self.save_docs(self.config('_EDGE_NAME'), dataset['edges'])
 
     def save_docs(self, coll_name, docs, on_dupe='update'):
 
         resp = requests.put(
-            self.config()['API_URL'] + '/api/v1/documents',
+            self.config('API_URL') + '/api/v1/documents',
             params={'collection': coll_name, 'on_duplicate': on_dupe},
-            headers={'Authorization': self.config()['AUTH_TOKEN']},
+            headers={'Authorization': self.config('AUTH_TOKEN')},
             data='\n'.join(json.dumps(d) for d in docs)
         )
         if not resp.ok:
@@ -202,3 +220,32 @@ def load_data(self):
         self.save_dataset(self.load_edges())
         self.save_dataset(self.load_node_metadata())
         self.save_dataset(self.load_cluster_data())
+
+    def check_data_delta(self):
+        edge_data = self.load_edges()
+        node_metadata = self.load_node_metadata()
+        clusters = self.load_cluster_data()
+
+        self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters)
+
+    def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}):
+
+        edge_nodes = set([e['_key'] for e in edge_data['nodes']])
+        node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']])
+        cluster_nodes = set([e['_key'] for e in cluster_data['nodes']])
+        all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes)
+
+        # check all nodes in cluster_data have node_metadata
+        clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes)
+        if clstr_no_node_md_set:
+            print({'clusters with no node metadata': clstr_no_node_md_set})
+
+        # check all nodes in the edge_data have node_metadata
+        edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes)
+        if edge_no_node_md_set:
+            print({'edges with no node metadata': edge_no_node_md_set})
+
+        # count all edges
+        print("Dataset contains " + str(len(edge_data['edges'])) + " edges")
+        # count all nodes
+        print("Dataset contains " + str(len(all_nodes)) + " nodes")
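
A minimal usage sketch of the new surface for anyone trying this locally. The import path and driver code below are assumptions for illustration, not part of this diff; only DJORNL_Parser, its config keys, and its methods come from the change above.

# Hypothetical driver script; assumes the module is importable as
# djornl_parser and that ROOT_DATA_PATH, API_URL, AUTH_TOKEN etc. are
# configured -- neither assumption is established by this diff.
from djornl_parser import DJORNL_Parser

parser = DJORNL_Parser()

# config() now takes a single key and returns one value;
# unknown keys raise a KeyError with an explicit message
edge_path = parser.config('_EDGE_PATH')

# dry run: report nodes lacking metadata and print node/edge totals
parser.check_data_delta()

# full import: parse and save edges, node metadata, and cluster annotations
parser.load_data()

Note that cluster annotations are now gathered per node into a single 'clusters' list of namespaced string IDs (e.g. a row starting with 'Cluster1' in the markov_i2 file yields 'markov_i2:1'), rather than one integer field per cluster file.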