From 8939ee06536592cde30606e07dd600be2bbecf25 Mon Sep 17 00:00:00 2001 From: chasem Date: Wed, 18 Feb 2026 13:53:42 -0600 Subject: [PATCH 1/2] this seems to work --- docs/tutorials/virtual_db_tutorial.ipynb | 3039 +++++++++++----------- docs/virtual_db_configuration.md | 27 +- tfbpapi/datacard.py | 243 +- tfbpapi/models.py | 29 + tfbpapi/tests/test_datacard.py | 461 ++++ tfbpapi/tests/test_virtual_db.py | 350 ++- tfbpapi/virtual_db.py | 271 +- 7 files changed, 2824 insertions(+), 1596 deletions(-) diff --git a/docs/tutorials/virtual_db_tutorial.ipynb b/docs/tutorials/virtual_db_tutorial.ipynb index 7305146..4756aa2 100644 --- a/docs/tutorials/virtual_db_tutorial.ipynb +++ b/docs/tutorials/virtual_db_tutorial.ipynb @@ -33,7 +33,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Config saved to: /tmp/tmpf610qghb/vdb_config.yaml\n" + "Config saved to: /tmp/tmp60wlr_qk/vdb_config.yaml\n" ] } ], @@ -202,10 +202,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6374.32it/s]\n", - "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 12264.05it/s]\n", - "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 9731.56it/s]\n", - "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 21883.33it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10837.99it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 1986.88it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 4471.54it/s]\n", + "No metadata fields found for data config 'dto' in repo 'BrentLab/yeast_comparative_analysis' -- no embedded metadata_fields and no metadata config with applies_to\n", + "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 12571.60it/s]\n", "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", @@ -284,7 +285,7 @@ "type": "unknown" } ], - "ref": "8720a362-ea0c-4293-9656-ba6725dcaa3d", + "ref": "66a177b5-e333-42e2-8d93-ea35947dbea2", "rows": [ [ "0", @@ -299,7 +300,7 @@ [ "1", "harbison_meta", - "regulator_locus_tag", + "condition", "VARCHAR", "YES", null, @@ -309,7 +310,7 @@ [ "2", "harbison_meta", - "regulator_symbol", + "regulator_locus_tag", "VARCHAR", "YES", null, @@ -319,7 +320,7 @@ [ "3", "harbison_meta", - "condition", + "regulator_symbol", "VARCHAR", "YES", null, @@ -394,7 +395,7 @@ " \n", " 1\n", " harbison_meta\n", - " regulator_locus_tag\n", + " condition\n", " VARCHAR\n", " YES\n", " None\n", @@ -404,7 +405,7 @@ " \n", " 2\n", " harbison_meta\n", - " regulator_symbol\n", + " regulator_locus_tag\n", " VARCHAR\n", " YES\n", " None\n", @@ -414,7 +415,7 @@ " \n", " 3\n", " harbison_meta\n", - " condition\n", + " regulator_symbol\n", " VARCHAR\n", " YES\n", " None\n", @@ -448,9 +449,9 @@ "text/plain": [ " table column_name column_type null key default extra\n", "0 harbison_meta sample_id INTEGER YES None None None\n", - "1 harbison_meta regulator_locus_tag VARCHAR YES None None None\n", - "2 harbison_meta regulator_symbol VARCHAR YES None None None\n", - "3 harbison_meta condition VARCHAR YES None None None\n", + "1 harbison_meta condition VARCHAR YES None None None\n", + "2 harbison_meta regulator_locus_tag VARCHAR YES None None None\n", + "3 harbison_meta regulator_symbol VARCHAR YES None None None\n", "4 harbison_meta carbon_source VARCHAR YES None None None\n", "5 harbison_meta temperature_celsius DOUBLE YES None None None" ] @@ -517,7 +518,7 @@ "type": "unknown" } ], - "ref": "001db2c7-a5c2-4561-9b12-35733ce1b2e6", + "ref": "41220a48-48eb-4c49-8ebf-9489d5041d87", "rows": [ [ "0", @@ -858,17 +859,17 @@ "type": "integer" }, { - "name": "regulator_locus_tag", + "name": "condition", "rawType": "object", "type": "string" }, { - "name": "regulator_symbol", + "name": "regulator_locus_tag", "rawType": "object", "type": "string" }, { - "name": "condition", + "name": "regulator_symbol", "rawType": "object", "type": "string" }, @@ -883,50 +884,50 @@ "type": "float" } ], - "ref": "e5bb4909-b231-44d7-85b8-5219b51f4a4b", + "ref": "8ff35aa7-7992-44bb-ab5d-5d6376acd6f5", "rows": [ [ "0", - "166", - "YIL131C", - "FKH1", + "206", "YPD", + "YKL072W", + "STB6", "glucose", "30.0" ], [ "1", - "3", - "YBL005W", - "PDR3", + "249", "YPD", + "YMR016C", + "SOK2", "glucose", "30.0" ], [ "2", - "173", - "YIR023W", - "DAL81", + "309", "YPD", + "YOR162C", + "YRR1", "glucose", "30.0" ], [ "3", - "220", - "YLR014C", - "PPR1", + "209", "YPD", + "YKL109W", + "HAP4", "glucose", "30.0" ], [ "4", - "83", - "YEL009C", - "GCN4", + "189", "YPD", + "YJR140C", + "HIR3", "glucose", "30.0" ] @@ -956,9 +957,9 @@ " \n", " \n", " sample_id\n", + " condition\n", " regulator_locus_tag\n", " regulator_symbol\n", - " condition\n", " carbon_source\n", " temperature_celsius\n", " \n", @@ -966,46 +967,46 @@ " \n", " \n", " 0\n", - " 166\n", - " YIL131C\n", - " FKH1\n", + " 206\n", " YPD\n", + " YKL072W\n", + " STB6\n", " glucose\n", " 30.0\n", " \n", " \n", " 1\n", - " 3\n", - " YBL005W\n", - " PDR3\n", + " 249\n", " YPD\n", + " YMR016C\n", + " SOK2\n", " glucose\n", " 30.0\n", " \n", " \n", " 2\n", - " 173\n", - " YIR023W\n", - " DAL81\n", + " 309\n", " YPD\n", + " YOR162C\n", + " YRR1\n", " glucose\n", " 30.0\n", " \n", " \n", " 3\n", - " 220\n", - " YLR014C\n", - " PPR1\n", + " 209\n", " YPD\n", + " YKL109W\n", + " HAP4\n", " glucose\n", " 30.0\n", " \n", " \n", " 4\n", - " 83\n", - " YEL009C\n", - " GCN4\n", + " 189\n", " YPD\n", + " YJR140C\n", + " HIR3\n", " glucose\n", " 30.0\n", " \n", @@ -1014,12 +1015,12 @@ "" ], "text/plain": [ - " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", - "0 166 YIL131C FKH1 YPD glucose \n", - "1 3 YBL005W PDR3 YPD glucose \n", - "2 173 YIR023W DAL81 YPD glucose \n", - "3 220 YLR014C PPR1 YPD glucose \n", - "4 83 YEL009C GCN4 YPD glucose \n", + " sample_id condition regulator_locus_tag regulator_symbol carbon_source \\\n", + "0 206 YPD YKL072W STB6 glucose \n", + "1 249 YPD YMR016C SOK2 glucose \n", + "2 309 YPD YOR162C YRR1 glucose \n", + "3 209 YPD YKL109W HAP4 glucose \n", + "4 189 YPD YJR140C HIR3 glucose \n", "\n", " temperature_celsius \n", "0 30.0 \n", @@ -1124,75 +1125,75 @@ "type": "float" } ], - "ref": "a6cb8a91-c1c2-4bc8-af51-12e900d7a4bf", + "ref": "30929ca9-daa0-428a-a352-2a29f36f06eb", "rows": [ [ "0", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR204W", "YPR204W", - "0.78449615", - "0.53566521", + "0.85288861", + "0.76943045", "glucose", "30.0" ], [ "1", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR203W", "YPR203W", - "1.4509147", - "0.95955603", + "1.2490028", + "0.11237602", "glucose", "30.0" ], [ "2", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR202W", "YPR202W", - "1.4509147", - "0.95955603", + "1.2490028", + "0.11237602", "glucose", "30.0" ], [ "3", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR201W", "ARR3", - "0.92586339", - "0.45367192", + "1.5137073", + "0.1681333", "glucose", "30.0" ], [ "4", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR200C", "ARR2", - "0.92586339", - "0.45367192", + "1.5137073", + "0.1681333", "glucose", "30.0" ] @@ -1237,71 +1238,71 @@ " \n", " \n", " 0\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR204W\n", " YPR204W\n", - " 0.784496\n", - " 0.535665\n", + " 0.852889\n", + " 0.769430\n", " glucose\n", " 30.0\n", " \n", " \n", " 1\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR203W\n", " YPR203W\n", - " 1.450915\n", - " 0.959556\n", + " 1.249003\n", + " 0.112376\n", " glucose\n", " 30.0\n", " \n", " \n", " 2\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR202W\n", " YPR202W\n", - " 1.450915\n", - " 0.959556\n", + " 1.249003\n", + " 0.112376\n", " glucose\n", " 30.0\n", " \n", " \n", " 3\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR201W\n", " ARR3\n", - " 0.925863\n", - " 0.453672\n", + " 1.513707\n", + " 0.168133\n", " glucose\n", " 30.0\n", " \n", " \n", " 4\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR200C\n", " ARR2\n", - " 0.925863\n", - " 0.453672\n", + " 1.513707\n", + " 0.168133\n", " glucose\n", " 30.0\n", " \n", @@ -1311,18 +1312,18 @@ ], "text/plain": [ " sample_id db_id regulator_locus_tag regulator_symbol condition \\\n", - "0 14 13.0 YBR049C REB1 H2O2Lo \n", - "1 14 13.0 YBR049C REB1 H2O2Lo \n", - "2 14 13.0 YBR049C REB1 H2O2Lo \n", - "3 14 13.0 YBR049C REB1 H2O2Lo \n", - "4 14 13.0 YBR049C REB1 H2O2Lo \n", + "0 15 14.0 YBR049C REB1 YPD \n", + "1 15 14.0 YBR049C REB1 YPD \n", + "2 15 14.0 YBR049C REB1 YPD \n", + "3 15 14.0 YBR049C REB1 YPD \n", + "4 15 14.0 YBR049C REB1 YPD \n", "\n", " target_locus_tag target_symbol effect pvalue carbon_source \\\n", - "0 YPR204W YPR204W 0.784496 0.535665 glucose \n", - "1 YPR203W YPR203W 1.450915 0.959556 glucose \n", - "2 YPR202W YPR202W 1.450915 0.959556 glucose \n", - "3 YPR201W ARR3 0.925863 0.453672 glucose \n", - "4 YPR200C ARR2 0.925863 0.453672 glucose \n", + "0 YPR204W YPR204W 0.852889 0.769430 glucose \n", + "1 YPR203W YPR203W 1.249003 0.112376 glucose \n", + "2 YPR202W YPR202W 1.249003 0.112376 glucose \n", + "3 YPR201W ARR3 1.513707 0.168133 glucose \n", + "4 YPR200C ARR2 1.513707 0.168133 glucose \n", "\n", " temperature_celsius \n", "0 30.0 \n", @@ -1385,7 +1386,7 @@ "type": "integer" } ], - "ref": "9234aaf4-a313-42c2-838a-a13568eed01d", + "ref": "4b3d2bd2-6088-4f18-90c4-adf37845e56d", "rows": [ [ "0", @@ -1399,7 +1400,7 @@ ], [ "2", - "HSF1", + "RTG3", "4" ], [ @@ -1409,232 +1410,232 @@ ], [ "4", - "RTG3", + "HSF1", "4" ], [ "5", - "YAP1", + "DIG1", "4" ], [ "6", - "SKN7", + "YAP1", "4" ], [ "7", - "DIG1", + "SKN7", "4" ], [ "8", - "GAT1", + "ROX1", "3" ], [ "9", - "RPN4", + "MOT3", "3" ], [ "10", - "YAP7", + "FHL1", "3" ], [ "11", - "TEC1", + "FKH2", "3" ], [ "12", - "AFT1", + "REB1", "3" ], [ "13", - "MAL33", + "PHD1", "3" ], [ "14", - "PHO2", + "NRG1", "3" ], [ "15", - "MBP1", + "YAP7", "3" ], [ "16", - "KSS1", + "RPH1", "3" ], [ "17", - "SFP1", + "AFT2", "3" ], [ "18", - "CIN5", + "GAT1", "3" ], [ "19", - "YJL206C", + "TEC1", "3" ], [ "20", - "GZF3", + "MBP1", "3" ], [ "21", - "MOT3", + "RPN4", "3" ], [ "22", - "FHL1", + "AFT1", "3" ], [ "23", - "ROX1", + "YJL206C", "3" ], [ "24", - "FKH2", + "SFP1", "3" ], [ "25", - "AFT2", + "YAP6", "3" ], [ "26", - "REB1", + "RIM101", "3" ], [ "27", - "RIM101", + "MAL33", "3" ], [ "28", - "YAP6", + "PHO2", "3" ], [ "29", - "RPH1", + "CIN5", "3" ], [ "30", - "PHD1", + "KSS1", "3" ], [ "31", - "NRG1", + "GZF3", "3" ], [ "32", - "MGA1", + "RLM1", "2" ], [ "33", - "UME1", + "UGA3", "2" ], [ "34", - "YAP3", + "MIG2", "2" ], [ "35", - "XBP1", + "SOK2", "2" ], [ "36", - "RDS1", + "YAP5", "2" ], [ "37", - "MSS11", + "XBP1", "2" ], [ "38", - "HAP2", + "UME6", "2" ], [ "39", - "MCM1", + "MAC1", "2" ], [ "40", - "ADR1", + "CAD1", "2" ], [ "41", - "GCN4", + "YAP3", "2" ], [ "42", - "MIG2", + "ARR1", "2" ], [ "43", - "SOK2", + "IME4", "2" ], [ "44", - "RTG1", + "DAL80", "2" ], [ "45", - "MOT2", + "RDS1", "2" ], [ "46", - "UGA3", + "GLN3", "2" ], [ "47", - "PUT3", + "ASH1", "2" ], [ "48", - "YAP5", + "DAL81", "2" ], [ "49", - "UME6", + "HAP4", "2" ] ], @@ -1679,7 +1680,7 @@ " \n", " \n", " 2\n", - " HSF1\n", + " RTG3\n", " 4\n", " \n", " \n", @@ -1689,7 +1690,7 @@ " \n", " \n", " 4\n", - " RTG3\n", + " HSF1\n", " 4\n", " \n", " \n", @@ -1699,27 +1700,27 @@ " \n", " \n", " 58\n", - " DAL82\n", + " GCN4\n", " 2\n", " \n", " \n", " 59\n", - " DAL80\n", + " PUT3\n", " 2\n", " \n", " \n", " 60\n", - " HAP4\n", + " RTG1\n", " 2\n", " \n", " \n", " 61\n", - " PDR1\n", + " MOT2\n", " 2\n", " \n", " \n", " 62\n", - " RLM1\n", + " ADR1\n", " 2\n", " \n", " \n", @@ -1731,15 +1732,15 @@ " regulator_symbol n\n", "0 MSN2 6\n", "1 MSN4 5\n", - "2 HSF1 4\n", + "2 RTG3 4\n", "3 STE12 4\n", - "4 RTG3 4\n", + "4 HSF1 4\n", ".. ... ..\n", - "58 DAL82 2\n", - "59 DAL80 2\n", - "60 HAP4 2\n", - "61 PDR1 2\n", - "62 RLM1 2\n", + "58 GCN4 2\n", + "59 PUT3 2\n", + "60 RTG1 2\n", + "61 MOT2 2\n", + "62 ADR1 2\n", "\n", "[63 rows x 2 columns]" ] @@ -1869,58 +1870,58 @@ "type": "string" } ], - "ref": "3464c093-78d3-4dde-9a28-850a7be5d032", + "ref": "2b39d73c-9d98-49fa-84b0-080c44a12e2c", "rows": [ [ "0", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;85", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.004", + "BrentLab/harbison_2004;harbison_2004;105", + "BrentLab/hughes_2006;overexpression;10", + "11.0", + "206.0", + "12.0", + "206.0", + "0.041292917490562644", + "0.017", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "105", "harbison", - "85", - "BrentLab/Hackett_2020;hackett_2020" + "10", + "BrentLab/hughes_2006;overexpression" ], [ "1", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;83", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;108", + "BrentLab/hughes_2006;overexpression;11", + "60.0", + "67.0", + "60.0", + "67.0", + "0.05428351009647073", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "108", "harbison", - "83", - "BrentLab/Hackett_2020;hackett_2020" + "11", + "BrentLab/hughes_2006;overexpression" ], [ "2", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;84", - "2.0", - "1.0", - "3.0", - "1.0", - "0.0", - "0.011", + "BrentLab/harbison_2004;harbison_2004;109", + "BrentLab/hughes_2006;overexpression;11", + "27.0", + "1265.0", + "27.0", + "1265.0", + "0.12321364371741866", + "0.057", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "109", "harbison", - "84", - "BrentLab/Hackett_2020;hackett_2020" + "11", + "BrentLab/hughes_2006;overexpression" ] ], "shape": { @@ -1966,89 +1967,89 @@ " \n", " \n", " 0\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;85\n", - " 2.0\n", - " 2.0\n", - " 3.0\n", - " 2.0\n", - " 0.000225\n", - " 0.004\n", + " BrentLab/harbison_2004;harbison_2004;105\n", + " BrentLab/hughes_2006;overexpression;10\n", + " 11.0\n", + " 206.0\n", + " 12.0\n", + " 206.0\n", + " 0.041293\n", + " 0.017\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 105\n", " harbison\n", - " 85\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 10\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 1\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;83\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " BrentLab/harbison_2004;harbison_2004;108\n", + " BrentLab/hughes_2006;overexpression;11\n", + " 60.0\n", + " 67.0\n", + " 60.0\n", + " 67.0\n", + " 0.054284\n", + " 0.000\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 108\n", " harbison\n", - " 83\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 11\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 2\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;84\n", - " 2.0\n", - " 1.0\n", - " 3.0\n", - " 1.0\n", - " 0.000000\n", - " 0.011\n", + " BrentLab/harbison_2004;harbison_2004;109\n", + " BrentLab/hughes_2006;overexpression;11\n", + " 27.0\n", + " 1265.0\n", + " 27.0\n", + " 1265.0\n", + " 0.123214\n", + " 0.057\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 109\n", " harbison\n", - " 84\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 11\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", "\n", "" ], "text/plain": [ - " binding_id \\\n", - "0 BrentLab/harbison_2004;harbison_2004;3 \n", - "1 BrentLab/harbison_2004;harbison_2004;3 \n", - "2 BrentLab/harbison_2004;harbison_2004;3 \n", + " binding_id \\\n", + "0 BrentLab/harbison_2004;harbison_2004;105 \n", + "1 BrentLab/harbison_2004;harbison_2004;108 \n", + "2 BrentLab/harbison_2004;harbison_2004;109 \n", "\n", - " perturbation_id binding_rank_threshold \\\n", - "0 BrentLab/Hackett_2020;hackett_2020;85 2.0 \n", - "1 BrentLab/Hackett_2020;hackett_2020;83 NaN \n", - "2 BrentLab/Hackett_2020;hackett_2020;84 2.0 \n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/hughes_2006;overexpression;10 11.0 \n", + "1 BrentLab/hughes_2006;overexpression;11 60.0 \n", + "2 BrentLab/hughes_2006;overexpression;11 27.0 \n", "\n", " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", - "0 2.0 3.0 2.0 \n", - "1 NaN NaN NaN \n", - "2 1.0 3.0 1.0 \n", + "0 206.0 12.0 206.0 \n", + "1 67.0 60.0 67.0 \n", + "2 1265.0 27.0 1265.0 \n", "\n", " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n", - "0 0.000225 0.004 harbison_2004-harbison_2004 \n", - "1 NaN NaN harbison_2004-harbison_2004 \n", - "2 0.000000 0.011 harbison_2004-harbison_2004 \n", + "0 0.041293 0.017 harbison_2004-harbison_2004 \n", + "1 0.054284 0.000 harbison_2004-harbison_2004 \n", + "2 0.123214 0.057 harbison_2004-harbison_2004 \n", "\n", - " perturbation_repo_dataset binding_id_id binding_id_source \\\n", - "0 Hackett_2020-hackett_2020 3 harbison \n", - "1 Hackett_2020-hackett_2020 3 harbison \n", - "2 Hackett_2020-hackett_2020 3 harbison \n", + " perturbation_repo_dataset binding_id_id binding_id_source \\\n", + "0 hughes_2006-overexpression 105 harbison \n", + "1 hughes_2006-overexpression 108 harbison \n", + "2 hughes_2006-overexpression 109 harbison \n", "\n", - " perturbation_id_id perturbation_id_source \n", - "0 85 BrentLab/Hackett_2020;hackett_2020 \n", - "1 83 BrentLab/Hackett_2020;hackett_2020 \n", - "2 84 BrentLab/Hackett_2020;hackett_2020 " + " perturbation_id_id perturbation_id_source \n", + "0 10 BrentLab/hughes_2006;overexpression \n", + "1 11 BrentLab/hughes_2006;overexpression \n", + "2 11 BrentLab/hughes_2006;overexpression " ] }, "execution_count": 10, @@ -2082,17 +2083,17 @@ "type": "integer" }, { - "name": "regulator_locus_tag", + "name": "condition", "rawType": "object", "type": "string" }, { - "name": "regulator_symbol", + "name": "regulator_locus_tag", "rawType": "object", "type": "string" }, { - "name": "condition", + "name": "regulator_symbol", "rawType": "object", "type": "string" }, @@ -2117,117 +2118,117 @@ "type": "float" } ], - "ref": "58c1f0ca-b0a7-4ce7-b29f-f4e789b74707", + "ref": "ed3eff3d-031c-4595-9982-ca59cb79b366", "rows": [ [ "0", - "50", - "YDR043C", - "NRG1", - "H2O2Lo", + "284", + "YPD", + "YNL216W", + "RAP1", "glucose", "30.0", "0.0", - "0.081863152643831" + "0.05363405732597472" ], [ "1", - "213", - "YKL222C", - "YKL222C", + "249", "YPD", + "YMR016C", + "SOK2", "glucose", "30.0", "0.0", - "0.0" + "0.12064753873577402" ], [ "2", - "18", - "YBR083W", - "TEC1", - "YPD", + "8", + "H2O2Lo", + "YBL103C", + "RTG3", "glucose", "30.0", "0.0", - "0.0620669105826265" + "0.035346907993966815" ], [ "3", - "7", - "YBL103C", - "RTG3", - "H2O2Hi", + "255", + "H2O2Lo", + "YMR037C", + "MSN2", "glucose", "30.0", "0.0", - "0.1577232390460343" + "0.16925856324356348" ], [ "4", - "277", - "YNL103W", - "MET4", + "83", "YPD", + "YEL009C", + "GCN4", "glucose", "30.0", "0.0", - "0.016281512605042" + "0.09684614829243568" ], [ "5", - "281", - "YNL199C", - "GCR2", - "SM", - "unspecified", + "119", + "H2O2Lo", + "YGL073W", + "HSF1", + "glucose", "30.0", "0.0", - "0.0296346442259623" + "0.11808548603694578" ], [ "6", - "86", - "YER040W", - "GLN3", - "SM", - "unspecified", + "224", + "YPD", + "YLR131C", + "ACE2", + "glucose", "30.0", "0.0", - "0.2298889521004841" + "0.02416289592760181" ], [ "7", - "225", - "YLR176C", - "RFX1", + "55", "YPD", + "YDR146C", + "SWI5", "glucose", "30.0", "0.0", - "0.0144559001906082" + "0.015389140271493212" ], [ "8", - "86", - "YER040W", - "GLN3", - "SM", - "unspecified", + "189", + "YPD", + "YJR140C", + "HIR3", + "glucose", "30.0", "0.0", - "0.0961169019780866" + "0.003987422530076558" ], [ "9", - "225", - "YLR176C", - "RFX1", + "246", "YPD", + "YML099C", + "ARG81", "glucose", "30.0", "0.0", - "0.0335260614428719" + "1.6275194000312485e-06" ] ], "shape": { @@ -2255,9 +2256,9 @@ " \n", " \n", " sample_id\n", + " condition\n", " regulator_locus_tag\n", " regulator_symbol\n", - " condition\n", " carbon_source\n", " temperature_celsius\n", " dto_empirical_pvalue\n", @@ -2267,142 +2268,142 @@ " \n", " \n", " 0\n", - " 50\n", - " YDR043C\n", - " NRG1\n", - " H2O2Lo\n", + " 284\n", + " YPD\n", + " YNL216W\n", + " RAP1\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.081863\n", + " 0.053634\n", " \n", " \n", " 1\n", - " 213\n", - " YKL222C\n", - " YKL222C\n", + " 249\n", " YPD\n", + " YMR016C\n", + " SOK2\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.000000\n", + " 0.120648\n", " \n", " \n", " 2\n", - " 18\n", - " YBR083W\n", - " TEC1\n", - " YPD\n", + " 8\n", + " H2O2Lo\n", + " YBL103C\n", + " RTG3\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.062067\n", + " 0.035347\n", " \n", " \n", " 3\n", - " 7\n", - " YBL103C\n", - " RTG3\n", - " H2O2Hi\n", + " 255\n", + " H2O2Lo\n", + " YMR037C\n", + " MSN2\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.157723\n", + " 0.169259\n", " \n", " \n", " 4\n", - " 277\n", - " YNL103W\n", - " MET4\n", + " 83\n", " YPD\n", + " YEL009C\n", + " GCN4\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.016282\n", + " 0.096846\n", " \n", " \n", " 5\n", - " 281\n", - " YNL199C\n", - " GCR2\n", - " SM\n", - " unspecified\n", + " 119\n", + " H2O2Lo\n", + " YGL073W\n", + " HSF1\n", + " glucose\n", " 30.0\n", " 0.0\n", - " 0.029635\n", + " 0.118085\n", " \n", " \n", " 6\n", - " 86\n", - " YER040W\n", - " GLN3\n", - " SM\n", - " unspecified\n", + " 224\n", + " YPD\n", + " YLR131C\n", + " ACE2\n", + " glucose\n", " 30.0\n", " 0.0\n", - " 0.229889\n", + " 0.024163\n", " \n", " \n", " 7\n", - " 225\n", - " YLR176C\n", - " RFX1\n", + " 55\n", " YPD\n", + " YDR146C\n", + " SWI5\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.014456\n", + " 0.015389\n", " \n", " \n", " 8\n", - " 86\n", - " YER040W\n", - " GLN3\n", - " SM\n", - " unspecified\n", + " 189\n", + " YPD\n", + " YJR140C\n", + " HIR3\n", + " glucose\n", " 30.0\n", " 0.0\n", - " 0.096117\n", + " 0.003987\n", " \n", " \n", " 9\n", - " 225\n", - " YLR176C\n", - " RFX1\n", + " 246\n", " YPD\n", + " YML099C\n", + " ARG81\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.033526\n", + " 0.000002\n", " \n", " \n", "\n", "" ], "text/plain": [ - " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", - "0 50 YDR043C NRG1 H2O2Lo glucose \n", - "1 213 YKL222C YKL222C YPD glucose \n", - "2 18 YBR083W TEC1 YPD glucose \n", - "3 7 YBL103C RTG3 H2O2Hi glucose \n", - "4 277 YNL103W MET4 YPD glucose \n", - "5 281 YNL199C GCR2 SM unspecified \n", - "6 86 YER040W GLN3 SM unspecified \n", - "7 225 YLR176C RFX1 YPD glucose \n", - "8 86 YER040W GLN3 SM unspecified \n", - "9 225 YLR176C RFX1 YPD glucose \n", + " sample_id condition regulator_locus_tag regulator_symbol carbon_source \\\n", + "0 284 YPD YNL216W RAP1 glucose \n", + "1 249 YPD YMR016C SOK2 glucose \n", + "2 8 H2O2Lo YBL103C RTG3 glucose \n", + "3 255 H2O2Lo YMR037C MSN2 glucose \n", + "4 83 YPD YEL009C GCN4 glucose \n", + "5 119 H2O2Lo YGL073W HSF1 glucose \n", + "6 224 YPD YLR131C ACE2 glucose \n", + "7 55 YPD YDR146C SWI5 glucose \n", + "8 189 YPD YJR140C HIR3 glucose \n", + "9 246 YPD YML099C ARG81 glucose \n", "\n", " temperature_celsius dto_empirical_pvalue dto_fdr \n", - "0 30.0 0.0 0.081863 \n", - "1 30.0 0.0 0.000000 \n", - "2 30.0 0.0 0.062067 \n", - "3 30.0 0.0 0.157723 \n", - "4 30.0 0.0 0.016282 \n", - "5 30.0 0.0 0.029635 \n", - "6 30.0 0.0 0.229889 \n", - "7 30.0 0.0 0.014456 \n", - "8 30.0 0.0 0.096117 \n", - "9 30.0 0.0 0.033526 " + "0 30.0 0.0 0.053634 \n", + "1 30.0 0.0 0.120648 \n", + "2 30.0 0.0 0.035347 \n", + "3 30.0 0.0 0.169259 \n", + "4 30.0 0.0 0.096846 \n", + "5 30.0 0.0 0.118085 \n", + "6 30.0 0.0 0.024163 \n", + "7 30.0 0.0 0.015389 \n", + "8 30.0 0.0 0.003987 \n", + "9 30.0 0.0 0.000002 " ] }, "execution_count": 11, @@ -2460,77 +2461,77 @@ "type": "string" } ], - "ref": "b916ca80-75d1-448d-82a5-c82086ca1ed9", + "ref": "f1bd06b8-23de-4c3d-8e39-6910708e04da", "rows": [ [ "0", - "289", - "DAL82", + "224", + "ACE2", "0.0", - "1208" + "169" ], [ "1", - "251", - "MAC1", + "83", + "GCN4", "0.0", - "1103" + "48_190" ], [ "2", - "321", - "DIG1", + "189", + "HIR3", "0.0", - "1372" + "772" ], [ "3", - "238", - "YAP1", + "75", + "CAD1", "0.0", - "996" + "360" ], [ "4", - "303", - "CIN5", + "284", + "RAP1", "0.0", - "1365" + "96_238" ], [ "5", - "245", - "ARG81", + "283", + "RAP1", "0.0", - "1023" + "96_238" ], [ "6", - "184", - "CBF1", + "8", + "RTG3", "0.0", - "754" + "109_251" ], [ "7", - "252", - "MAC1", + "246", + "ARG81", "0.0", - "1103" + "187" ], [ "8", - "200", - "PHD1", + "55", + "SWI5", "0.0", - "890" + "128_270" ], [ "9", - "251", - "MAC1", + "119", + "HSF1", "0.0", - "1110" + "88" ] ], "shape": { @@ -2566,73 +2567,73 @@ " \n", " \n", " 0\n", - " 289\n", - " DAL82\n", + " 224\n", + " ACE2\n", " 0.0\n", - " 1208\n", + " 169\n", " \n", " \n", " 1\n", - " 251\n", - " MAC1\n", + " 83\n", + " GCN4\n", " 0.0\n", - " 1103\n", + " 48_190\n", " \n", " \n", " 2\n", - " 321\n", - " DIG1\n", + " 189\n", + " HIR3\n", " 0.0\n", - " 1372\n", + " 772\n", " \n", " \n", " 3\n", - " 238\n", - " YAP1\n", + " 75\n", + " CAD1\n", " 0.0\n", - " 996\n", + " 360\n", " \n", " \n", " 4\n", - " 303\n", - " CIN5\n", + " 284\n", + " RAP1\n", " 0.0\n", - " 1365\n", + " 96_238\n", " \n", " \n", " 5\n", - " 245\n", - " ARG81\n", + " 283\n", + " RAP1\n", " 0.0\n", - " 1023\n", + " 96_238\n", " \n", " \n", " 6\n", - " 184\n", - " CBF1\n", + " 8\n", + " RTG3\n", " 0.0\n", - " 754\n", + " 109_251\n", " \n", " \n", " 7\n", - " 252\n", - " MAC1\n", + " 246\n", + " ARG81\n", " 0.0\n", - " 1103\n", + " 187\n", " \n", " \n", " 8\n", - " 200\n", - " PHD1\n", + " 55\n", + " SWI5\n", " 0.0\n", - " 890\n", + " 128_270\n", " \n", " \n", " 9\n", - " 251\n", - " MAC1\n", + " 119\n", + " HSF1\n", " 0.0\n", - " 1110\n", + " 88\n", " \n", " \n", "\n", @@ -2640,16 +2641,16 @@ ], "text/plain": [ " harbison_sample_id regulator_symbol dto_empirical_pvalue hackett_sample_id\n", - "0 289 DAL82 0.0 1208\n", - "1 251 MAC1 0.0 1103\n", - "2 321 DIG1 0.0 1372\n", - "3 238 YAP1 0.0 996\n", - "4 303 CIN5 0.0 1365\n", - "5 245 ARG81 0.0 1023\n", - "6 184 CBF1 0.0 754\n", - "7 252 MAC1 0.0 1103\n", - "8 200 PHD1 0.0 890\n", - "9 251 MAC1 0.0 1110" + "0 224 ACE2 0.0 169\n", + "1 83 GCN4 0.0 48_190\n", + "2 189 HIR3 0.0 772\n", + "3 75 CAD1 0.0 360\n", + "4 284 RAP1 0.0 96_238\n", + "5 283 RAP1 0.0 96_238\n", + "6 8 RTG3 0.0 109_251\n", + "7 246 ARG81 0.0 187\n", + "8 55 SWI5 0.0 128_270\n", + "9 119 HSF1 0.0 88" ] }, "execution_count": 12, @@ -2729,12 +2730,12 @@ "type": "integer" } ], - "ref": "1185b490-3375-41d0-b61c-0f35dae2b815", + "ref": "85913540-d09c-4f16-8ff1-2ceca1cba4ee", "rows": [ [ "0", "SWI1", - "15.0", + "5.0", "ZEV", "P", "3" @@ -2742,7 +2743,7 @@ [ "1", "SWI1", - "30.0", + "15.0", "ZEV", "P", "3" @@ -2750,7 +2751,7 @@ [ "2", "SWI1", - "45.0", + "90.0", "ZEV", "P", "3" @@ -2758,7 +2759,7 @@ [ "3", "SWI1", - "5.0", + "45.0", "ZEV", "P", "3" @@ -2766,7 +2767,7 @@ [ "4", "SWI1", - "0.0", + "10.0", "ZEV", "P", "3" @@ -2774,7 +2775,7 @@ [ "5", "SWI1", - "90.0", + "20.0", "ZEV", "P", "3" @@ -2782,7 +2783,7 @@ [ "6", "SWI1", - "10.0", + "30.0", "ZEV", "P", "3" @@ -2790,22 +2791,22 @@ [ "7", "SWI1", - "20.0", + "0.0", "ZEV", "P", "3" ], [ "8", - "GCN4", - "0.0", + "RDS2", + "45.0", "ZEV", "P", "2" ], [ "9", - "GCN4", + "Z3EV", "30.0", "ZEV", "P", @@ -2813,80 +2814,80 @@ ], [ "10", - "MAC1", - "0.0", - "GEV", + "Z3EV", + "5.0", + "ZEV", "P", "2" ], [ "11", - "RDS2", - "5.0", + "Z3EV", + "10.0", "ZEV", "P", "2" ], [ "12", - "RDS2", - "45.0", + "GCN4", + "30.0", "ZEV", "P", "2" ], [ "13", - "Z3EV", - "30.0", + "RDS2", + "5.0", "ZEV", "P", "2" ], [ "14", - "GCN4", - "90.0", - "ZEV", + "MAC1", + "0.0", + "GEV", "P", "2" ], [ "15", - "Z3EV", - "15.0", - "ZEV", + "MAC1", + "45.0", + "GEV", "P", "2" ], [ "16", - "GCN4", - "45.0", - "ZEV", + "MAC1", + "5.0", + "GEV", "P", "2" ], [ "17", "MAC1", - "5.0", + "15.0", "GEV", "P", "2" ], [ "18", - "MAC1", - "90.0", - "GEV", + "RDS2", + "0.0", + "ZEV", "P", "2" ], [ "19", - "Z3EV", - "45.0", + "RDS2", + "90.0", "ZEV", "P", "2" @@ -2894,23 +2895,23 @@ [ "20", "RDS2", - "10.0", + "30.0", "ZEV", "P", "2" ], [ "21", - "GCN4", - "15.0", + "Z3EV", + "45.0", "ZEV", "P", "2" ], [ "22", - "RDS2", - "90.0", + "GCN4", + "0.0", "ZEV", "P", "2" @@ -2918,24 +2919,24 @@ [ "23", "RDS2", - "0.0", + "20.0", "ZEV", "P", "2" ], [ "24", - "Z3EV", - "5.0", - "ZEV", + "MAC1", + "90.0", + "GEV", "P", "2" ], [ "25", - "Z3EV", - "90.0", - "ZEV", + "MAC1", + "30.0", + "GEV", "P", "2" ], @@ -2949,8 +2950,8 @@ ], [ "27", - "RDS2", - "30.0", + "GCN4", + "45.0", "ZEV", "P", "2" @@ -2973,7 +2974,7 @@ ], [ "30", - "Z3EV", + "RDS2", "10.0", "ZEV", "P", @@ -2981,33 +2982,33 @@ ], [ "31", - "RDS2", - "20.0", + "GCN4", + "15.0", "ZEV", "P", "2" ], [ "32", - "MAC1", - "45.0", - "GEV", + "Z3EV", + "15.0", + "ZEV", "P", "2" ], [ "33", - "MAC1", - "15.0", - "GEV", + "GCN4", + "90.0", + "ZEV", "P", "2" ], [ "34", - "MAC1", - "30.0", - "GEV", + "Z3EV", + "90.0", + "ZEV", "P", "2" ] @@ -3047,7 +3048,7 @@ " \n", " 0\n", " SWI1\n", - " 15.0\n", + " 5.0\n", " ZEV\n", " P\n", " 3\n", @@ -3055,7 +3056,7 @@ " \n", " 1\n", " SWI1\n", - " 30.0\n", + " 15.0\n", " ZEV\n", " P\n", " 3\n", @@ -3063,7 +3064,7 @@ " \n", " 2\n", " SWI1\n", - " 45.0\n", + " 90.0\n", " ZEV\n", " P\n", " 3\n", @@ -3071,7 +3072,7 @@ " \n", " 3\n", " SWI1\n", - " 5.0\n", + " 45.0\n", " ZEV\n", " P\n", " 3\n", @@ -3079,7 +3080,7 @@ " \n", " 4\n", " SWI1\n", - " 0.0\n", + " 10.0\n", " ZEV\n", " P\n", " 3\n", @@ -3087,7 +3088,7 @@ " \n", " 5\n", " SWI1\n", - " 90.0\n", + " 20.0\n", " ZEV\n", " P\n", " 3\n", @@ -3095,7 +3096,7 @@ " \n", " 6\n", " SWI1\n", - " 10.0\n", + " 30.0\n", " ZEV\n", " P\n", " 3\n", @@ -3103,22 +3104,22 @@ " \n", " 7\n", " SWI1\n", - " 20.0\n", + " 0.0\n", " ZEV\n", " P\n", " 3\n", " \n", " \n", " 8\n", - " GCN4\n", - " 0.0\n", + " RDS2\n", + " 45.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 9\n", - " GCN4\n", + " Z3EV\n", " 30.0\n", " ZEV\n", " P\n", @@ -3126,80 +3127,80 @@ " \n", " \n", " 10\n", - " MAC1\n", - " 0.0\n", - " GEV\n", + " Z3EV\n", + " 5.0\n", + " ZEV\n", " P\n", " 2\n", " \n", " \n", " 11\n", - " RDS2\n", - " 5.0\n", + " Z3EV\n", + " 10.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 12\n", - " RDS2\n", - " 45.0\n", + " GCN4\n", + " 30.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 13\n", - " Z3EV\n", - " 30.0\n", + " RDS2\n", + " 5.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 14\n", - " GCN4\n", - " 90.0\n", - " ZEV\n", + " MAC1\n", + " 0.0\n", + " GEV\n", " P\n", " 2\n", " \n", " \n", " 15\n", - " Z3EV\n", - " 15.0\n", - " ZEV\n", + " MAC1\n", + " 45.0\n", + " GEV\n", " P\n", " 2\n", " \n", " \n", " 16\n", - " GCN4\n", - " 45.0\n", - " ZEV\n", + " MAC1\n", + " 5.0\n", + " GEV\n", " P\n", " 2\n", " \n", " \n", " 17\n", " MAC1\n", - " 5.0\n", + " 15.0\n", " GEV\n", " P\n", " 2\n", " \n", " \n", " 18\n", - " MAC1\n", - " 90.0\n", - " GEV\n", + " RDS2\n", + " 0.0\n", + " ZEV\n", " P\n", " 2\n", " \n", " \n", " 19\n", - " Z3EV\n", - " 45.0\n", + " RDS2\n", + " 90.0\n", " ZEV\n", " P\n", " 2\n", @@ -3207,23 +3208,23 @@ " \n", " 20\n", " RDS2\n", - " 10.0\n", + " 30.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 21\n", - " GCN4\n", - " 15.0\n", + " Z3EV\n", + " 45.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 22\n", - " RDS2\n", - " 90.0\n", + " GCN4\n", + " 0.0\n", " ZEV\n", " P\n", " 2\n", @@ -3231,24 +3232,24 @@ " \n", " 23\n", " RDS2\n", - " 0.0\n", + " 20.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 24\n", - " Z3EV\n", - " 5.0\n", - " ZEV\n", + " MAC1\n", + " 90.0\n", + " GEV\n", " P\n", " 2\n", " \n", " \n", " 25\n", - " Z3EV\n", - " 90.0\n", - " ZEV\n", + " MAC1\n", + " 30.0\n", + " GEV\n", " P\n", " 2\n", " \n", @@ -3262,8 +3263,8 @@ " \n", " \n", " 27\n", - " RDS2\n", - " 30.0\n", + " GCN4\n", + " 45.0\n", " ZEV\n", " P\n", " 2\n", @@ -3286,7 +3287,7 @@ " \n", " \n", " 30\n", - " Z3EV\n", + " RDS2\n", " 10.0\n", " ZEV\n", " P\n", @@ -3294,33 +3295,33 @@ " \n", " \n", " 31\n", - " RDS2\n", - " 20.0\n", + " GCN4\n", + " 15.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 32\n", - " MAC1\n", - " 45.0\n", - " GEV\n", + " Z3EV\n", + " 15.0\n", + " ZEV\n", " P\n", " 2\n", " \n", " \n", " 33\n", - " MAC1\n", - " 15.0\n", - " GEV\n", + " GCN4\n", + " 90.0\n", + " ZEV\n", " P\n", " 2\n", " \n", " \n", " 34\n", - " MAC1\n", - " 30.0\n", - " GEV\n", + " Z3EV\n", + " 90.0\n", + " ZEV\n", " P\n", " 2\n", " \n", @@ -3330,41 +3331,41 @@ ], "text/plain": [ " regulator_symbol time mechanism restriction n\n", - "0 SWI1 15.0 ZEV P 3\n", - "1 SWI1 30.0 ZEV P 3\n", - "2 SWI1 45.0 ZEV P 3\n", - "3 SWI1 5.0 ZEV P 3\n", - "4 SWI1 0.0 ZEV P 3\n", - "5 SWI1 90.0 ZEV P 3\n", - "6 SWI1 10.0 ZEV P 3\n", - "7 SWI1 20.0 ZEV P 3\n", - "8 GCN4 0.0 ZEV P 2\n", - "9 GCN4 30.0 ZEV P 2\n", - "10 MAC1 0.0 GEV P 2\n", - "11 RDS2 5.0 ZEV P 2\n", - "12 RDS2 45.0 ZEV P 2\n", - "13 Z3EV 30.0 ZEV P 2\n", - "14 GCN4 90.0 ZEV P 2\n", - "15 Z3EV 15.0 ZEV P 2\n", - "16 GCN4 45.0 ZEV P 2\n", - "17 MAC1 5.0 GEV P 2\n", - "18 MAC1 90.0 GEV P 2\n", - "19 Z3EV 45.0 ZEV P 2\n", - "20 RDS2 10.0 ZEV P 2\n", - "21 GCN4 15.0 ZEV P 2\n", - "22 RDS2 90.0 ZEV P 2\n", - "23 RDS2 0.0 ZEV P 2\n", - "24 Z3EV 5.0 ZEV P 2\n", - "25 Z3EV 90.0 ZEV P 2\n", + "0 SWI1 5.0 ZEV P 3\n", + "1 SWI1 15.0 ZEV P 3\n", + "2 SWI1 90.0 ZEV P 3\n", + "3 SWI1 45.0 ZEV P 3\n", + "4 SWI1 10.0 ZEV P 3\n", + "5 SWI1 20.0 ZEV P 3\n", + "6 SWI1 30.0 ZEV P 3\n", + "7 SWI1 0.0 ZEV P 3\n", + "8 RDS2 45.0 ZEV P 2\n", + "9 Z3EV 30.0 ZEV P 2\n", + "10 Z3EV 5.0 ZEV P 2\n", + "11 Z3EV 10.0 ZEV P 2\n", + "12 GCN4 30.0 ZEV P 2\n", + "13 RDS2 5.0 ZEV P 2\n", + "14 MAC1 0.0 GEV P 2\n", + "15 MAC1 45.0 GEV P 2\n", + "16 MAC1 5.0 GEV P 2\n", + "17 MAC1 15.0 GEV P 2\n", + "18 RDS2 0.0 ZEV P 2\n", + "19 RDS2 90.0 ZEV P 2\n", + "20 RDS2 30.0 ZEV P 2\n", + "21 Z3EV 45.0 ZEV P 2\n", + "22 GCN4 0.0 ZEV P 2\n", + "23 RDS2 20.0 ZEV P 2\n", + "24 MAC1 90.0 GEV P 2\n", + "25 MAC1 30.0 GEV P 2\n", "26 Z3EV 20.0 ZEV P 2\n", - "27 RDS2 30.0 ZEV P 2\n", + "27 GCN4 45.0 ZEV P 2\n", "28 Z3EV 0.0 ZEV P 2\n", "29 RDS2 15.0 ZEV P 2\n", - "30 Z3EV 10.0 ZEV P 2\n", - "31 RDS2 20.0 ZEV P 2\n", - "32 MAC1 45.0 GEV P 2\n", - "33 MAC1 15.0 GEV P 2\n", - "34 MAC1 30.0 GEV P 2" + "30 RDS2 10.0 ZEV P 2\n", + "31 GCN4 15.0 ZEV P 2\n", + "32 Z3EV 15.0 ZEV P 2\n", + "33 GCN4 90.0 ZEV P 2\n", + "34 Z3EV 90.0 ZEV P 2" ] }, "execution_count": 13, @@ -3405,32 +3406,27 @@ "type": "integer" }, { - "name": "regulator_locus_tag", + "name": "date", "rawType": "object", "type": "string" }, { - "name": "regulator_symbol", + "name": "mechanism", "rawType": "object", "type": "string" }, { - "name": "time", - "rawType": "float64", - "type": "float" - }, - { - "name": "mechanism", + "name": "regulator_locus_tag", "rawType": "object", "type": "string" }, { - "name": "restriction", + "name": "regulator_symbol", "rawType": "object", "type": "string" }, { - "name": "date", + "name": "restriction", "rawType": "object", "type": "string" }, @@ -3439,6 +3435,11 @@ "rawType": "object", "type": "string" }, + { + "name": "time", + "rawType": "float64", + "type": "float" + }, { "name": "carbon_source", "rawType": "object", @@ -3450,44 +3451,44 @@ "type": "float" } ], - "ref": "440ab0a2-f84a-4505-8380-e218512394f7", + "ref": "c0a72de9-eed2-47d3-84b9-45c17121eaa1", "rows": [ [ "0", - "1620", + "1636", + "20161117", + "ZEV", "YPL016W", "SWI1", - "20.0", - "ZEV", "P", - "20161117", - "SMY2266a", + "SMY2266c", + "20.0", "glucose", "30.0" ], [ "1", "1628", + "20161117", + "ZEV", "YPL016W", "SWI1", - "20.0", - "ZEV", "P", - "20161117", "SMY2266b", + "20.0", "glucose", "30.0" ], [ "2", - "1636", + "1620", + "20161117", + "ZEV", "YPL016W", "SWI1", - "20.0", - "ZEV", "P", - "20161117", - "SMY2266c", + "SMY2266a", + "20.0", "glucose", "30.0" ] @@ -3517,13 +3518,13 @@ " \n", " \n", " sample_id\n", + " date\n", + " mechanism\n", " regulator_locus_tag\n", " regulator_symbol\n", - " time\n", - " mechanism\n", " restriction\n", - " date\n", " strain\n", + " time\n", " carbon_source\n", " temperature_celsius\n", " \n", @@ -3531,40 +3532,40 @@ " \n", " \n", " 0\n", - " 1620\n", + " 1636\n", + " 20161117\n", + " ZEV\n", " YPL016W\n", " SWI1\n", - " 20.0\n", - " ZEV\n", " P\n", - " 20161117\n", - " SMY2266a\n", + " SMY2266c\n", + " 20.0\n", " glucose\n", " 30.0\n", " \n", " \n", " 1\n", " 1628\n", + " 20161117\n", + " ZEV\n", " YPL016W\n", " SWI1\n", - " 20.0\n", - " ZEV\n", " P\n", - " 20161117\n", " SMY2266b\n", + " 20.0\n", " glucose\n", " 30.0\n", " \n", " \n", " 2\n", - " 1636\n", + " 1620\n", + " 20161117\n", + " ZEV\n", " YPL016W\n", " SWI1\n", - " 20.0\n", - " ZEV\n", " P\n", - " 20161117\n", - " SMY2266c\n", + " SMY2266a\n", + " 20.0\n", " glucose\n", " 30.0\n", " \n", @@ -3573,15 +3574,15 @@ "" ], "text/plain": [ - " sample_id regulator_locus_tag regulator_symbol time mechanism restriction \\\n", - "0 1620 YPL016W SWI1 20.0 ZEV P \n", - "1 1628 YPL016W SWI1 20.0 ZEV P \n", - "2 1636 YPL016W SWI1 20.0 ZEV P \n", + " sample_id date mechanism regulator_locus_tag regulator_symbol \\\n", + "0 1636 20161117 ZEV YPL016W SWI1 \n", + "1 1628 20161117 ZEV YPL016W SWI1 \n", + "2 1620 20161117 ZEV YPL016W SWI1 \n", "\n", - " date strain carbon_source temperature_celsius \n", - "0 20161117 SMY2266a glucose 30.0 \n", - "1 20161117 SMY2266b glucose 30.0 \n", - "2 20161117 SMY2266c glucose 30.0 " + " restriction strain time carbon_source temperature_celsius \n", + "0 P SMY2266c 20.0 glucose 30.0 \n", + "1 P SMY2266b 20.0 glucose 30.0 \n", + "2 P SMY2266a 20.0 glucose 30.0 " ] }, "execution_count": 14, @@ -3610,7 +3611,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "['GCN4', 'MAC1', 'SWI1', 'Z3EV', 'RDS2']\n" + "['GCN4', 'MAC1', 'RDS2', 'Z3EV', 'SWI1']\n" ] } ], @@ -3638,7 +3639,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "['GCN4', 'MAC1', 'SWI1', 'Z3EV', 'RDS2', 'GEV']\n" + "['GCN4', 'MAC1', 'RDS2', 'Z3EV', 'SWI1', 'GEV']\n" ] } ], @@ -3650,7 +3651,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "id": "abed8bc2", "metadata": {}, "outputs": [ @@ -3734,862 +3735,862 @@ "type": "string" } ], - "ref": "b9dead21-45e7-491d-82d4-a2358af05efe", + "ref": "93e4be83-7821-48c1-8980-427ad05543b9", "rows": [ [ "0", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;85", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.004", + "BrentLab/harbison_2004;harbison_2004;105", + "BrentLab/hughes_2006;overexpression;10", + "11.0", + "206.0", + "12.0", + "206.0", + "0.041292917490562644", + "0.017", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "105", "harbison", - "85", - "BrentLab/Hackett_2020;hackett_2020" + "10", + "BrentLab/hughes_2006;overexpression" ], [ "1", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;83", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;108", + "BrentLab/hughes_2006;overexpression;11", + "60.0", + "67.0", + "60.0", + "67.0", + "0.05428351009647073", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "108", "harbison", - "83", - "BrentLab/Hackett_2020;hackett_2020" + "11", + "BrentLab/hughes_2006;overexpression" ], [ "2", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;84", - "2.0", - "1.0", - "3.0", - "1.0", - "0.0", - "0.011", + "BrentLab/harbison_2004;harbison_2004;109", + "BrentLab/hughes_2006;overexpression;11", + "27.0", + "1265.0", + "27.0", + "1265.0", + "0.12321364371741866", + "0.057", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "109", "harbison", - "84", - "BrentLab/Hackett_2020;hackett_2020" + "11", + "BrentLab/hughes_2006;overexpression" ], [ "3", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;78", - "487.0", - "96.0", - "479.0", - "92.0", - "0.4121918908550328", - "0.576", + "BrentLab/harbison_2004;harbison_2004;112", + "BrentLab/hughes_2006;overexpression;12", + "532.0", + "1093.0", + "532.0", + "1093.0", + "0.4363046674390623", + "0.092", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "112", "harbison", - "78", - "BrentLab/Hackett_2020;hackett_2020" + "12", + "BrentLab/hughes_2006;overexpression" ], [ "4", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;81", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;113", + "BrentLab/hughes_2006;overexpression;12", + "10.0", + "556.0", + "10.0", + "556.0", + "0.01756663927480034", + "0.002", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "113", "harbison", - "81", - "BrentLab/Hackett_2020;hackett_2020" + "12", + "BrentLab/hughes_2006;overexpression" ], [ "5", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;33", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;118", + "BrentLab/hughes_2006;overexpression;13", + "574.0", + "354.0", + "574.0", + "354.0", + "0.13894295437217577", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "118", "harbison", - "33", - "BrentLab/Hackett_2020;hackett_2020" + "13", + "BrentLab/hughes_2006;overexpression" ], [ "6", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;73", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;119", + "BrentLab/hughes_2006;overexpression;13", + "251.0", + "492.0", + "251.0", + "492.0", + "0.11808548603694578", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "119", "harbison", - "73", - "BrentLab/Hackett_2020;hackett_2020" + "13", + "BrentLab/hughes_2006;overexpression" ], [ "7", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;47", - "407.0", - "310.0", - "378.0", - "306.0", - "0.2038622347205313", - "0.441", + "BrentLab/harbison_2004;harbison_2004;120", + "BrentLab/hughes_2006;overexpression;13", + "14.0", + "2954.0", + "14.0", + "2954.0", + "0.1616346595561947", + "1.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "120", "harbison", - "47", - "BrentLab/Hackett_2020;hackett_2020" + "13", + "BrentLab/hughes_2006;overexpression" ], [ "8", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;46", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;121", + "BrentLab/hughes_2006;overexpression;13", + "422.0", + "544.0", + "423.0", + "544.0", + "0.401585299611564", + "0.001", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "121", "harbison", - "46", - "BrentLab/Hackett_2020;hackett_2020" + "13", + "BrentLab/hughes_2006;overexpression" ], [ "9", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;45", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;122", + "BrentLab/hughes_2006;overexpression;14", + "842.0", + "152.0", + "842.0", + "152.0", + "0.37750827352885596", + "0.106", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "122", "harbison", - "45", - "BrentLab/Hackett_2020;hackett_2020" + "14", + "BrentLab/hughes_2006;overexpression" ], [ "10", - "BrentLab/harbison_2004;harbison_2004;8", - "BrentLab/Hackett_2020;hackett_2020;48", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;124", + "BrentLab/hughes_2006;overexpression;15", + "402.0", + "1417.0", + "402.0", + "1417.0", + "0.279937313245534", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "8", + "hughes_2006-overexpression", + "124", "harbison", - "48", - "BrentLab/Hackett_2020;hackett_2020" + "15", + "BrentLab/hughes_2006;overexpression" ], [ "11", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;34", - "198.0", - "26.0", - "193.0", - "24.0", - "0.7367526600236447", - "0.512", + "BrentLab/harbison_2004;harbison_2004;137", + "BrentLab/hughes_2006;overexpression;17", + "29.0", + "5.0", + "29.0", + "5.0", + "0.005954520941937803", + "0.043", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "137", "harbison", - "34", - "BrentLab/Hackett_2020;hackett_2020" + "17", + "BrentLab/hughes_2006;overexpression" ], [ "12", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;88", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;141", + "BrentLab/hughes_2006;overexpression;18", + "653.0", + "1620.0", + "654.0", + "1620.0", + "0.442997844156436", + "0.812", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "141", "harbison", - "88", - "BrentLab/Hackett_2020;hackett_2020" + "18", + "BrentLab/hughes_2006;overexpression" ], [ "13", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;79", - "278.0", - "82.0", - "275.0", - "76.0", - "0.3669436052366566", - "0.531", + "BrentLab/harbison_2004;harbison_2004;142", + "BrentLab/hughes_2006;overexpression;18", + "497.0", + "25.0", + "497.0", + "25.0", + "0.3308129606327521", + "0.921", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "142", "harbison", - "79", - "BrentLab/Hackett_2020;hackett_2020" + "18", + "BrentLab/hughes_2006;overexpression" ], [ "14", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;74", - "386.0", - "2.0", - "381.0", - "2.0", - "0.0478033736153071", - "0.596", + "BrentLab/harbison_2004;harbison_2004;150", + "BrentLab/hughes_2006;overexpression;19", + "91.0", + "1948.0", + "91.0", + "1948.0", + "0.2949755757517485", + "0.578", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "150", "harbison", - "74", - "BrentLab/Hackett_2020;hackett_2020" + "19", + "BrentLab/hughes_2006;overexpression" ], [ "15", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;87", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.01", + "BrentLab/harbison_2004;harbison_2004;151", + "BrentLab/hughes_2006;overexpression;21", + "57.0", + "386.0", + "57.0", + "386.0", + "0.0656826352687399", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "151", "harbison", - "87", - "BrentLab/Hackett_2020;hackett_2020" + "21", + "BrentLab/hughes_2006;overexpression" ], [ "16", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;82", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.005", + "BrentLab/harbison_2004;harbison_2004;152", + "BrentLab/hughes_2006;overexpression;21", + "272.0", + "526.0", + "272.0", + "526.0", + "0.2405177062735934", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "152", "harbison", - "82", - "BrentLab/Hackett_2020;hackett_2020" + "21", + "BrentLab/hughes_2006;overexpression" ], [ "17", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;40", - "233.0", - "887.0", - "228.0", - "853.0", - "0.4419109947643979", - "0.306", + "BrentLab/harbison_2004;harbison_2004;153", + "BrentLab/hughes_2006;overexpression;21", + "186.0", + "1060.0", + "186.0", + "1060.0", + "0.20770457061222172", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "153", "harbison", - "40", - "BrentLab/Hackett_2020;hackett_2020" + "21", + "BrentLab/hughes_2006;overexpression" ], [ "18", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;37", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;154", + "BrentLab/hughes_2006;overexpression;21", + "65.0", + "398.0", + "65.0", + "398.0", + "0.10461443622068167", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "154", "harbison", - "37", - "BrentLab/Hackett_2020;hackett_2020" + "21", + "BrentLab/hughes_2006;overexpression" ], [ "19", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;86", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.014", + "BrentLab/harbison_2004;harbison_2004;157", + "BrentLab/hughes_2006;overexpression;22", + "482.0", + "176.0", + "482.0", + "176.0", + "0.14485664209958654", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "157", "harbison", - "86", - "BrentLab/Hackett_2020;hackett_2020" + "22", + "BrentLab/hughes_2006;overexpression" ], [ "20", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;75", - "386.0", - "4.0", - "381.0", - "4.0", - "0.1752790365894595", - "0.871", + "BrentLab/harbison_2004;harbison_2004;158", + "BrentLab/hughes_2006;overexpression;22", + "354.0", + "215.0", + "354.0", + "215.0", + "0.12060713643717419", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "158", "harbison", - "75", - "BrentLab/Hackett_2020;hackett_2020" + "22", + "BrentLab/hughes_2006;overexpression" ], [ "21", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;77", - "487.0", - "15.0", - "479.0", - "13.0", - "0.1591137965760322", - "0.23", + "BrentLab/harbison_2004;harbison_2004;159", + "BrentLab/hughes_2006;overexpression;22", + "550.0", + "611.0", + "550.0", + "611.0", + "0.2924649934604871", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "159", "harbison", - "77", - "BrentLab/Hackett_2020;hackett_2020" + "22", + "BrentLab/hughes_2006;overexpression" ], [ "22", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;38", - "28.0", - "394.0", - "29.0", - "375.0", - "0.1464068569498395", - "0.309", + "BrentLab/harbison_2004;harbison_2004;160", + "BrentLab/hughes_2006;overexpression;22", + "77.0", + "625.0", + "77.0", + "625.0", + "0.1062495373846105", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "160", "harbison", - "38", - "BrentLab/Hackett_2020;hackett_2020" + "22", + "BrentLab/hughes_2006;overexpression" ], [ "23", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;36", - "242.0", - "239.0", - "237.0", - "230.0", - "0.4474384543548884", - "0.644", + "BrentLab/harbison_2004;harbison_2004;161", + "BrentLab/hughes_2006;overexpression;23", + "37.0", + "3236.0", + "37.0", + "3236.0", + "0.014875454821573575", + "0.456", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "161", "harbison", - "36", - "BrentLab/Hackett_2020;hackett_2020" + "23", + "BrentLab/hughes_2006;overexpression" ], [ "24", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;35", - "12.0", - "136.0", - "12.0", - "129.0", - "0.1014820131734504", - "0.411", + "BrentLab/harbison_2004;harbison_2004;162", + "BrentLab/hughes_2006;overexpression;24", + "417.0", + "1082.0", + "417.0", + "1082.0", + "0.22690440962955793", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "162", "harbison", - "35", - "BrentLab/Hackett_2020;hackett_2020" + "24", + "BrentLab/hughes_2006;overexpression" ], [ "25", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;39", - "236.0", - "462.0", - "231.0", - "442.0", - "0.4406392501266677", - "0.536", + "BrentLab/harbison_2004;harbison_2004;163", + "BrentLab/hughes_2006;overexpression;24", + "896.0", + "710.0", + "896.0", + "710.0", + "0.41161010647006896", + "0.002", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "163", "harbison", - "39", - "BrentLab/Hackett_2020;hackett_2020" + "24", + "BrentLab/hughes_2006;overexpression" ], [ "26", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;65", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;174", + "BrentLab/hughes_2006;overexpression;26", + "55.0", + "2135.0", + "55.0", + "2135.0", + "0.08879402276624998", + "0.006", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "174", "harbison", - "65", - "BrentLab/Hackett_2020;hackett_2020" + "26", + "BrentLab/hughes_2006;overexpression" ], [ "27", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;80", - "386.0", - "12.0", - "381.0", - "11.0", - "0.1530190500167841", - "0.26", + "BrentLab/harbison_2004;harbison_2004;175", + "BrentLab/hughes_2006;overexpression;27", + "79.0", + "354.0", + "79.0", + "354.0", + "0.36280804176948345", + "0.485", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "175", "harbison", - "80", - "BrentLab/Hackett_2020;hackett_2020" + "27", + "BrentLab/hughes_2006;overexpression" ], [ "28", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;76", - "386.0", - "13.0", - "381.0", - "13.0", - "0.3335221550855992", - "0.723", + "BrentLab/harbison_2004;harbison_2004;176", + "BrentLab/hughes_2006;overexpression;27", + "1.0", + "604.0", + "1.0", + "604.0", + "0.0", + "0.981", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "176", "harbison", - "76", - "BrentLab/Hackett_2020;hackett_2020" + "27", + "BrentLab/hughes_2006;overexpression" ], [ "29", - "BrentLab/harbison_2004;harbison_2004;10", - "BrentLab/Hackett_2020;hackett_2020;48", - "467.0", - "60.0", - "454.0", - "60.0", - "0.1983655120981107", - "0.035", + "BrentLab/harbison_2004;harbison_2004;177", + "BrentLab/hughes_2006;overexpression;28", + "10.0", + "3654.0", + "10.0", + "3654.0", + "0.0", + "1.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "10", + "hughes_2006-overexpression", + "177", "harbison", - "48", - "BrentLab/Hackett_2020;hackett_2020" + "28", + "BrentLab/hughes_2006;overexpression" ], [ "30", - "BrentLab/harbison_2004;harbison_2004;10", - "BrentLab/Hackett_2020;hackett_2020;47", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;178", + "BrentLab/hughes_2006;overexpression;28", + "20.0", + "61.0", + "22.0", + "61.0", + "0.10253010965306489", + "0.707", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "10", + "hughes_2006-overexpression", + "178", "harbison", - "47", - "BrentLab/Hackett_2020;hackett_2020" + "28", + "BrentLab/hughes_2006;overexpression" ], [ "31", - "BrentLab/harbison_2004;harbison_2004;10", - "BrentLab/Hackett_2020;hackett_2020;46", - "284.0", - "47.0", - "278.0", - "46.0", - "0.0992715955737997", - "0.003", + "BrentLab/harbison_2004;harbison_2004;179", + "BrentLab/hughes_2006;overexpression;28", + "6.0", + "1128.0", + "6.0", + "1128.0", + "0.15157064533525078", + "0.968", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "10", + "hughes_2006-overexpression", + "179", "harbison", - "46", - "BrentLab/Hackett_2020;hackett_2020" + "28", + "BrentLab/hughes_2006;overexpression" ], [ "32", - "BrentLab/harbison_2004;harbison_2004;11", - "BrentLab/Hackett_2020;hackett_2020;48", - "472.0", - "1.0", - "459.0", - "1.0", - "0.0", - "0.915", + "BrentLab/harbison_2004;harbison_2004;191", + "BrentLab/hughes_2006;overexpression;29", + "342.0", + "174.0", + "342.0", + "174.0", + "0.42452813230271436", + "0.452", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "11", + "hughes_2006-overexpression", + "191", "harbison", - "48", - "BrentLab/Hackett_2020;hackett_2020" + "29", + "BrentLab/hughes_2006;overexpression" ], [ "33", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;41", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;192", + "BrentLab/hughes_2006;overexpression;30", + "132.0", + "227.0", + "132.0", + "227.0", + "0.22362783869614716", + "0.002", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "192", "harbison", - "41", - "BrentLab/Hackett_2020;hackett_2020" + "30", + "BrentLab/hughes_2006;overexpression" ], [ "34", - "BrentLab/harbison_2004;harbison_2004;16", - "BrentLab/Hackett_2020;hackett_2020;89", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;193", + "BrentLab/hughes_2006;overexpression;30", + "322.0", + "442.0", + "322.0", + "442.0", + "0.40950351528951207", + "0.021", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "16", + "hughes_2006-overexpression", + "193", "harbison", - "89", - "BrentLab/Hackett_2020;hackett_2020" + "30", + "BrentLab/hughes_2006;overexpression" ], [ "35", - "BrentLab/harbison_2004;harbison_2004;8", - "BrentLab/Hackett_2020;hackett_2020;41", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;194", + "BrentLab/hughes_2006;overexpression;30", + "76.0", + "43.0", + "76.0", + "43.0", + "0.12124752831206184", + "0.395", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "8", + "hughes_2006-overexpression", + "194", "harbison", - "41", - "BrentLab/Hackett_2020;hackett_2020" + "30", + "BrentLab/hughes_2006;overexpression" ], [ "36", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;43", - "2.0", - "330.0", - "2.0", - "318.0", - "0.0", - "0.195", + "BrentLab/harbison_2004;harbison_2004;201", + "BrentLab/hughes_2006;overexpression;31", + "136.0", + "1104.0", + "136.0", + "1104.0", + "0.2752121157648751", + "0.001", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "201", "harbison", - "43", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "37", - "BrentLab/harbison_2004;harbison_2004;16", - "BrentLab/Hackett_2020;hackett_2020;91", - "9.0", - "1.0", - "9.0", - "1.0", + "BrentLab/harbison_2004;harbison_2004;202", + "BrentLab/hughes_2006;overexpression;31", + "287.0", + "36.0", + "287.0", + "36.0", + "0.06401671759841812", "0.0", - "0.019", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "16", + "hughes_2006-overexpression", + "202", "harbison", - "91", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "38", - "BrentLab/harbison_2004;harbison_2004;17", - "BrentLab/Hackett_2020;hackett_2020;91", - "2.0", - "1.0", - "2.0", - "1.0", - "0.0", - "0.008", + "BrentLab/harbison_2004;harbison_2004;203", + "BrentLab/hughes_2006;overexpression;31", + "88.0", + "41.0", + "88.0", + "41.0", + "0.06563294471122981", + "0.003", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "17", + "hughes_2006-overexpression", + "203", "harbison", - "91", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "39", - "BrentLab/harbison_2004;harbison_2004;8", - "BrentLab/Hackett_2020;hackett_2020;43", - "290.0", - "412.0", - "278.0", - "386.0", - "0.4521656634210855", - "0.208", + "BrentLab/harbison_2004;harbison_2004;204", + "BrentLab/hughes_2006;overexpression;31", + "318.0", + "1948.0", + "319.0", + "1948.0", + "0.380107954958676", + "0.57", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "8", + "hughes_2006-overexpression", + "204", "harbison", - "43", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "40", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;66", - "398.0", - "16.0", - "390.0", - "15.0", - "0.2406042358803986", - "0.431", + "BrentLab/harbison_2004;harbison_2004;205", + "BrentLab/hughes_2006;overexpression;31", + "467.0", + "646.0", + "467.0", + "646.0", + "0.42659723019346846", + "0.006", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "205", "harbison", - "66", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "41", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;42", - "122.0", - "212.0", - "120.0", - "206.0", - "0.3447911486822476", - "0.49", + "BrentLab/harbison_2004;harbison_2004;207", + "BrentLab/hughes_2006;overexpression;32", + "55.0", + "230.0", + "56.0", + "230.0", + "0.3233042722751513", + "0.796", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "207", "harbison", - "42", - "BrentLab/Hackett_2020;hackett_2020" + "32", + "BrentLab/hughes_2006;overexpression" ], [ "42", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;72", - "346.0", - "18.0", - "338.0", - "16.0", - "0.22671996124031", - "0.528", + "BrentLab/harbison_2004;harbison_2004;208", + "BrentLab/hughes_2006;overexpression;32", + "25.0", + "126.0", + "25.0", + "126.0", + "0.0489281862304512", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "208", "harbison", - "72", - "BrentLab/Hackett_2020;hackett_2020" + "32", + "BrentLab/hughes_2006;overexpression" ], [ "43", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;69", - "118.0", - "120.0", - "115.0", - "113.0", - "0.3139880952380952", - "0.454", + "BrentLab/harbison_2004;harbison_2004;209", + "BrentLab/hughes_2006;overexpression;32", + "122.0", + "688.0", + "122.0", + "688.0", + "0.10777396924484826", + "0.0", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "209", "harbison", - "69", - "BrentLab/Hackett_2020;hackett_2020" + "32", + "BrentLab/hughes_2006;overexpression" ], [ "44", - "BrentLab/harbison_2004;harbison_2004;20", - "BrentLab/Hackett_2020;hackett_2020;99", - "3.0", - "1.0", - "4.0", - "1.0", - "0.0", - "0.006", + "BrentLab/harbison_2004;harbison_2004;210", + "BrentLab/hughes_2006;overexpression;33", + "97.0", + "2113.0", + "97.0", + "2113.0", + "0.30052307036231024", + "0.807", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "20", + "hughes_2006-overexpression", + "210", "harbison", - "99", - "BrentLab/Hackett_2020;hackett_2020" + "33", + "BrentLab/hughes_2006;overexpression" ], [ "45", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;70", - "260.0", - "17.0", - "256.0", - "17.0", - "0.1850671373200443", - "0.455", + "BrentLab/harbison_2004;harbison_2004;219", + "BrentLab/hughes_2006;overexpression;34", + "172.0", + "245.0", + "172.0", + "245.0", + "0.41551695727724847", + "0.505", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "219", "harbison", - "70", - "BrentLab/Hackett_2020;hackett_2020" + "34", + "BrentLab/hughes_2006;overexpression" ], [ "46", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;67", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;225", + "BrentLab/hughes_2006;overexpression;35", + "314.0", + "12.0", + "314.0", + "12.0", + "0.15336823656300558", + "0.877", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "225", "harbison", - "67", - "BrentLab/Hackett_2020;hackett_2020" + "35", + "BrentLab/hughes_2006;overexpression" ], [ "47", - "BrentLab/harbison_2004;harbison_2004;20", - "BrentLab/Hackett_2020;hackett_2020;97", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;228", + "BrentLab/hughes_2006;overexpression;36", + "358.0", + "2316.0", + "358.0", + "2316.0", + "0.33853600995025945", + "0.804", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "20", + "hughes_2006-overexpression", + "228", "harbison", - "97", - "BrentLab/Hackett_2020;hackett_2020" + "36", + "BrentLab/hughes_2006;overexpression" ], [ "48", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;68", - "260.0", - "2.0", - "256.0", - "2.0", - "0.0317379568106312", - "0.647", + "BrentLab/harbison_2004;harbison_2004;231", + "BrentLab/hughes_2006;overexpression;38", + "77.0", + "362.0", + "77.0", + "362.0", + "0.32227814728264126", + "0.36", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "231", "harbison", - "68", - "BrentLab/Hackett_2020;hackett_2020" + "38", + "BrentLab/hughes_2006;overexpression" ], [ "49", - "BrentLab/harbison_2004;harbison_2004;10", - "BrentLab/Hackett_2020;hackett_2020;41", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;232", + "BrentLab/hughes_2006;overexpression;38", + "40.0", + "3302.0", + "41.0", + "3302.0", + "0.01832419557792558", + "0.593", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "10", + "hughes_2006-overexpression", + "232", "harbison", - "41", - "BrentLab/Hackett_2020;hackett_2020" + "38", + "BrentLab/hughes_2006;overexpression" ] ], "shape": { "columns": 14, - "rows": 9604 + "rows": 29804 } }, "text/html": [ @@ -4630,88 +4631,88 @@ " \n", " \n", " 0\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;85\n", - " 2.0\n", - " 2.0\n", - " 3.0\n", - " 2.0\n", - " 0.000225\n", - " 0.004\n", + " BrentLab/harbison_2004;harbison_2004;105\n", + " BrentLab/hughes_2006;overexpression;10\n", + " 11.0\n", + " 206.0\n", + " 12.0\n", + " 206.0\n", + " 0.041293\n", + " 0.017\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 105\n", " harbison\n", - " 85\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 10\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 1\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;83\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " BrentLab/harbison_2004;harbison_2004;108\n", + " BrentLab/hughes_2006;overexpression;11\n", + " 60.0\n", + " 67.0\n", + " 60.0\n", + " 67.0\n", + " 0.054284\n", + " 0.000\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 108\n", " harbison\n", - " 83\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 11\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 2\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;84\n", - " 2.0\n", - " 1.0\n", - " 3.0\n", - " 1.0\n", - " 0.000000\n", - " 0.011\n", + " BrentLab/harbison_2004;harbison_2004;109\n", + " BrentLab/hughes_2006;overexpression;11\n", + " 27.0\n", + " 1265.0\n", + " 27.0\n", + " 1265.0\n", + " 0.123214\n", + " 0.057\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 109\n", " harbison\n", - " 84\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 11\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 3\n", - " BrentLab/harbison_2004;harbison_2004;4\n", - " BrentLab/Hackett_2020;hackett_2020;78\n", - " 487.0\n", - " 96.0\n", - " 479.0\n", - " 92.0\n", - " 0.412192\n", - " 0.576\n", + " BrentLab/harbison_2004;harbison_2004;112\n", + " BrentLab/hughes_2006;overexpression;12\n", + " 532.0\n", + " 1093.0\n", + " 532.0\n", + " 1093.0\n", + " 0.436305\n", + " 0.092\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 4\n", + " hughes_2006-overexpression\n", + " 112\n", " harbison\n", - " 78\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 12\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 4\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;81\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " BrentLab/harbison_2004;harbison_2004;113\n", + " BrentLab/hughes_2006;overexpression;12\n", + " 10.0\n", + " 556.0\n", + " 10.0\n", + " 556.0\n", + " 0.017567\n", + " 0.002\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 113\n", " harbison\n", - " 81\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 12\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " ...\n", @@ -4731,191 +4732,191 @@ " ...\n", " \n", " \n", - " 9599\n", - " BrentLab/callingcards;annotated_features;804\n", - " BrentLab/kemmeren_2014;kemmeren_2014;901\n", - " 14.0\n", - " 39.0\n", - " 13.0\n", - " 39.0\n", - " 0.000879\n", + " 29799\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;784\n", + " 154.0\n", + " 905.0\n", + " 154.0\n", + " 905.0\n", + " 0.090665\n", " 0.000\n", - " callingcards-annotated_features\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 804\n", - " BrentLab/callingcards;annotated_features\n", - " 901\n", + " 724-692-688\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 784\n", " kemmeren\n", " \n", " \n", - " 9600\n", - " BrentLab/callingcards;annotated_features;805\n", - " BrentLab/kemmeren_2014;kemmeren_2014;1053\n", - " 18.0\n", - " 278.0\n", - " 17.0\n", - " 171.0\n", - " 0.001455\n", - " 0.000\n", - " callingcards-annotated_features\n", + " 29800\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;666\n", + " 215.0\n", + " 108.0\n", + " 215.0\n", + " 108.0\n", + " 0.075036\n", + " 0.005\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 805\n", - " BrentLab/callingcards;annotated_features\n", - " 1053\n", + " 725-435-395\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 666\n", " kemmeren\n", " \n", " \n", - " 9601\n", - " BrentLab/callingcards;annotated_features;808\n", - " BrentLab/kemmeren_2014;kemmeren_2014;218\n", - " 20.0\n", - " 57.0\n", - " 19.0\n", - " 27.0\n", - " 0.003116\n", - " 0.000\n", - " callingcards-annotated_features\n", + " 29801\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;271\n", + " 221.0\n", + " 925.0\n", + " 221.0\n", + " 925.0\n", + " 0.403484\n", + " 0.126\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 808\n", - " BrentLab/callingcards;annotated_features\n", - " 218\n", + " 726-445-424\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 271\n", " kemmeren\n", " \n", " \n", - " 9602\n", - " BrentLab/callingcards;annotated_features;806\n", - " BrentLab/kemmeren_2014;kemmeren_2014;1023\n", - " 10.0\n", - " 9.0\n", - " 11.0\n", - " 9.0\n", - " 0.000000\n", - " 0.000\n", - " callingcards-annotated_features\n", + " 29802\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;1077\n", + " 281.0\n", + " 73.0\n", + " 283.0\n", + " 77.0\n", + " 0.095948\n", + " 0.174\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 806\n", - " BrentLab/callingcards;annotated_features\n", - " 1023\n", + " 79-33\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 1077\n", " kemmeren\n", " \n", " \n", - " 9603\n", - " BrentLab/callingcards;annotated_features;809\n", - " BrentLab/kemmeren_2014;kemmeren_2014;913\n", - " 150.0\n", - " 221.0\n", - " 140.0\n", - " 206.0\n", - " 0.116890\n", + " 29803\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;963\n", + " 526.0\n", + " 227.0\n", + " 527.0\n", + " 227.0\n", + " 0.064919\n", " 0.000\n", - " callingcards-annotated_features\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 809\n", - " BrentLab/callingcards;annotated_features\n", - " 913\n", + " 96-49\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 963\n", " kemmeren\n", " \n", " \n", "\n", - "

9604 rows × 14 columns

\n", + "

29804 rows × 14 columns

\n", "" ], "text/plain": [ - " binding_id \\\n", - "0 BrentLab/harbison_2004;harbison_2004;3 \n", - "1 BrentLab/harbison_2004;harbison_2004;3 \n", - "2 BrentLab/harbison_2004;harbison_2004;3 \n", - "3 BrentLab/harbison_2004;harbison_2004;4 \n", - "4 BrentLab/harbison_2004;harbison_2004;3 \n", - "... ... \n", - "9599 BrentLab/callingcards;annotated_features;804 \n", - "9600 BrentLab/callingcards;annotated_features;805 \n", - "9601 BrentLab/callingcards;annotated_features;808 \n", - "9602 BrentLab/callingcards;annotated_features;806 \n", - "9603 BrentLab/callingcards;annotated_features;809 \n", + " binding_id \\\n", + "0 BrentLab/harbison_2004;harbison_2004;105 \n", + "1 BrentLab/harbison_2004;harbison_2004;108 \n", + "2 BrentLab/harbison_2004;harbison_2004;109 \n", + "3 BrentLab/harbison_2004;harbison_2004;112 \n", + "4 BrentLab/harbison_2004;harbison_2004;113 \n", + "... ... \n", + "29799 BrentLab/callingcards;annotated_features_combi... \n", + "29800 BrentLab/callingcards;annotated_features_combi... \n", + "29801 BrentLab/callingcards;annotated_features_combi... \n", + "29802 BrentLab/callingcards;annotated_features_combi... \n", + "29803 BrentLab/callingcards;annotated_features_combi... \n", "\n", - " perturbation_id binding_rank_threshold \\\n", - "0 BrentLab/Hackett_2020;hackett_2020;85 2.0 \n", - "1 BrentLab/Hackett_2020;hackett_2020;83 NaN \n", - "2 BrentLab/Hackett_2020;hackett_2020;84 2.0 \n", - "3 BrentLab/Hackett_2020;hackett_2020;78 487.0 \n", - "4 BrentLab/Hackett_2020;hackett_2020;81 NaN \n", - "... ... ... \n", - "9599 BrentLab/kemmeren_2014;kemmeren_2014;901 14.0 \n", - "9600 BrentLab/kemmeren_2014;kemmeren_2014;1053 18.0 \n", - "9601 BrentLab/kemmeren_2014;kemmeren_2014;218 20.0 \n", - "9602 BrentLab/kemmeren_2014;kemmeren_2014;1023 10.0 \n", - "9603 BrentLab/kemmeren_2014;kemmeren_2014;913 150.0 \n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/hughes_2006;overexpression;10 11.0 \n", + "1 BrentLab/hughes_2006;overexpression;11 60.0 \n", + "2 BrentLab/hughes_2006;overexpression;11 27.0 \n", + "3 BrentLab/hughes_2006;overexpression;12 532.0 \n", + "4 BrentLab/hughes_2006;overexpression;12 10.0 \n", + "... ... ... \n", + "29799 BrentLab/kemmeren_2014;kemmeren_2014;784 154.0 \n", + "29800 BrentLab/kemmeren_2014;kemmeren_2014;666 215.0 \n", + "29801 BrentLab/kemmeren_2014;kemmeren_2014;271 221.0 \n", + "29802 BrentLab/kemmeren_2014;kemmeren_2014;1077 281.0 \n", + "29803 BrentLab/kemmeren_2014;kemmeren_2014;963 526.0 \n", "\n", - " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", - "0 2.0 3.0 2.0 \n", - "1 NaN NaN NaN \n", - "2 1.0 3.0 1.0 \n", - "3 96.0 479.0 92.0 \n", - "4 NaN NaN NaN \n", - "... ... ... ... \n", - "9599 39.0 13.0 39.0 \n", - "9600 278.0 17.0 171.0 \n", - "9601 57.0 19.0 27.0 \n", - "9602 9.0 11.0 9.0 \n", - "9603 221.0 140.0 206.0 \n", + " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", + "0 206.0 12.0 206.0 \n", + "1 67.0 60.0 67.0 \n", + "2 1265.0 27.0 1265.0 \n", + "3 1093.0 532.0 1093.0 \n", + "4 556.0 10.0 556.0 \n", + "... ... ... ... \n", + "29799 905.0 154.0 905.0 \n", + "29800 108.0 215.0 108.0 \n", + "29801 925.0 221.0 925.0 \n", + "29802 73.0 283.0 77.0 \n", + "29803 227.0 527.0 227.0 \n", "\n", - " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n", - "0 0.000225 0.004 harbison_2004-harbison_2004 \n", - "1 NaN NaN harbison_2004-harbison_2004 \n", - "2 0.000000 0.011 harbison_2004-harbison_2004 \n", - "3 0.412192 0.576 harbison_2004-harbison_2004 \n", - "4 NaN NaN harbison_2004-harbison_2004 \n", - "... ... ... ... \n", - "9599 0.000879 0.000 callingcards-annotated_features \n", - "9600 0.001455 0.000 callingcards-annotated_features \n", - "9601 0.003116 0.000 callingcards-annotated_features \n", - "9602 0.000000 0.000 callingcards-annotated_features \n", - "9603 0.116890 0.000 callingcards-annotated_features \n", + " dto_fdr dto_empirical_pvalue \\\n", + "0 0.041293 0.017 \n", + "1 0.054284 0.000 \n", + "2 0.123214 0.057 \n", + "3 0.436305 0.092 \n", + "4 0.017567 0.002 \n", + "... ... ... \n", + "29799 0.090665 0.000 \n", + "29800 0.075036 0.005 \n", + "29801 0.403484 0.126 \n", + "29802 0.095948 0.174 \n", + "29803 0.064919 0.000 \n", "\n", - " perturbation_repo_dataset binding_id_id \\\n", - "0 Hackett_2020-hackett_2020 3 \n", - "1 Hackett_2020-hackett_2020 3 \n", - "2 Hackett_2020-hackett_2020 3 \n", - "3 Hackett_2020-hackett_2020 4 \n", - "4 Hackett_2020-hackett_2020 3 \n", - "... ... ... \n", - "9599 kemmeren_2014-kemmeren_2014 804 \n", - "9600 kemmeren_2014-kemmeren_2014 805 \n", - "9601 kemmeren_2014-kemmeren_2014 808 \n", - "9602 kemmeren_2014-kemmeren_2014 806 \n", - "9603 kemmeren_2014-kemmeren_2014 809 \n", + " binding_repo_dataset perturbation_repo_dataset \\\n", + "0 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "1 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "2 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "3 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "4 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "... ... ... \n", + "29799 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29800 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29801 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29802 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29803 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", "\n", - " binding_id_source perturbation_id_id \\\n", - "0 harbison 85 \n", - "1 harbison 83 \n", - "2 harbison 84 \n", - "3 harbison 78 \n", - "4 harbison 81 \n", - "... ... ... \n", - "9599 BrentLab/callingcards;annotated_features 901 \n", - "9600 BrentLab/callingcards;annotated_features 1053 \n", - "9601 BrentLab/callingcards;annotated_features 218 \n", - "9602 BrentLab/callingcards;annotated_features 1023 \n", - "9603 BrentLab/callingcards;annotated_features 913 \n", + " binding_id_id binding_id_source \\\n", + "0 105 harbison \n", + "1 108 harbison \n", + "2 109 harbison \n", + "3 112 harbison \n", + "4 113 harbison \n", + "... ... ... \n", + "29799 724-692-688 BrentLab/callingcards;annotated_features_combined \n", + "29800 725-435-395 BrentLab/callingcards;annotated_features_combined \n", + "29801 726-445-424 BrentLab/callingcards;annotated_features_combined \n", + "29802 79-33 BrentLab/callingcards;annotated_features_combined \n", + "29803 96-49 BrentLab/callingcards;annotated_features_combined \n", "\n", - " perturbation_id_source \n", - "0 BrentLab/Hackett_2020;hackett_2020 \n", - "1 BrentLab/Hackett_2020;hackett_2020 \n", - "2 BrentLab/Hackett_2020;hackett_2020 \n", - "3 BrentLab/Hackett_2020;hackett_2020 \n", - "4 BrentLab/Hackett_2020;hackett_2020 \n", - "... ... \n", - "9599 kemmeren \n", - "9600 kemmeren \n", - "9601 kemmeren \n", - "9602 kemmeren \n", - "9603 kemmeren \n", + " perturbation_id_id perturbation_id_source \n", + "0 10 BrentLab/hughes_2006;overexpression \n", + "1 11 BrentLab/hughes_2006;overexpression \n", + "2 11 BrentLab/hughes_2006;overexpression \n", + "3 12 BrentLab/hughes_2006;overexpression \n", + "4 12 BrentLab/hughes_2006;overexpression \n", + "... ... ... \n", + "29799 784 kemmeren \n", + "29800 666 kemmeren \n", + "29801 271 kemmeren \n", + "29802 1077 kemmeren \n", + "29803 963 kemmeren \n", "\n", - "[9604 rows x 14 columns]" + "[29804 rows x 14 columns]" ] }, - "execution_count": 19, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -4926,7 +4927,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "cell-25", "metadata": {}, "outputs": [ @@ -4942,39 +4943,39 @@ "4 448 ACA1 15.0 ZEV \n", "\n", " binding_id \\\n", - "0 BrentLab/callingcards;annotated_features;803 \n", - "1 BrentLab/callingcards;annotated_features;156 \n", + "0 BrentLab/callingcards;annotated_features;156 \n", + "1 BrentLab/callingcards;annotated_features;146 \n", "2 BrentLab/callingcards;annotated_features;126 \n", - "3 BrentLab/callingcards;annotated_features;189 \n", - "4 BrentLab/callingcards;annotated_features;146 \n", + "3 BrentLab/callingcards;annotated_features;803 \n", + "4 BrentLab/callingcards;annotated_features;390 \n", "\n", " perturbation_id binding_rank_threshold \\\n", - "0 BrentLab/Hackett_2020;hackett_2020;448 112.0 \n", - "1 BrentLab/Hackett_2020;hackett_2020;448 31.0 \n", - "2 BrentLab/Hackett_2020;hackett_2020;448 21.0 \n", - "3 BrentLab/Hackett_2020;hackett_2020;448 164.0 \n", - "4 BrentLab/Hackett_2020;hackett_2020;448 23.0 \n", + "0 BrentLab/hackett_2020;hackett_2020;448 374.0 \n", + "1 BrentLab/hackett_2020;hackett_2020;448 452.0 \n", + "2 BrentLab/hackett_2020;hackett_2020;448 437.0 \n", + "3 BrentLab/hackett_2020;hackett_2020;448 110.0 \n", + "4 BrentLab/hackett_2020;hackett_2020;448 50.0 \n", "\n", " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", - "0 98.0 108.0 90.0 \n", - "1 98.0 26.0 90.0 \n", - "2 98.0 17.0 90.0 \n", - "3 154.0 150.0 144.0 \n", - "4 98.0 18.0 90.0 \n", + "0 1.0 376.0 5591.0 \n", + "1 1.0 454.0 5591.0 \n", + "2 1.0 442.0 5591.0 \n", + "3 346.0 113.0 346.0 \n", + "4 417.0 50.0 417.0 \n", "\n", " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n", - "0 0.187319 0.074 callingcards-annotated_features \n", - "1 0.072561 0.047 callingcards-annotated_features \n", - "2 0.061941 0.071 callingcards-annotated_features \n", - "3 0.213716 0.011 callingcards-annotated_features \n", - "4 0.066616 0.171 callingcards-annotated_features \n", + "0 0.000000 1.000 callingcards-annotated_features \n", + "1 0.000000 1.000 callingcards-annotated_features \n", + "2 0.000000 1.000 callingcards-annotated_features \n", + "3 0.236207 0.001 callingcards-annotated_features \n", + "4 0.148370 0.001 callingcards-annotated_features \n", "\n", " perturbation_repo_dataset binding_id_id \\\n", - "0 Hackett_2020-hackett_2020 803 \n", - "1 Hackett_2020-hackett_2020 156 \n", - "2 Hackett_2020-hackett_2020 126 \n", - "3 Hackett_2020-hackett_2020 189 \n", - "4 Hackett_2020-hackett_2020 146 \n", + "0 hackett_2020-hackett_2020 156 \n", + "1 hackett_2020-hackett_2020 146 \n", + "2 hackett_2020-hackett_2020 126 \n", + "3 hackett_2020-hackett_2020 803 \n", + "4 hackett_2020-hackett_2020 390 \n", "\n", " binding_id_source perturbation_id_id \\\n", "0 BrentLab/callingcards;annotated_features 448 \n", @@ -4983,12 +4984,12 @@ "3 BrentLab/callingcards;annotated_features 448 \n", "4 BrentLab/callingcards;annotated_features 448 \n", "\n", - " perturbation_id_source \n", - "0 BrentLab/Hackett_2020;hackett_2020 \n", - "1 BrentLab/Hackett_2020;hackett_2020 \n", - "2 BrentLab/Hackett_2020;hackett_2020 \n", - "3 BrentLab/Hackett_2020;hackett_2020 \n", - "4 BrentLab/Hackett_2020;hackett_2020 \n" + " perturbation_id_source \n", + "0 hackett \n", + "1 hackett \n", + "2 hackett \n", + "3 hackett \n", + "4 hackett \n" ] } ], @@ -5016,7 +5017,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "cell-26", "metadata": {}, "outputs": [], diff --git a/docs/virtual_db_configuration.md b/docs/virtual_db_configuration.md index 45320d4..cbb3fe6 100644 --- a/docs/virtual_db_configuration.md +++ b/docs/virtual_db_configuration.md @@ -10,8 +10,10 @@ levels. repositories: # Each repository defines a "table" in the virtual database BrentLab/harbison_2004: - # REQUIRED: Specify which field is the sample identifier. At this level, it means - # that all datasets have a field `sample_id` that uniquely identifies samples. + # REQUIRED: Specify which column is the sample identifier. The `field` + # value is the actual column name in the parquet data. At the repo level, + # it applies to all datasets in this repository. If not specified at + # either level, the default column name "sample_id" is assumed. sample_id: field: sample_id # Repository-wide properties (apply to all datasets in this repository) @@ -47,8 +49,9 @@ repositories: kemmeren_2014: # optional -- see the note for `db_name` in harbison above db_name: kemmeren - # REQUIRED: If `sample_id` isn't defined at the repo level, then it must be - # defined at the dataset level for each dataset in the repo + # REQUIRED: If `sample_id` isn't defined at the repo level, it must be + # defined at the dataset level. The `field` value is the actual column + # name in the parquet data (does not need to be literally "sample_id"). sample_id: field: sample_id # Same logical fields, different physical paths @@ -152,9 +155,10 @@ Each row relates 2+ samples from other datasets. ### Structure -Comparative datasets use `source_sample` fields instead of a single `sample_id`: +Comparative datasets use `source_sample` fields instead of a single sample +identifier column: - Multiple fields with `role: source_sample` -- Each contains composite identifier: `"repo_id;config_name;sample_id"` +- Each contains composite identifier: `"repo_id;config_name;sample_id_value"` - Example: `binding_id = "BrentLab/harbison_2004;harbison_2004;42"` ### Fields @@ -206,10 +210,11 @@ build on each other. Using `harbison` as an example primary dataset and **1. Metadata view** -One row per unique `sample_id`. Derived columns from the configuration -(e.g., `carbon_source`, `temperature_celsius`) are resolved here using -datacard definitions, factor aliases, and missing value labels. This is -the primary view for querying sample-level metadata. +One row per unique sample identifier (the column configured via +`sample_id: {field: }`). Derived columns from the +configuration (e.g., `carbon_source`, `temperature_celsius`) are resolved +here using datacard definitions, factor aliases, and missing value labels. +This is the primary view for querying sample-level metadata. **2. Raw data view** @@ -239,7 +244,7 @@ or filter by source dataset without parsing composite IDs in SQL. ``` __harbison_parquet (raw parquet, not directly exposed) | - +-> harbison_meta (deduplicated, one row per sample_id, + +-> harbison_meta (deduplicated, one row per sample identifier, | with derived columns from config) | +-> harbison (full parquet joined to harbison_meta) diff --git a/tfbpapi/datacard.py b/tfbpapi/datacard.py index abf94dd..358ab4d 100644 --- a/tfbpapi/datacard.py +++ b/tfbpapi/datacard.py @@ -17,6 +17,7 @@ """ import logging +from dataclasses import dataclass from typing import Any from pydantic import ValidationError @@ -36,6 +37,32 @@ ) +@dataclass +class DatasetSchema: + """ + Complete schema summary for a data configuration. + + Derived entirely from the DataCard YAML -- no DuckDB introspection needed. Used by + VirtualDB to determine column partitioning between data and metadata parquets. + + :ivar data_columns: Column names present in the data parquet :ivar metadata_columns: + Column names that are metadata :ivar join_columns: Columns common to both data and + metadata parquets (used as JOIN keys for external metadata). Empty for + embedded metadata (same parquet, no JOIN needed). :ivar metadata_source: One of + "embedded", "external", or "none" :ivar external_metadata_config: Config name of the + external metadata config, or None if metadata is embedded or absent :ivar + is_partitioned: Whether the data parquet is partitioned + + """ + + data_columns: set[str] + metadata_columns: set[str] + join_columns: set[str] + metadata_source: str + external_metadata_config: str | None + is_partitioned: bool + + class DataCard: """ Parser and explorer for HuggingFace dataset metadata. @@ -91,6 +118,7 @@ def __init__(self, repo_id: str, token: str | None = None): # Cache for parsed card self._dataset_card: DatasetCard | None = None self._metadata_cache: dict[str, list[ExtractedMetadata]] = {} + self._metadata_fields_map: dict[str, list[str]] = {} @property def dataset_card(self) -> DatasetCard: @@ -115,6 +143,7 @@ def _load_and_validate_card(self) -> None: # Validate using Pydantic model self._dataset_card = DatasetCard(**card_data) + self._build_metadata_fields_map() self.logger.debug(f"Successfully validated dataset card for {self.repo_id}") except ValidationError as e: @@ -241,6 +270,186 @@ def get_metadata_relationships( return relationships + def _build_metadata_fields_map(self) -> None: + """ + Build a mapping from data config names to their metadata fields. + + Called during card loading. For each data config, resolves metadata + fields from two sources: + + 1. Embedded: the data config has ``metadata_fields`` listing which + of its own columns are metadata. + 2. External: a separate metadata-type config has ``applies_to`` + including this config name. The metadata fields are the feature + names from that metadata config. + + Embedded takes priority. For external, the first matching metadata + config wins. + + """ + assert self._dataset_card is not None + self._metadata_fields_map = {} + meta_configs = self._dataset_card.get_metadata_configs() + + for data_cfg in self._dataset_card.get_data_configs(): + name = data_cfg.config_name + # Embedded case + if data_cfg.metadata_fields: + self._metadata_fields_map[name] = list(data_cfg.metadata_fields) + continue + # External case: find metadata config with applies_to + for meta_cfg in meta_configs: + if meta_cfg.applies_to and name in meta_cfg.applies_to: + self._metadata_fields_map[name] = [ + f.name for f in meta_cfg.dataset_info.features + ] + break + else: + self.logger.warning( + "No metadata fields found for data config '%s' " + "in repo '%s' -- no embedded metadata_fields and " + "no metadata config with applies_to", + name, + self.repo_id, + ) + + def get_metadata_fields(self, config_name: str) -> list[str] | None: + """ + Get metadata field names for a data configuration. + + Returns pre-computed metadata fields resolved during card loading. + Handles both embedded metadata (``metadata_fields`` on the data + config) and external metadata (separate metadata config with + ``applies_to``). + + :param config_name: Name of the data configuration + :return: List of metadata field names, or None if no metadata + + """ + # Ensure card is loaded (triggers _build_metadata_fields_map) + _ = self.dataset_card + return self._metadata_fields_map.get(config_name) + + def get_data_col_names(self, config_name: str) -> set[str]: + """ + Return the column names from the data config's feature list. + + These are the columns present in the data parquet file, derived directly from + the DataCard feature definitions without any DuckDB introspection. + + :param config_name: Name of the data configuration + :return: Set of column names, empty if config not found + + """ + _ = self.dataset_card # ensure loaded + config = self.get_config(config_name) + if not config: + return set() + return {f.name for f in config.dataset_info.features} + + def get_metadata_config_name(self, config_name: str) -> str | None: + """ + Return the config_name of the external metadata config, if any. + + If the data config has embedded ``metadata_fields``, or if no + metadata config with ``applies_to`` references this config, + returns None. + + :param config_name: Name of the data configuration + :return: The metadata config name, or None + + """ + _ = self.dataset_card # ensure loaded + data_cfg = self.get_config(config_name) + if not data_cfg: + return None + # Embedded metadata -- no external config needed + if data_cfg.metadata_fields: + return None + # Find external metadata config with applies_to + for meta_cfg in self.dataset_card.get_metadata_configs(): + if meta_cfg.applies_to and config_name in meta_cfg.applies_to: + return meta_cfg.config_name + return None + + def get_dataset_schema(self, config_name: str) -> DatasetSchema | None: + """ + Return schema summary for a data configuration. + + Determines whether metadata is embedded or external, which + columns belong to data vs metadata parquets, and which columns + are shared between them (join keys for external metadata). + All information is derived from the DataCard YAML -- no DuckDB + introspection is needed. + + :param config_name: Name of the data configuration + :return: DatasetSchema instance, or None if config not found + + Example -- embedded metadata:: + + schema = card.get_dataset_schema("harbison_2004") + # schema.metadata_source == "embedded" + # schema.join_columns == set() (same parquet, no JOIN) + + Example -- external metadata:: + + schema = card.get_dataset_schema("annotated_features") + # schema.metadata_source == "external" + # schema.external_metadata_config == "annotated_features_meta" + # schema.join_columns == {"id"} (common to both parquets) + + """ + _ = self.dataset_card # ensure loaded + config = self.get_config(config_name) + if not config: + return None + + is_partitioned = bool( + config.dataset_info.partitioning + and config.dataset_info.partitioning.enabled + ) + + # Embedded: metadata_fields lists which of the config's own + # columns are metadata; all live in the same parquet + if config.metadata_fields: + all_cols = {f.name for f in config.dataset_info.features} + meta_cols = set(config.metadata_fields) + data_cols = all_cols - meta_cols + return DatasetSchema( + data_columns=data_cols, + metadata_columns=meta_cols, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=is_partitioned, + ) + + # External: find metadata config with applies_to + for meta_cfg in self.dataset_card.get_metadata_configs(): + if meta_cfg.applies_to and config_name in meta_cfg.applies_to: + data_cols = {f.name for f in config.dataset_info.features} + meta_cols = {f.name for f in meta_cfg.dataset_info.features} + join_cols = data_cols & meta_cols + return DatasetSchema( + data_columns=data_cols, + metadata_columns=meta_cols, + join_columns=join_cols, + metadata_source="external", + external_metadata_config=meta_cfg.config_name, + is_partitioned=is_partitioned, + ) + + # No metadata relationship -- treat all columns as data + all_cols = {f.name for f in config.dataset_info.features} + return DatasetSchema( + data_columns=all_cols, + metadata_columns=set(), + join_columns=set(), + metadata_source="none", + external_metadata_config=None, + is_partitioned=is_partitioned, + ) + def get_repository_info(self) -> dict[str, Any]: """Get general repository information.""" card = self.dataset_card @@ -315,12 +524,13 @@ def extract_metadata_schema(self, config_name: str) -> dict[str, Any]: raise DataCardError(f"Configuration '{config_name}' not found") schema: dict[str, Any] = { - "regulator_fields": [], # Fields with role=regulator_identifier - "target_fields": [], # Fields with role=target_identifier - "condition_fields": [], # Fields with role=experimental_condition - "condition_definitions": {}, # Field-level condition details - "top_level_conditions": None, # Repo-level conditions - "config_level_conditions": None, # Config-level conditions + "regulator_fields": [], + "target_fields": [], + "condition_fields": [], + "condition_definitions": {}, + "metadata_fields": None, + "top_level_conditions": None, + "config_level_conditions": None, } for feature in config.dataset_info.features: @@ -333,15 +543,32 @@ def extract_metadata_schema(self, config_name: str) -> dict[str, Any]: if feature.definitions: schema["condition_definitions"][feature.name] = feature.definitions + # Include features from external metadata config + meta_fields = self.get_metadata_fields(config_name) + schema["metadata_fields"] = meta_fields + if meta_fields is not None and not config.metadata_fields: + for meta_cfg in self.dataset_card.get_metadata_configs(): + if meta_cfg.applies_to and config_name in meta_cfg.applies_to: + for feature in meta_cfg.dataset_info.features: + if feature.role == "regulator_identifier": + schema["regulator_fields"].append(feature.name) + elif feature.role == "target_identifier": + schema["target_fields"].append(feature.name) + elif feature.role == "experimental_condition": + schema["condition_fields"].append(feature.name) + if feature.definitions: + schema["condition_definitions"][ + feature.name + ] = feature.definitions + break + # Add top-level conditions (applies to all configs/samples) - # Stored in model_extra as dict if self.dataset_card.model_extra: top_level = self.dataset_card.model_extra.get("experimental_conditions") if top_level: schema["top_level_conditions"] = top_level # Add config-level conditions (applies to this config's samples) - # Stored in model_extra as dict if config.model_extra: config_level = config.model_extra.get("experimental_conditions") if config_level: diff --git a/tfbpapi/models.py b/tfbpapi/models.py index a8660a1..c89281b 100644 --- a/tfbpapi/models.py +++ b/tfbpapi/models.py @@ -876,3 +876,32 @@ def get_property_mappings( mappings.update(dataset_config.property_mappings) return mappings + + def get_sample_id_field(self, repo_id: str, config_name: str) -> str: + """ + Resolve the actual column name for the sample identifier. + + Checks dataset-level ``sample_id`` first, then repo-level, + falling back to ``"sample_id"`` if neither is configured. + + :param repo_id: Repository ID + :param config_name: Dataset/config name + :return: Column name for the sample identifier + + """ + repo_cfg = self.get_repository_config(repo_id) + if not repo_cfg: + return "sample_id" + + # Dataset-level takes precedence + if repo_cfg.dataset and config_name in repo_cfg.dataset: + ds_cfg = repo_cfg.dataset[config_name] + if ds_cfg.sample_id is not None and ds_cfg.sample_id.field: + return ds_cfg.sample_id.field + + # Repo-level fallback + repo_sample_id = repo_cfg.properties.get("sample_id") + if repo_sample_id is not None and repo_sample_id.field is not None: + return repo_sample_id.field + + return "sample_id" diff --git a/tfbpapi/tests/test_datacard.py b/tfbpapi/tests/test_datacard.py index 5f098de..b9228d1 100644 --- a/tfbpapi/tests/test_datacard.py +++ b/tfbpapi/tests/test_datacard.py @@ -5,10 +5,80 @@ import pytest from tfbpapi import DataCard +from tfbpapi.datacard import DatasetSchema from tfbpapi.errors import DataCardError, DataCardValidationError, HfDataFetchError from tfbpapi.models import DatasetType +def _external_metadata_card_data(): + """Card data with external metadata (no embedded metadata_fields).""" + return { + "configs": [ + { + "config_name": "coverage_data", + "description": "Coverage measurements", + "dataset_type": "genome_map", + "default": True, + "data_files": [{"split": "train", "path": "coverage.parquet"}], + "dataset_info": { + "features": [ + { + "name": "sample_id", + "dtype": "integer", + "description": "Sample ID", + }, + { + "name": "chr", + "dtype": "string", + "description": "Chromosome", + "role": "genomic_coordinate", + }, + { + "name": "coverage", + "dtype": "float32", + "description": "Coverage value", + "role": "quantitative_measure", + }, + ] + }, + }, + { + "config_name": "sample_metadata", + "description": "Sample metadata", + "dataset_type": "metadata", + "applies_to": ["coverage_data"], + "data_files": [{"split": "train", "path": "metadata.parquet"}], + "dataset_info": { + "features": [ + { + "name": "sample_id", + "dtype": "integer", + "description": "Sample ID", + }, + { + "name": "batch", + "dtype": "string", + "description": "Batch ID", + }, + { + "name": "regulator_locus_tag", + "dtype": "string", + "description": "TF locus tag", + "role": "regulator_identifier", + }, + { + "name": "regulator_symbol", + "dtype": "string", + "description": "TF symbol", + "role": "regulator_identifier", + }, + ] + }, + }, + ], + } + + class TestDataCard: """Test suite for DataCard class.""" @@ -30,6 +100,7 @@ def test_init( assert datacard.token == test_token assert datacard._dataset_card is None assert datacard._metadata_cache == {} + assert datacard._metadata_fields_map == {} # Check that fetchers were initialized mock_card_fetcher.assert_called_once_with(token=test_token) @@ -447,3 +518,393 @@ def test_extract_partition_values_fetch_error( # Should return empty set on error assert values == set() + + +class TestGetMetadataFields: + """Tests for DataCard.get_metadata_fields().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_embedded_metadata_fields( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Embedded metadata_fields on the data config are returned.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("binding_data") + + assert result == ["regulator_symbol", "experimental_condition"] + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_fields( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """External metadata via applies_to returns feature names.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("coverage_data") + + assert result == [ + "sample_id", + "batch", + "regulator_locus_tag", + "regulator_symbol", + ] + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_no_metadata_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Config with no metadata returns None.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("genomic_features") + + assert result is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Unknown config name returns None.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("nonexistent") + + assert result is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_schema_includes_external_features( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """extract_metadata_schema includes roles from external metadata.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + schema = datacard.extract_metadata_schema("coverage_data") + + # External metadata features with role=regulator_identifier + assert "regulator_locus_tag" in schema["regulator_fields"] + assert "regulator_symbol" in schema["regulator_fields"] + # metadata_fields key populated + assert schema["metadata_fields"] is not None + assert "sample_id" in schema["metadata_fields"] + + +class TestGetMetadataConfigName: + """Tests for DataCard.get_metadata_config_name().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_returns_config_name( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """Returns metadata config name when applies_to matches.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_config_name("coverage_data") + + assert result == "sample_metadata" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_embedded_metadata_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns None when metadata is embedded.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_config_name("binding_data") + + assert result is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns None for unknown config name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_config_name("nonexistent") + + assert result is None + + +class TestGetDataColNames: + """Tests for DataCard.get_data_col_names().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_returns_feature_names( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns column names from the data config's features.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_data_col_names("binding_data") + + # binding_data features: regulator_symbol, target_gene, + # experimental_condition, binding_score + assert isinstance(result, set) + assert result == { + "regulator_symbol", + "target_gene", + "experimental_condition", + "binding_score", + } + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_config_returns_data_features( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """For external metadata, returns data config features only.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + result = datacard.get_data_col_names("coverage_data") + + # coverage_data features: sample_id, chr, coverage + assert result == {"sample_id", "chr", "coverage"} + # Must NOT include metadata-only columns + assert "batch" not in result + assert "regulator_locus_tag" not in result + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_empty_set( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns empty set for unknown config name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_data_col_names("nonexistent") + + assert result == set() + + +class TestGetDatasetSchema: + """Tests for DataCard.get_dataset_schema().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_embedded_metadata_returns_correct_schema( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Embedded metadata produces correct data/metadata column split.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + # binding_data has metadata_fields: [regulator_symbol, + # experimental_condition] and features: regulator_symbol, + # target_gene, experimental_condition, binding_score + result = datacard.get_dataset_schema("binding_data") + + assert result is not None + assert isinstance(result, DatasetSchema) + assert result.metadata_source == "embedded" + assert result.external_metadata_config is None + assert result.join_columns == set() + assert result.metadata_columns == { + "regulator_symbol", + "experimental_condition", + } + # data_columns = all features minus metadata_columns + assert result.data_columns == { + "target_gene", + "binding_score", + } + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_returns_correct_schema( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """External metadata produces correct split and join columns.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + # coverage_data features: sample_id, chr, coverage + # sample_metadata features: sample_id, batch, regulator_locus_tag, + # regulator_symbol + # join_columns = intersection = {sample_id} + result = datacard.get_dataset_schema("coverage_data") + + assert result is not None + assert result.metadata_source == "external" + assert result.external_metadata_config == "sample_metadata" + assert result.data_columns == {"sample_id", "chr", "coverage"} + assert result.metadata_columns == { + "sample_id", + "batch", + "regulator_locus_tag", + "regulator_symbol", + } + assert result.join_columns == {"sample_id"} + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_no_metadata_returns_all_cols_as_data( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Config with no metadata relationship has all cols as data.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + # genomic_features has no metadata_fields and no applies_to + result = datacard.get_dataset_schema("genomic_features") + + assert result is not None + assert result.metadata_source == "none" + assert result.external_metadata_config is None + assert result.metadata_columns == set() + assert result.join_columns == set() + assert result.data_columns == { + "gene_id", + "gene_symbol", + "chromosome", + "start", + "end", + } + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns None for an unknown config name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_dataset_schema("nonexistent") + + assert result is None diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py index e62b840..9bae041 100644 --- a/tfbpapi/tests/test_virtual_db.py +++ b/tfbpapi/tests/test_virtual_db.py @@ -14,6 +14,8 @@ import pytest import yaml # type: ignore +from tfbpapi.datacard import DatasetSchema +from tfbpapi.models import MetadataConfig from tfbpapi.virtual_db import VirtualDB # ------------------------------------------------------------------ @@ -313,27 +315,87 @@ def _make_mock_datacard(repo_id): card.get_config.return_value = config_mock card.get_field_definitions.return_value = HARBISON_CONDITION_DEFS card.get_experimental_conditions.return_value = {} + card.get_metadata_fields.return_value = METADATA_FIELDS["harbison_2004"] + card.get_metadata_config_name.return_value = None + # Harbison: embedded metadata, condition is data col used for + # derived properties; metadata_cols are the three metadata fields + harbison_meta_cols = set(METADATA_FIELDS["harbison_2004"]) + harbison_data_cols = { + "sample_id", + "condition", + "target_locus_tag", + "effect", + "pvalue", + } - harbison_meta_cols + card.get_data_col_names.return_value = { + "sample_id", + "regulator_locus_tag", + "regulator_symbol", + "condition", + "target_locus_tag", + "effect", + "pvalue", + } + card.get_dataset_schema.return_value = DatasetSchema( + data_columns=harbison_data_cols + | { + "sample_id", + "condition", + "target_locus_tag", + "effect", + "pvalue", + }, + metadata_columns=harbison_meta_cols, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) elif repo_id == "BrentLab/kemmeren": config_mock = MagicMock() config_mock.metadata_fields = METADATA_FIELDS["kemmeren_2014"] - # model_extra at config level (no experimental_conditions - # at this level for kemmeren) config_mock.model_extra = {} card.get_config.return_value = config_mock card.get_field_definitions.return_value = {} - # model_extra at top level with experimental_conditions - # wrapper -- matches real DataCard structure dataset_card_mock = MagicMock() dataset_card_mock.model_extra = { "experimental_conditions": KEMMEREN_EXP_CONDITIONS, } card.dataset_card = dataset_card_mock + card.get_metadata_fields.return_value = METADATA_FIELDS["kemmeren_2014"] + card.get_metadata_config_name.return_value = None + kemmeren_meta_cols = set(METADATA_FIELDS["kemmeren_2014"]) + card.get_data_col_names.return_value = { + "sample_id", + "regulator_locus_tag", + "regulator_symbol", + "target_locus_tag", + "effect", + "pvalue", + } + card.get_dataset_schema.return_value = DatasetSchema( + data_columns={ + "sample_id", + "target_locus_tag", + "effect", + "pvalue", + }, + metadata_columns=kemmeren_meta_cols, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) else: config_mock = MagicMock() config_mock.metadata_fields = None card.get_config.return_value = config_mock card.get_field_definitions.return_value = {} card.get_experimental_conditions.return_value = {} + card.get_metadata_fields.return_value = None + card.get_metadata_config_name.return_value = None + card.get_data_col_names.return_value = set() + card.get_dataset_schema.return_value = None return card @@ -780,3 +842,283 @@ def test_lazy_init(self, config_path): v = VirtualDB(config_path) assert v._conn is None assert not v._views_registered + + +# ------------------------------------------------------------------ +# Tests: dynamic sample_id column +# ------------------------------------------------------------------ + + +class TestDynamicSampleId: + """Tests that the sample identifier column is resolved from config.""" + + def test_non_default_sample_id(self, tmp_path, monkeypatch): + """Views work when sample_id maps to a non-default column.""" + import tfbpapi.virtual_db as vdb_module + + # Config uses experiment_id as the sample identifier + config = { + "repositories": { + "TestOrg/custom_id": { + "dataset": { + "custom_data": { + "db_name": "custom", + "sample_id": { + "field": "experiment_id", + }, + "regulator": { + "field": "regulator", + }, + } + } + } + } + } + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config, f) + + # Parquet uses experiment_id (not sample_id) + df = pd.DataFrame( + { + "experiment_id": [100, 100, 200, 200], + "regulator": ["TF1", "TF1", "TF2", "TF2"], + "target": ["G1", "G2", "G1", "G2"], + "score": [1.5, 0.8, 2.1, 0.3], + } + ) + parquet_path = tmp_path / "custom.parquet" + files = { + ("TestOrg/custom_id", "custom_data"): [_write_parquet(parquet_path, df)], + } + + # Mock datacard + mock_card = MagicMock() + mock_card.get_metadata_fields.return_value = [ + "regulator", + ] + mock_card.get_field_definitions.return_value = {} + mock_card.get_experimental_conditions.return_value = {} + mock_card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"experiment_id", "target", "score"}, + metadata_columns={"regulator"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + + v = VirtualDB(config_path) + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: mock_card, + ) + + # Meta view should have experiment_id + regulator + meta_df = v.query("SELECT * FROM custom_meta") + assert "experiment_id" in meta_df.columns + assert len(meta_df) == 2 # 2 distinct samples + + # Enriched raw view should JOIN on experiment_id + raw_df = v.query("SELECT * FROM custom") + assert "experiment_id" in raw_df.columns + assert len(raw_df) == 4 # all rows + + def test_get_sample_id_field_dataset_level(self): + """Dataset-level sample_id takes precedence.""" + config = MetadataConfig.model_validate( + { + "repositories": { + "Org/repo": { + "dataset": { + "ds": { + "sample_id": { + "field": "my_id", + }, + } + } + } + } + } + ) + assert config.get_sample_id_field("Org/repo", "ds") == "my_id" + + def test_get_sample_id_field_repo_level(self): + """Repo-level sample_id used when dataset has none.""" + config = MetadataConfig.model_validate( + { + "repositories": { + "Org/repo": { + "sample_id": {"field": "repo_sid"}, + "dataset": {"ds": {}}, + } + } + } + ) + assert config.get_sample_id_field("Org/repo", "ds") == "repo_sid" + + def test_get_sample_id_field_default(self): + """Falls back to 'sample_id' when not configured.""" + config = MetadataConfig.model_validate( + {"repositories": {"Org/repo": {"dataset": {"ds": {}}}}} + ) + assert config.get_sample_id_field("Org/repo", "ds") == "sample_id" + + def test_get_sample_id_field_dataset_overrides_repo(self): + """Dataset-level overrides repo-level.""" + config = MetadataConfig.model_validate( + { + "repositories": { + "Org/repo": { + "sample_id": {"field": "repo_id_col"}, + "dataset": { + "ds": { + "sample_id": { + "field": "ds_id_col", + }, + } + }, + } + } + } + ) + assert config.get_sample_id_field("Org/repo", "ds") == "ds_id_col" + + +class TestExternalMetadata: + """Tests for datasets with external metadata parquet files.""" + + def test_external_metadata_join(self, tmp_path, monkeypatch): + """Meta view JOINs data and metadata parquet when metadata is in a separate + config.""" + import tfbpapi.virtual_db as vdb_module + + # Data parquet: measurements with sample_id but no + # metadata columns like db_id or batch + data_df = pd.DataFrame( + { + "sample_id": [1, 1, 2, 2], + "target_locus_tag": [ + "YAL001C", + "YAL002W", + "YAL001C", + "YAL002W", + ], + "effect": [1.5, 0.8, 2.1, 0.3], + } + ) + # Metadata parquet: sample-level metadata + meta_df = pd.DataFrame( + { + "sample_id": [1, 2], + "db_id": [101, 102], + "regulator_locus_tag": ["YBR049C", "YDR463W"], + "background_hops": [500, 600], + } + ) + + data_path = _write_parquet(tmp_path / "data.parquet", data_df) + meta_path = _write_parquet(tmp_path / "meta.parquet", meta_df) + + parquet_files = { + ("TestOrg/repo", "chip_data"): [data_path], + ("TestOrg/repo", "sample_metadata"): [meta_path], + } + + config = { + "repositories": { + "TestOrg/repo": { + "sample_id": {"field": "sample_id"}, + "dataset": { + "chip_data": { + "db_name": "chip", + "regulator_locus_tag": { + "field": "regulator_locus_tag", + }, + } + }, + } + } + } + config_file = tmp_path / "config.yaml" + with open(config_file, "w") as f: + yaml.dump(config, f) + + # Mock DataCard: external metadata via applies_to + card = MagicMock() + config_mock = MagicMock() + config_mock.metadata_fields = None # no embedded + card.get_config.return_value = config_mock + card.get_metadata_fields.return_value = [ + "sample_id", + "db_id", + "regulator_locus_tag", + "background_hops", + ] + card.get_metadata_config_name.return_value = "sample_metadata" + # Data parquet columns (from chip_data features) + card.get_data_col_names.return_value = { + "sample_id", + "target_locus_tag", + "effect", + } + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + # External metadata schema: data cols in data parquet, + # metadata cols in metadata parquet, joined on sample_id + card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"sample_id", "target_locus_tag", "effect"}, + metadata_columns={ + "sample_id", + "db_id", + "regulator_locus_tag", + "background_hops", + }, + join_columns={"sample_id"}, + metadata_source="external", + external_metadata_config="sample_metadata", + is_partitioned=False, + ) + + v = VirtualDB(config_file) + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cfg: parquet_files.get((repo_id, cfg), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: card, + ) + + # Trigger view registration + tables = v.tables() + assert "chip" in tables + assert "chip_meta" in tables + + # Meta view should have columns from both parquets + meta_result = v.query("SELECT * FROM chip_meta ORDER BY sample_id") + meta_cols = set(meta_result.columns) + assert "sample_id" in meta_cols + assert "db_id" in meta_cols + assert "regulator_locus_tag" in meta_cols + assert "background_hops" in meta_cols + + # Verify data is correct (joined properly) + assert len(meta_result) == 2 + row1 = meta_result[meta_result["sample_id"] == 1].iloc[0] + assert row1["db_id"] == 101 + assert row1["regulator_locus_tag"] == "YBR049C" + + # Enriched raw view should also work + raw_result = v.query("SELECT * FROM chip ORDER BY sample_id") + assert "db_id" in raw_result.columns + assert len(raw_result) == 4 # 4 data rows diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py index 96097a3..323c809 100644 --- a/tfbpapi/virtual_db.py +++ b/tfbpapi/virtual_db.py @@ -11,8 +11,8 @@ views (one row per sample with derived columns) and full data views (measurement-level data joined to metadata). For comparative analysis datasets, VirtualDB creates expanded views that parse composite ID fields into ``_source`` (aliased to the configured -db_name) and ``_id`` (sample_id) columns. The expectation is that a developer will -use this interface to write SQL queries against the views to provide an API to +db_name) and ``_id`` (sample identifier) columns. The expectation is that a developer +will use this interface to write SQL queries against the views to provide an API to downstream users and applications. Example Usage:: @@ -49,8 +49,9 @@ import duckdb import pandas as pd +from duckdb import BinderException -from tfbpapi.datacard import DataCard +from tfbpapi.datacard import DataCard, DatasetSchema from tfbpapi.models import MetadataConfig logger = logging.getLogger(__name__) @@ -414,6 +415,62 @@ def _register_all_views(self) -> None: parquet_only=comparative, ) + # 1b. Resolve external metadata parquet views. + # When a data config's metadata lives in a separate HF config + # (applies_to), register its parquet as ___metadata_parquet. + # All information is derived from DataCard YAML -- no DuckDB + # introspection needed. + self._dataset_schemas: dict[str, DatasetSchema] = {} + self._external_meta_views: dict[str, str] = {} + for db_name, (repo_id, config_name) in self._db_name_map.items(): + if self._is_comparative(repo_id, config_name): + continue + try: + card = _cached_datacard(repo_id, token=self.token) + schema = card.get_dataset_schema(config_name) + except Exception as exc: + logger.warning( + "Could not get dataset schema for %s/%s: %s", + repo_id, + config_name, + exc, + ) + continue + if schema is not None: + self._dataset_schemas[db_name] = schema + if ( + schema is None + or schema.metadata_source != "external" + or not schema.external_metadata_config + ): + continue + meta_view = f"__{db_name}_metadata_parquet" + files = self._resolve_parquet_files( + repo_id, schema.external_metadata_config + ) + if not files: + logger.warning( + "No parquet files for external metadata config " + "'%s' in repo '%s'", + schema.external_metadata_config, + repo_id, + ) + continue + files_sql = ", ".join(f"'{f}'" for f in files) + try: + self._db.execute( + f"CREATE OR REPLACE VIEW {meta_view} AS " + f"SELECT * FROM read_parquet([{files_sql}])" + ) + except Exception as exc: + logger.warning( + "Failed to create external metadata view '%s': %s", + meta_view, + exc, + ) + continue + self._external_meta_views[db_name] = meta_view + # 2. Metadata views for primary datasets (_meta) # This is based on the metadata defined in the datacard, # and includes any additional derived columns based on the @@ -567,62 +624,150 @@ def _register_raw_view( def _register_meta_view(self, db_name: str, repo_id: str, config_name: str) -> None: """ - Register a ``_meta`` view with one row per sample_id. + Register a ``_meta`` view with one row per sample. - Includes raw metadata columns from the DataCard plus any derived columns from - config property mappings (resolved against DataCard definitions with factor - aliases applied). + Includes metadata columns from the DataCard plus any derived columns + from config property mappings (resolved against DataCard definitions + with factor aliases applied). + + For datasets with external metadata (a separate HF config with + ``applies_to``), JOINs the data parquet to the metadata parquet + on the configured sample_id column. The actual columns in the metadata + parquet are determined by DuckDB introspection (``DESCRIBE``) rather + than the DataCard feature list, because DataCard feature lists are + conceptual schemas that may include columns not physically present + in the parquet files. :param db_name: Base view name for the primary dataset :param repo_id: Repository ID :param config_name: Configuration name + raises ValueError: If no metadata fields are found. + raises BinderException: If view creation fails, with SQL details. + """ parquet_view = f"__{db_name}_parquet" if not self._view_exists(parquet_view): return - meta_cols = self._resolve_metadata_fields(repo_id, config_name) - prop_result = self._resolve_property_columns(repo_id, config_name) + sample_col = self._get_sample_id_col(db_name) - if prop_result is not None: - derived_exprs, prop_raw_cols = prop_result - # Raw cols = metadata_fields + any source fields needed - # by property mappings - if meta_cols is not None: - raw = list(dict.fromkeys(["sample_id"] + meta_cols + prop_raw_cols)) - else: - raw = list(dict.fromkeys(["sample_id"] + prop_raw_cols)) + # Pull ext_meta_view early -- needed for both meta_cols and + # FROM clause construction. + schema: DatasetSchema | None = getattr(self, "_dataset_schemas", {}).get( + db_name + ) + ext_meta_view: str | None = getattr(self, "_external_meta_views", {}).get( + db_name + ) - raw_sql = ", ".join(raw) + is_external = ( + ext_meta_view is not None + and schema is not None + and schema.metadata_source == "external" + ) - # Outer SELECT: raw cols + derived expressions - outer_parts = list(raw) + derived_exprs - outer_sql = ", ".join(outer_parts) + if is_external: + # DataCard feature lists are conceptual -- columns listed there + # may not be physically present in the parquet file. Use DuckDB + # introspection to get the actual columns in the metadata parquet. + assert ext_meta_view is not None + actual_meta_cols: set[str] = set(self._get_view_columns(ext_meta_view)) + meta_cols: list[str] = sorted(actual_meta_cols) + elif schema is not None: + actual_meta_cols = schema.metadata_columns + meta_cols = sorted(actual_meta_cols) + else: + meta_cols = self._resolve_metadata_fields(repo_id, config_name) or [] + actual_meta_cols = set(meta_cols) - self._db.execute( - f"CREATE OR REPLACE VIEW {db_name}_meta AS " - f"SELECT DISTINCT {outer_sql} " - f"FROM (" - f"SELECT DISTINCT {raw_sql} " - f"FROM {parquet_view}" - f") AS __raw" + if not meta_cols: + raise ValueError( + f"No metadata fields found for {repo_id}/{config_name}. " + f"Cannot create meta view '{db_name}_meta'." ) - elif meta_cols is not None: - # Fallback: metadata_fields only, no property mappings - cols = list(dict.fromkeys(["sample_id"] + meta_cols)) - cols_sql = ", ".join(cols) - self._db.execute( - f"CREATE OR REPLACE VIEW {db_name}_meta AS " - f"SELECT DISTINCT {cols_sql} " - f"FROM {parquet_view}" + + # FROM clause: JOIN data + metadata parquets when external, + # plain parquet view otherwise. + if is_external: + assert ext_meta_view is not None + # Use the configured sample_id column as the join key. + # The DataCard feature intersection (schema.join_columns) + # is unreliable because a data config's feature list may + # document columns that are physically only in the metadata + # parquet (present conceptually after a join, not in the + # physical data parquet file). + from_clause = ( + f"{parquet_view} d " f"JOIN {ext_meta_view} m " f"USING ({sample_col})" ) + is_join = True else: - # No metadata_fields at all -- all columns are metadata - self._db.execute( - f"CREATE OR REPLACE VIEW {db_name}_meta AS " - f"SELECT DISTINCT * FROM {parquet_view}" - ) + from_clause = parquet_view + is_join = False + + def qualify(col: str) -> str: + """Return qualified column name for JOIN context.""" + if not is_join: + return col + if col == sample_col: + return col # USING makes join key unqualified + # Use the actual metadata parquet columns (from DuckDB + # introspection) to decide qualification, not the DataCard + # feature list which may be inaccurate. + if col in actual_meta_cols: + return f"m.{col}" + return f"d.{col}" + + # Build SELECT: sample_id + metadata cols (deduplicated) + seen: set[str] = set() + select_parts: list[str] = [] + + def add_col(col: str) -> None: + if col not in seen: + seen.add(col) + select_parts.append(qualify(col)) + + add_col(sample_col) + for col in meta_cols: + add_col(col) + + # Add derived property expressions from the VirtualDB config + prop_result = self._resolve_property_columns(repo_id, config_name) + if prop_result is not None: + derived_exprs, prop_raw_cols = prop_result + # Ensure source columns needed by expressions are selected + for col in prop_raw_cols: + add_col(col) + # Qualify source column references inside CASE WHEN expressions + if is_join: + qualified_exprs = [] + for expr in derived_exprs: + for raw_col in prop_raw_cols: + q = qualify(raw_col) + if q != raw_col: + # Replace bare column name in CASE WHEN patterns + expr = expr.replace( + f"CASE {raw_col} ", f"CASE {q} " + ).replace(f" {raw_col} = ", f" {q} = ") + qualified_exprs.append(expr) + derived_exprs = qualified_exprs + select_parts.extend(derived_exprs) + + cols_sql = ", ".join(select_parts) + sql = ( + f"CREATE OR REPLACE VIEW {db_name}_meta AS " + f"SELECT DISTINCT {cols_sql} FROM {from_clause}" + ) + try: + self._db.execute(sql) + except BinderException as exc: + raise BinderException( + f"Failed to create meta view '{db_name}_meta'.\n" + f" schema: {schema}\n" + f" from_clause: {from_clause}\n" + f" SQL: {sql}\n" + f" error: {exc}" + ) from exc def _enrich_raw_view(self, db_name: str) -> None: """ @@ -648,40 +793,58 @@ def _enrich_raw_view(self, db_name: str) -> None: if not extra_cols: return + sample_col = self._get_sample_id_col(db_name) extra_select = ", ".join(f"m.{c}" for c in sorted(extra_cols)) self._db.execute( f"CREATE OR REPLACE VIEW {db_name} AS " f"SELECT r.*, {extra_select} " f"FROM {parquet_name} r " - f"JOIN {meta_name} m USING (sample_id)" + f"JOIN {meta_name} m USING ({sample_col})" ) def _get_view_columns(self, view: str) -> list[str]: - """Return column names for a view.""" - df = self._db.execute( - f"SELECT column_name FROM information_schema.columns " - f"WHERE table_name = '{view}'" - ).fetchdf() + """ + Return column names for a view. + + Uses ``DESCRIBE`` rather than ``information_schema`` to force + eager schema resolution for ``read_parquet``-backed views, + which DuckDB may evaluate lazily. + + """ + df = self._db.execute(f"DESCRIBE {view}").fetchdf() return df["column_name"].tolist() + def _get_sample_id_col(self, db_name: str) -> str: + """ + Resolve the sample identifier column name for a dataset. + + :param db_name: Resolved database view name + :return: Actual column name for the sample identifier + + """ + repo_id, config_name = self._db_name_map[db_name] + return self.config.get_sample_id_field(repo_id, config_name) + def _resolve_metadata_fields( self, repo_id: str, config_name: str ) -> list[str] | None: """ - Get the metadata_fields list from the DataCard config. + Get metadata field names from the DataCard. + + Delegates to ``DataCard.get_metadata_fields()`` which handles + both embedded metadata_fields and external metadata configs + (via applies_to). :param repo_id: Repository ID :param config_name: Configuration name - :return: List of metadata field names, or None if not specified + :return: List of metadata field names, or None if not found """ try: card = _cached_datacard(repo_id, token=self.token) - config = card.get_config(config_name) - if config and config.metadata_fields: - return list(config.metadata_fields) + return card.get_metadata_fields(config_name) except Exception: - logger.debug( + logger.error( "Could not resolve metadata_fields for %s/%s", repo_id, config_name, @@ -975,7 +1138,7 @@ def _register_comparative_expanded_view( - ``_source`` -- the ``repo_id;config_name`` prefix, aliased to the configured ``db_name`` when available. - - ``_id`` -- the sample_id component. + - ``_id`` -- the sample identifier component. :param db_name: Base view name for the comparative dataset :param ds_cfg: DatasetVirtualDBConfig with ``links`` From f00c15a23d71d27fd56caf7a6c305ec573a9b8d6 Mon Sep 17 00:00:00 2001 From: Chase Mateusiak Date: Wed, 18 Feb 2026 14:02:40 -0600 Subject: [PATCH 2/2] Update tfbpapi/datacard.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tfbpapi/datacard.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tfbpapi/datacard.py b/tfbpapi/datacard.py index 358ab4d..734a5f3 100644 --- a/tfbpapi/datacard.py +++ b/tfbpapi/datacard.py @@ -45,13 +45,15 @@ class DatasetSchema: Derived entirely from the DataCard YAML -- no DuckDB introspection needed. Used by VirtualDB to determine column partitioning between data and metadata parquets. - :ivar data_columns: Column names present in the data parquet :ivar metadata_columns: - Column names that are metadata :ivar join_columns: Columns common to both data and - metadata parquets (used as JOIN keys for external metadata). Empty for - embedded metadata (same parquet, no JOIN needed). :ivar metadata_source: One of - "embedded", "external", or "none" :ivar external_metadata_config: Config name of the - external metadata config, or None if metadata is embedded or absent :ivar - is_partitioned: Whether the data parquet is partitioned + :ivar data_columns: Column names present in the data parquet. + :ivar metadata_columns: Column names that are metadata. + :ivar join_columns: Columns common to both data and metadata parquets (used as JOIN + keys for external metadata). Empty for embedded metadata (same parquet, no JOIN + needed). + :ivar metadata_source: One of ``"embedded"``, ``"external"``, or ``"none"``. + :ivar external_metadata_config: Config name of the external metadata config, or + ``None`` if metadata is embedded or absent. + :ivar is_partitioned: Whether the data parquet is partitioned. """