diff --git a/docs/tutorials/virtual_db_tutorial.ipynb b/docs/tutorials/virtual_db_tutorial.ipynb index 7305146..bb07e75 100644 --- a/docs/tutorials/virtual_db_tutorial.ipynb +++ b/docs/tutorials/virtual_db_tutorial.ipynb @@ -33,7 +33,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Config saved to: /tmp/tmpf610qghb/vdb_config.yaml\n" + "Config saved to: /tmp/tmp_krovt13/vdb_config.yaml\n" ] } ], @@ -41,6 +41,10 @@ "config_yaml = \"\"\"\n", "repositories:\n", " BrentLab/harbison_2004:\n", + " tags:\n", + " assay: binding\n", + " method: chip-chip\n", + " organism: yeast\n", " dataset:\n", " harbison_2004:\n", " db_name: harbison\n", @@ -59,6 +63,10 @@ " field: regulator_symbol\n", "\n", " BrentLab/kemmeren_2014:\n", + " tags:\n", + " assay: perturbation\n", + " method: microarray\n", + " organism: yeast\n", " dataset:\n", " kemmeren_2014:\n", " db_name: kemmeren\n", @@ -75,8 +83,17 @@ " field: regulator_symbol\n", "\n", " BrentLab/hackett_2020:\n", + " # Repo-level tags apply to all datasets in this repository\n", + " tags:\n", + " method: test_overwrite\n", + " organism: yeast\n", " dataset:\n", " hackett_2020:\n", + " # Dataset-level tags: 'assay' is new,\n", + " # 'method' overrides the repo-level value\n", + " tags:\n", + " assay: perturbation\n", + " method: overexpression\n", " db_name: hackett\n", " sample_id:\n", " field: sample_id\n", @@ -169,6 +186,56 @@ "print(repr(vdb))" ] }, + { + "cell_type": "markdown", + "id": "0f10c138", + "metadata": {}, + "source": [ + "## Tags\n", + "\n", + "Tags are arbitrary key/value annotations defined in the configuration. They\n", + "follow the same hierarchy as property mappings: repo-level tags apply to all\n", + "datasets in that repository, and dataset-level tags override repo-level tags\n", + "with the same key.\n", + "\n", + "Use `vdb.get_tags(db_name)` to retrieve the merged tags for any dataset; the\n", + "same merged tags are available via `config.get_tags(repo_id, config_name)`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f7d73db0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "harbison tags: {'assay': 'binding', 'method': 'chip-chip', 'organism': 'yeast'}\n", + "kemmeren tags: {'assay': 'perturbation', 'method': 'microarray', 'organism': 'yeast'}\n", + "hackett tags: {'method': 'overexpression', 'organism': 'yeast', 'assay': 'perturbation'}\n", + "dto tags: {}\n" + ] + } + ], + "source": [ + "\n", + "# Tags are accessible directly from the VirtualDB instance using the db_name.\n", + "# No need to import MetadataConfig or specify repo_id.\n", + "print(\"harbison tags:\", vdb.get_tags(\"harbison\"))\n", + "print(\"kemmeren tags:\", vdb.get_tags(\"kemmeren\"))\n", + "\n", + "# Hackett has tags at both levels:\n", + "# 'organism' comes from the repo level only,\n", + "# 'assay' is added at the dataset level only,\n", + "# 'method' is defined at both levels -- the dataset value wins.\n", + "print(\"hackett tags:\", vdb.get_tags(\"hackett\"))\n", + "\n", + "# Dataset with no tags returns empty dict\n", + "print(\"dto tags:\", vdb.get_tags(\"dto\"))" + ] + }, { "cell_type": "markdown", "id": "cell-5", @@ -187,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "cell-6", "metadata": {}, "outputs": [ @@ -202,10 +269,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6374.32it/s]\n", - "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 12264.05it/s]\n", - "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 9731.56it/s]\n", - "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 21883.33it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10407.70it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 1770.50it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:20<00:00, 20.31s/it]\n", + "No metadata fields found for data config 'dto' in 
repo 'BrentLab/yeast_comparative_analysis' -- no embedded metadata_fields and no metadata config with applies_to\n", + "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 55091.56it/s]\n", "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", @@ -235,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "pdebujnqb9q", "metadata": {}, "outputs": [ @@ -284,7 +352,7 @@ "type": "unknown" } ], - "ref": "8720a362-ea0c-4293-9656-ba6725dcaa3d", + "ref": "955566a4-2a55-483f-a0d4-11f1757f6a28", "rows": [ [ "0", @@ -299,7 +367,7 @@ [ "1", "harbison_meta", - "regulator_locus_tag", + "condition", "VARCHAR", "YES", null, @@ -309,7 +377,7 @@ [ "2", "harbison_meta", - "regulator_symbol", + "regulator_locus_tag", "VARCHAR", "YES", null, @@ -319,7 +387,7 @@ [ "3", "harbison_meta", - "condition", + "regulator_symbol", "VARCHAR", "YES", null, @@ -394,7 +462,7 @@ " \n", " 1\n", " harbison_meta\n", - " regulator_locus_tag\n", + " condition\n", " VARCHAR\n", " YES\n", " None\n", @@ -404,7 +472,7 @@ " \n", " 2\n", " harbison_meta\n", - " regulator_symbol\n", + " regulator_locus_tag\n", " VARCHAR\n", " YES\n", " None\n", @@ -414,7 +482,7 @@ " \n", " 3\n", " harbison_meta\n", - " condition\n", + " regulator_symbol\n", " VARCHAR\n", " YES\n", " None\n", @@ -448,14 +516,14 @@ "text/plain": [ " table column_name column_type null key default extra\n", "0 harbison_meta sample_id INTEGER YES None None None\n", - "1 harbison_meta regulator_locus_tag VARCHAR YES None None None\n", - "2 harbison_meta regulator_symbol VARCHAR YES None None None\n", - "3 harbison_meta condition VARCHAR YES None None None\n", + "1 harbison_meta condition VARCHAR YES None None None\n", + "2 harbison_meta regulator_locus_tag VARCHAR YES None None None\n", + "3 
harbison_meta regulator_symbol VARCHAR YES None None None\n", "4 harbison_meta carbon_source VARCHAR YES None None None\n", "5 harbison_meta temperature_celsius DOUBLE YES None None None" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -468,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "9deee334", "metadata": {}, "outputs": [ @@ -517,7 +585,7 @@ "type": "unknown" } ], - "ref": "001db2c7-a5c2-4561-9b12-35733ce1b2e6", + "ref": "012ff714-cded-469d-9c53-642872a5d487", "rows": [ [ "0", @@ -793,7 +861,7 @@ "10 harbison temperature_celsius DOUBLE YES None None None" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -805,7 +873,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "cell-9", "metadata": {}, "outputs": [ @@ -839,7 +907,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "1a705f1c", "metadata": {}, "outputs": [ @@ -858,17 +926,17 @@ "type": "integer" }, { - "name": "regulator_locus_tag", + "name": "condition", "rawType": "object", "type": "string" }, { - "name": "regulator_symbol", + "name": "regulator_locus_tag", "rawType": "object", "type": "string" }, { - "name": "condition", + "name": "regulator_symbol", "rawType": "object", "type": "string" }, @@ -883,50 +951,50 @@ "type": "float" } ], - "ref": "e5bb4909-b231-44d7-85b8-5219b51f4a4b", + "ref": "b5c797e5-c834-4f9d-a8fc-89789ef0cc68", "rows": [ [ "0", - "166", - "YIL131C", - "FKH1", + "300", "YPD", + "YOL116W", + "MSN1", "glucose", "30.0" ], [ "1", - "3", - "YBL005W", - "PDR3", + "113", "YPD", + "YGL035C", + "MIG1", "glucose", "30.0" ], [ "2", - "173", - "YIR023W", - "DAL81", - "YPD", + "81", + "RAPA", + "YEL009C", + "GCN4", "glucose", "30.0" ], [ "3", - "220", - "YLR014C", - "PPR1", + "279", "YPD", + "YNL139C", + "THO2", "glucose", "30.0" ], [ "4", - "83", - "YEL009C", - "GCN4", - 
"YPD", + "73", + "H2O2Hi", + "YDR423C", + "CAD1", "glucose", "30.0" ] @@ -956,9 +1024,9 @@ " \n", " \n", " sample_id\n", + " condition\n", " regulator_locus_tag\n", " regulator_symbol\n", - " condition\n", " carbon_source\n", " temperature_celsius\n", " \n", @@ -966,46 +1034,46 @@ " \n", " \n", " 0\n", - " 166\n", - " YIL131C\n", - " FKH1\n", + " 300\n", " YPD\n", + " YOL116W\n", + " MSN1\n", " glucose\n", " 30.0\n", " \n", " \n", " 1\n", - " 3\n", - " YBL005W\n", - " PDR3\n", + " 113\n", " YPD\n", + " YGL035C\n", + " MIG1\n", " glucose\n", " 30.0\n", " \n", " \n", " 2\n", - " 173\n", - " YIR023W\n", - " DAL81\n", - " YPD\n", + " 81\n", + " RAPA\n", + " YEL009C\n", + " GCN4\n", " glucose\n", " 30.0\n", " \n", " \n", " 3\n", - " 220\n", - " YLR014C\n", - " PPR1\n", + " 279\n", " YPD\n", + " YNL139C\n", + " THO2\n", " glucose\n", " 30.0\n", " \n", " \n", " 4\n", - " 83\n", - " YEL009C\n", - " GCN4\n", - " YPD\n", + " 73\n", + " H2O2Hi\n", + " YDR423C\n", + " CAD1\n", " glucose\n", " 30.0\n", " \n", @@ -1014,12 +1082,12 @@ "" ], "text/plain": [ - " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", - "0 166 YIL131C FKH1 YPD glucose \n", - "1 3 YBL005W PDR3 YPD glucose \n", - "2 173 YIR023W DAL81 YPD glucose \n", - "3 220 YLR014C PPR1 YPD glucose \n", - "4 83 YEL009C GCN4 YPD glucose \n", + " sample_id condition regulator_locus_tag regulator_symbol carbon_source \\\n", + "0 300 YPD YOL116W MSN1 glucose \n", + "1 113 YPD YGL035C MIG1 glucose \n", + "2 81 RAPA YEL009C GCN4 glucose \n", + "3 279 YPD YNL139C THO2 glucose \n", + "4 73 H2O2Hi YDR423C CAD1 glucose \n", "\n", " temperature_celsius \n", "0 30.0 \n", @@ -1029,7 +1097,7 @@ "4 30.0 " ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1055,7 +1123,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "cell-17", "metadata": {}, "outputs": [ @@ -1124,75 +1192,75 @@ "type": "float" } ], - "ref": 
"a6cb8a91-c1c2-4bc8-af51-12e900d7a4bf", + "ref": "06d7e391-9665-4a5f-9276-359ff8e71c3e", "rows": [ [ "0", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR204W", "YPR204W", - "0.78449615", - "0.53566521", + "0.85288861", + "0.76943045", "glucose", "30.0" ], [ "1", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR203W", "YPR203W", - "1.4509147", - "0.95955603", + "1.2490028", + "0.11237602", "glucose", "30.0" ], [ "2", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR202W", "YPR202W", - "1.4509147", - "0.95955603", + "1.2490028", + "0.11237602", "glucose", "30.0" ], [ "3", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR201W", "ARR3", - "0.92586339", - "0.45367192", + "1.5137073", + "0.1681333", "glucose", "30.0" ], [ "4", - "14", - "13.0", + "15", + "14.0", "YBR049C", "REB1", - "H2O2Lo", + "YPD", "YPR200C", "ARR2", - "0.92586339", - "0.45367192", + "1.5137073", + "0.1681333", "glucose", "30.0" ] @@ -1237,71 +1305,71 @@ " \n", " \n", " 0\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR204W\n", " YPR204W\n", - " 0.784496\n", - " 0.535665\n", + " 0.852889\n", + " 0.769430\n", " glucose\n", " 30.0\n", " \n", " \n", " 1\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR203W\n", " YPR203W\n", - " 1.450915\n", - " 0.959556\n", + " 1.249003\n", + " 0.112376\n", " glucose\n", " 30.0\n", " \n", " \n", " 2\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR202W\n", " YPR202W\n", - " 1.450915\n", - " 0.959556\n", + " 1.249003\n", + " 0.112376\n", " glucose\n", " 30.0\n", " \n", " \n", " 3\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR201W\n", " ARR3\n", - " 0.925863\n", - " 0.453672\n", + " 1.513707\n", + " 
0.168133\n", " glucose\n", " 30.0\n", " \n", " \n", " 4\n", - " 14\n", - " 13.0\n", + " 15\n", + " 14.0\n", " YBR049C\n", " REB1\n", - " H2O2Lo\n", + " YPD\n", " YPR200C\n", " ARR2\n", - " 0.925863\n", - " 0.453672\n", + " 1.513707\n", + " 0.168133\n", " glucose\n", " 30.0\n", " \n", @@ -1311,18 +1379,18 @@ ], "text/plain": [ " sample_id db_id regulator_locus_tag regulator_symbol condition \\\n", - "0 14 13.0 YBR049C REB1 H2O2Lo \n", - "1 14 13.0 YBR049C REB1 H2O2Lo \n", - "2 14 13.0 YBR049C REB1 H2O2Lo \n", - "3 14 13.0 YBR049C REB1 H2O2Lo \n", - "4 14 13.0 YBR049C REB1 H2O2Lo \n", + "0 15 14.0 YBR049C REB1 YPD \n", + "1 15 14.0 YBR049C REB1 YPD \n", + "2 15 14.0 YBR049C REB1 YPD \n", + "3 15 14.0 YBR049C REB1 YPD \n", + "4 15 14.0 YBR049C REB1 YPD \n", "\n", " target_locus_tag target_symbol effect pvalue carbon_source \\\n", - "0 YPR204W YPR204W 0.784496 0.535665 glucose \n", - "1 YPR203W YPR203W 1.450915 0.959556 glucose \n", - "2 YPR202W YPR202W 1.450915 0.959556 glucose \n", - "3 YPR201W ARR3 0.925863 0.453672 glucose \n", - "4 YPR200C ARR2 0.925863 0.453672 glucose \n", + "0 YPR204W YPR204W 0.852889 0.769430 glucose \n", + "1 YPR203W YPR203W 1.249003 0.112376 glucose \n", + "2 YPR202W YPR202W 1.249003 0.112376 glucose \n", + "3 YPR201W ARR3 1.513707 0.168133 glucose \n", + "4 YPR200C ARR2 1.513707 0.168133 glucose \n", "\n", " temperature_celsius \n", "0 30.0 \n", @@ -1332,7 +1400,7 @@ "4 30.0 " ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1361,7 +1429,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "cell-19", "metadata": {}, "outputs": [ @@ -1385,7 +1453,7 @@ "type": "integer" } ], - "ref": "9234aaf4-a313-42c2-838a-a13568eed01d", + "ref": "6d8b4d37-3b6b-40f1-833d-aa6711694bcb", "rows": [ [ "0", @@ -1399,17 +1467,17 @@ ], [ "2", - "HSF1", + "STE12", "4" ], [ "3", - "STE12", + "RTG3", "4" ], [ "4", - "RTG3", + "DIG1", "4" ], [ @@ -1419,62 +1487,62 @@ ], [ "6", - 
"SKN7", + "HSF1", "4" ], [ "7", - "DIG1", + "SKN7", "4" ], [ "8", - "GAT1", + "RPN4", "3" ], [ "9", - "RPN4", + "GAT1", "3" ], [ "10", - "YAP7", + "AFT2", "3" ], [ "11", - "TEC1", + "YAP7", "3" ], [ "12", - "AFT1", + "TEC1", "3" ], [ "13", - "MAL33", + "MOT3", "3" ], [ "14", - "PHO2", + "ROX1", "3" ], [ "15", - "MBP1", + "GZF3", "3" ], [ "16", - "KSS1", + "PHO2", "3" ], [ "17", - "SFP1", + "MAL33", "3" ], [ @@ -1484,37 +1552,37 @@ ], [ "19", - "YJL206C", + "SFP1", "3" ], [ "20", - "GZF3", + "KSS1", "3" ], [ "21", - "MOT3", + "YAP6", "3" ], [ "22", - "FHL1", + "RPH1", "3" ], [ "23", - "ROX1", + "NRG1", "3" ], [ "24", - "FKH2", + "PHD1", "3" ], [ "25", - "AFT2", + "FHL1", "3" ], [ @@ -1524,117 +1592,117 @@ ], [ "27", - "RIM101", + "FKH2", "3" ], [ "28", - "YAP6", + "MBP1", "3" ], [ "29", - "RPH1", + "RIM101", "3" ], [ "30", - "PHD1", + "YJL206C", "3" ], [ "31", - "NRG1", + "AFT1", "3" ], [ "32", - "MGA1", + "RLM1", "2" ], [ "33", - "UME1", + "XBP1", "2" ], [ "34", - "YAP3", + "IME4", "2" ], [ "35", - "XBP1", + "MCM1", "2" ], [ "36", - "RDS1", + "DAL80", "2" ], [ "37", - "MSS11", + "YAP3", "2" ], [ "38", - "HAP2", + "YAP5", "2" ], [ "39", - "MCM1", + "MAC1", "2" ], [ "40", - "ADR1", + "UME6", "2" ], [ "41", - "GCN4", + "PDR1", "2" ], [ "42", - "MIG2", + "UME1", "2" ], [ "43", - "SOK2", + "CAD1", "2" ], [ "44", - "RTG1", + "MGA1", "2" ], [ "45", - "MOT2", + "HAP4", "2" ], [ "46", - "UGA3", + "MIG2", "2" ], [ "47", - "PUT3", + "GCN4", "2" ], [ "48", - "YAP5", + "RTG1", "2" ], [ "49", - "UME6", + "PUT3", "2" ] ], @@ -1679,17 +1747,17 @@ " \n", " \n", " 2\n", - " HSF1\n", + " STE12\n", " 4\n", " \n", " \n", " 3\n", - " STE12\n", + " RTG3\n", " 4\n", " \n", " \n", " 4\n", - " RTG3\n", + " DIG1\n", " 4\n", " \n", " \n", @@ -1699,27 +1767,27 @@ " \n", " \n", " 58\n", - " DAL82\n", + " IME1\n", " 2\n", " \n", " \n", " 59\n", - " DAL80\n", + " RDS1\n", " 2\n", " \n", " \n", " 60\n", - " HAP4\n", + " MSS11\n", " 2\n", " \n", " \n", " 61\n", - " PDR1\n", + " HAP2\n", " 2\n", " 
\n", " \n", " 62\n", - " RLM1\n", + " ARR1\n", " 2\n", " \n", " \n", @@ -1731,20 +1799,20 @@ " regulator_symbol n\n", "0 MSN2 6\n", "1 MSN4 5\n", - "2 HSF1 4\n", - "3 STE12 4\n", - "4 RTG3 4\n", + "2 STE12 4\n", + "3 RTG3 4\n", + "4 DIG1 4\n", ".. ... ..\n", - "58 DAL82 2\n", - "59 DAL80 2\n", - "60 HAP4 2\n", - "61 PDR1 2\n", - "62 RLM1 2\n", + "58 IME1 2\n", + "59 RDS1 2\n", + "60 MSS11 2\n", + "61 HAP2 2\n", + "62 ARR1 2\n", "\n", "[63 rows x 2 columns]" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1785,7 +1853,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "cell-21", "metadata": {}, "outputs": [ @@ -1838,6 +1906,11 @@ "rawType": "float64", "type": "float" }, + { + "name": "pr_ranking_column", + "rawType": "object", + "type": "string" + }, { "name": "binding_repo_dataset", "rawType": "object", @@ -1869,62 +1942,65 @@ "type": "string" } ], - "ref": "3464c093-78d3-4dde-9a28-850a7be5d032", + "ref": "1ce4dce9-5191-4116-b848-394fcdb3b5fc", "rows": [ [ "0", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;85", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.004", + "BrentLab/harbison_2004;harbison_2004;105", + "BrentLab/hughes_2006;overexpression;10", + "11.0", + "206.0", + "12.0", + "206.0", + "0.041292917490562644", + "0.017", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "105", "harbison", - "85", - "BrentLab/Hackett_2020;hackett_2020" + "10", + "BrentLab/hughes_2006;overexpression" ], [ "1", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;83", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;108", + "BrentLab/hughes_2006;overexpression;11", + "60.0", + "67.0", + "60.0", + "67.0", + "0.05428351009647073", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - 
"Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "108", "harbison", - "83", - "BrentLab/Hackett_2020;hackett_2020" + "11", + "BrentLab/hughes_2006;overexpression" ], [ "2", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;84", - "2.0", - "1.0", - "3.0", - "1.0", - "0.0", - "0.011", + "BrentLab/harbison_2004;harbison_2004;109", + "BrentLab/hughes_2006;overexpression;11", + "27.0", + "1265.0", + "27.0", + "1265.0", + "0.12321364371741866", + "0.057", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "109", "harbison", - "84", - "BrentLab/Hackett_2020;hackett_2020" + "11", + "BrentLab/hughes_2006;overexpression" ] ], "shape": { - "columns": 14, + "columns": 15, "rows": 3 } }, @@ -1955,6 +2031,7 @@ " perturbation_set_size\n", " dto_fdr\n", " dto_empirical_pvalue\n", + " pr_ranking_column\n", " binding_repo_dataset\n", " perturbation_repo_dataset\n", " binding_id_id\n", @@ -1966,92 +2043,95 @@ " \n", " \n", " 0\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;85\n", - " 2.0\n", - " 2.0\n", - " 3.0\n", - " 2.0\n", - " 0.000225\n", - " 0.004\n", + " BrentLab/harbison_2004;harbison_2004;105\n", + " BrentLab/hughes_2006;overexpression;10\n", + " 11.0\n", + " 206.0\n", + " 12.0\n", + " 206.0\n", + " 0.041293\n", + " 0.017\n", + " log2fc\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 105\n", " harbison\n", - " 85\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 10\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 1\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;83\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " BrentLab/harbison_2004;harbison_2004;108\n", + " BrentLab/hughes_2006;overexpression;11\n", + " 60.0\n", + " 67.0\n", + " 60.0\n", + " 67.0\n", + " 
0.054284\n", + " 0.000\n", + " log2fc\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 108\n", " harbison\n", - " 83\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 11\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 2\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;84\n", - " 2.0\n", - " 1.0\n", - " 3.0\n", - " 1.0\n", - " 0.000000\n", - " 0.011\n", + " BrentLab/harbison_2004;harbison_2004;109\n", + " BrentLab/hughes_2006;overexpression;11\n", + " 27.0\n", + " 1265.0\n", + " 27.0\n", + " 1265.0\n", + " 0.123214\n", + " 0.057\n", + " log2fc\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 109\n", " harbison\n", - " 84\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 11\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", "\n", "" ], "text/plain": [ - " binding_id \\\n", - "0 BrentLab/harbison_2004;harbison_2004;3 \n", - "1 BrentLab/harbison_2004;harbison_2004;3 \n", - "2 BrentLab/harbison_2004;harbison_2004;3 \n", + " binding_id \\\n", + "0 BrentLab/harbison_2004;harbison_2004;105 \n", + "1 BrentLab/harbison_2004;harbison_2004;108 \n", + "2 BrentLab/harbison_2004;harbison_2004;109 \n", "\n", - " perturbation_id binding_rank_threshold \\\n", - "0 BrentLab/Hackett_2020;hackett_2020;85 2.0 \n", - "1 BrentLab/Hackett_2020;hackett_2020;83 NaN \n", - "2 BrentLab/Hackett_2020;hackett_2020;84 2.0 \n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/hughes_2006;overexpression;10 11.0 \n", + "1 BrentLab/hughes_2006;overexpression;11 60.0 \n", + "2 BrentLab/hughes_2006;overexpression;11 27.0 \n", "\n", " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", - "0 2.0 3.0 2.0 \n", - "1 NaN NaN NaN \n", - "2 1.0 3.0 1.0 \n", + "0 206.0 12.0 206.0 \n", + "1 67.0 60.0 67.0 \n", + "2 1265.0 27.0 1265.0 \n", "\n", - " dto_fdr 
dto_empirical_pvalue binding_repo_dataset \\\n", - "0 0.000225 0.004 harbison_2004-harbison_2004 \n", - "1 NaN NaN harbison_2004-harbison_2004 \n", - "2 0.000000 0.011 harbison_2004-harbison_2004 \n", + " dto_fdr dto_empirical_pvalue pr_ranking_column \\\n", + "0 0.041293 0.017 log2fc \n", + "1 0.054284 0.000 log2fc \n", + "2 0.123214 0.057 log2fc \n", "\n", - " perturbation_repo_dataset binding_id_id binding_id_source \\\n", - "0 Hackett_2020-hackett_2020 3 harbison \n", - "1 Hackett_2020-hackett_2020 3 harbison \n", - "2 Hackett_2020-hackett_2020 3 harbison \n", + " binding_repo_dataset perturbation_repo_dataset binding_id_id \\\n", + "0 harbison_2004-harbison_2004 hughes_2006-overexpression 105 \n", + "1 harbison_2004-harbison_2004 hughes_2006-overexpression 108 \n", + "2 harbison_2004-harbison_2004 hughes_2006-overexpression 109 \n", "\n", - " perturbation_id_id perturbation_id_source \n", - "0 85 BrentLab/Hackett_2020;hackett_2020 \n", - "1 83 BrentLab/Hackett_2020;hackett_2020 \n", - "2 84 BrentLab/Hackett_2020;hackett_2020 " + " binding_id_source perturbation_id_id perturbation_id_source \n", + "0 harbison 10 BrentLab/hughes_2006;overexpression \n", + "1 harbison 11 BrentLab/hughes_2006;overexpression \n", + "2 harbison 11 BrentLab/hughes_2006;overexpression " ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -2063,7 +2143,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "cell-22", "metadata": {}, "outputs": [ @@ -2082,17 +2162,17 @@ "type": "integer" }, { - "name": "regulator_locus_tag", + "name": "condition", "rawType": "object", "type": "string" }, { - "name": "regulator_symbol", + "name": "regulator_locus_tag", "rawType": "object", "type": "string" }, { - "name": "condition", + "name": "regulator_symbol", "rawType": "object", "type": "string" }, @@ -2117,117 +2197,117 @@ "type": "float" } ], - "ref": "58c1f0ca-b0a7-4ce7-b29f-f4e789b74707", + "ref": 
"8e604dcf-efad-42a8-a049-7bf684faa9b6", "rows": [ [ "0", - "50", - "YDR043C", - "NRG1", - "H2O2Lo", + "18", + "YPD", + "YBR083W", + "TEC1", "glucose", "30.0", "0.0", - "0.081863152643831" + "0.08188235294117648" ], [ "1", - "213", - "YKL222C", - "YKL222C", - "YPD", + "157", + "H2O2Hi", + "YHR206W", + "SKN7", "glucose", "30.0", "0.0", - "0.0" + "0.13931986462735127" ], [ "2", - "18", - "YBR083W", - "TEC1", + "93", "YPD", + "YER111C", + "SWI4", "glucose", "30.0", "0.0", - "0.0620669105826265" + "0.17005078106191404" ], [ "3", - "7", - "YBL103C", - "RTG3", - "H2O2Hi", + "72", + "YPD", + "YDR421W", + "ARO80", "glucose", "30.0", "0.0", - "0.1577232390460343" + "0.00011392635800218739" ], [ "4", - "277", - "YNL103W", - "MET4", - "YPD", - "glucose", + "71", + "SM", + "YDR421W", + "ARO80", + "unspecified", "30.0", "0.0", - "0.016281512605042" + "0.00011392635800218739" ], [ "5", - "281", - "YNL199C", - "GCR2", - "SM", - "unspecified", + "346", + "RAPA", + "YPR104C", + "FHL1", + "glucose", "30.0", "0.0", - "0.0296346442259623" + "0.019746237283784218" ], [ "6", - "86", - "YER040W", - "GLN3", - "SM", - "unspecified", + "226", + "YPD", + "YLR182W", + "SWI6", + "glucose", "30.0", "0.0", - "0.2298889521004841" + "0.07368989186287292" ], [ "7", - "225", - "YLR176C", - "RFX1", + "286", "YPD", + "YNL309W", + "STB1", "glucose", "30.0", "0.0", - "0.0144559001906082" + "0.1821470588235294" ], [ "8", - "86", - "YER040W", - "GLN3", + "172", "SM", + "YIR023W", + "DAL81", "unspecified", "30.0", "0.0", - "0.0961169019780866" + "0.21656240134694307" ], [ "9", - "225", - "YLR176C", - "RFX1", + "320", "YPD", + "YPL038W", + "MET31", "glucose", "30.0", "0.0", - "0.0335260614428719" + "0.0661219662690251" ] ], "shape": { @@ -2255,9 +2335,9 @@ " \n", " \n", " sample_id\n", + " condition\n", " regulator_locus_tag\n", " regulator_symbol\n", - " condition\n", " carbon_source\n", " temperature_celsius\n", " dto_empirical_pvalue\n", @@ -2267,145 +2347,145 @@ " \n", " \n", " 0\n", - " 50\n", - " 
YDR043C\n", - " NRG1\n", - " H2O2Lo\n", + " 18\n", + " YPD\n", + " YBR083W\n", + " TEC1\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.081863\n", + " 0.081882\n", " \n", " \n", " 1\n", - " 213\n", - " YKL222C\n", - " YKL222C\n", - " YPD\n", + " 157\n", + " H2O2Hi\n", + " YHR206W\n", + " SKN7\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.000000\n", + " 0.139320\n", " \n", " \n", " 2\n", - " 18\n", - " YBR083W\n", - " TEC1\n", + " 93\n", " YPD\n", + " YER111C\n", + " SWI4\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.062067\n", + " 0.170051\n", " \n", " \n", " 3\n", - " 7\n", - " YBL103C\n", - " RTG3\n", - " H2O2Hi\n", + " 72\n", + " YPD\n", + " YDR421W\n", + " ARO80\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.157723\n", + " 0.000114\n", " \n", " \n", " 4\n", - " 277\n", - " YNL103W\n", - " MET4\n", - " YPD\n", - " glucose\n", + " 71\n", + " SM\n", + " YDR421W\n", + " ARO80\n", + " unspecified\n", " 30.0\n", " 0.0\n", - " 0.016282\n", + " 0.000114\n", " \n", " \n", " 5\n", - " 281\n", - " YNL199C\n", - " GCR2\n", - " SM\n", - " unspecified\n", + " 346\n", + " RAPA\n", + " YPR104C\n", + " FHL1\n", + " glucose\n", " 30.0\n", " 0.0\n", - " 0.029635\n", + " 0.019746\n", " \n", " \n", " 6\n", - " 86\n", - " YER040W\n", - " GLN3\n", - " SM\n", - " unspecified\n", + " 226\n", + " YPD\n", + " YLR182W\n", + " SWI6\n", + " glucose\n", " 30.0\n", " 0.0\n", - " 0.229889\n", + " 0.073690\n", " \n", " \n", " 7\n", - " 225\n", - " YLR176C\n", - " RFX1\n", + " 286\n", " YPD\n", + " YNL309W\n", + " STB1\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.014456\n", + " 0.182147\n", " \n", " \n", " 8\n", - " 86\n", - " YER040W\n", - " GLN3\n", + " 172\n", " SM\n", + " YIR023W\n", + " DAL81\n", " unspecified\n", " 30.0\n", " 0.0\n", - " 0.096117\n", + " 0.216562\n", " \n", " \n", " 9\n", - " 225\n", - " YLR176C\n", - " RFX1\n", + " 320\n", " YPD\n", + " YPL038W\n", + " MET31\n", " glucose\n", " 30.0\n", " 0.0\n", - " 0.033526\n", + " 0.066122\n", " \n", " \n", "\n", "" ], "text/plain": [ - " 
sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", - "0 50 YDR043C NRG1 H2O2Lo glucose \n", - "1 213 YKL222C YKL222C YPD glucose \n", - "2 18 YBR083W TEC1 YPD glucose \n", - "3 7 YBL103C RTG3 H2O2Hi glucose \n", - "4 277 YNL103W MET4 YPD glucose \n", - "5 281 YNL199C GCR2 SM unspecified \n", - "6 86 YER040W GLN3 SM unspecified \n", - "7 225 YLR176C RFX1 YPD glucose \n", - "8 86 YER040W GLN3 SM unspecified \n", - "9 225 YLR176C RFX1 YPD glucose \n", + " sample_id condition regulator_locus_tag regulator_symbol carbon_source \\\n", + "0 18 YPD YBR083W TEC1 glucose \n", + "1 157 H2O2Hi YHR206W SKN7 glucose \n", + "2 93 YPD YER111C SWI4 glucose \n", + "3 72 YPD YDR421W ARO80 glucose \n", + "4 71 SM YDR421W ARO80 unspecified \n", + "5 346 RAPA YPR104C FHL1 glucose \n", + "6 226 YPD YLR182W SWI6 glucose \n", + "7 286 YPD YNL309W STB1 glucose \n", + "8 172 SM YIR023W DAL81 unspecified \n", + "9 320 YPD YPL038W MET31 glucose \n", "\n", " temperature_celsius dto_empirical_pvalue dto_fdr \n", - "0 30.0 0.0 0.081863 \n", - "1 30.0 0.0 0.000000 \n", - "2 30.0 0.0 0.062067 \n", - "3 30.0 0.0 0.157723 \n", - "4 30.0 0.0 0.016282 \n", - "5 30.0 0.0 0.029635 \n", - "6 30.0 0.0 0.229889 \n", - "7 30.0 0.0 0.014456 \n", - "8 30.0 0.0 0.096117 \n", - "9 30.0 0.0 0.033526 " + "0 30.0 0.0 0.081882 \n", + "1 30.0 0.0 0.139320 \n", + "2 30.0 0.0 0.170051 \n", + "3 30.0 0.0 0.000114 \n", + "4 30.0 0.0 0.000114 \n", + "5 30.0 0.0 0.019746 \n", + "6 30.0 0.0 0.073690 \n", + "7 30.0 0.0 0.182147 \n", + "8 30.0 0.0 0.216562 \n", + "9 30.0 0.0 0.066122 " ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -2426,7 +2506,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "cell-23", "metadata": {}, "outputs": [ @@ -2460,77 +2540,77 @@ "type": "string" } ], - "ref": "b916ca80-75d1-448d-82a5-c82086ca1ed9", + "ref": "18363370-be4d-4693-8836-96409a2ac869", "rows": [ [ "0", - "289", - 
"DAL82", + "15", + "REB1", "0.0", - "1208" + "100_242" ], [ "1", - "251", - "MAC1", + "303", + "CIN5", "0.0", - "1103" + "1280" ], [ "2", - "321", - "DIG1", + "330", + "CUP9", "0.0", - "1372" + "256" ], [ "3", - "238", - "YAP1", + "114", + "AFT1", "0.0", - "996" + "87" ], [ "4", - "303", - "CIN5", + "9", + "RTG3", "0.0", - "1365" + "57" ], [ "5", - "245", - "ARG81", + "118", + "HSF1", "0.0", - "1023" + "88" ], [ "6", - "184", - "CBF1", + "15", + "REB1", "0.0", - "754" + "100_242" ], [ "7", - "252", - "MAC1", + "162", + "XBP1", "0.0", - "1103" + "24" ], [ "8", - "200", - "PHD1", + "240", + "YAP1", "0.0", - "890" + "182" ], [ "9", - "251", - "MAC1", + "150", + "STP2", "0.0", - "1110" + "604" ] ], "shape": { @@ -2566,73 +2646,73 @@ " \n", " \n", " 0\n", - " 289\n", - " DAL82\n", + " 15\n", + " REB1\n", " 0.0\n", - " 1208\n", + " 100_242\n", " \n", " \n", " 1\n", - " 251\n", - " MAC1\n", + " 303\n", + " CIN5\n", " 0.0\n", - " 1103\n", + " 1280\n", " \n", " \n", " 2\n", - " 321\n", - " DIG1\n", + " 330\n", + " CUP9\n", " 0.0\n", - " 1372\n", + " 256\n", " \n", " \n", " 3\n", - " 238\n", - " YAP1\n", + " 114\n", + " AFT1\n", " 0.0\n", - " 996\n", + " 87\n", " \n", " \n", " 4\n", - " 303\n", - " CIN5\n", + " 9\n", + " RTG3\n", " 0.0\n", - " 1365\n", + " 57\n", " \n", " \n", " 5\n", - " 245\n", - " ARG81\n", + " 118\n", + " HSF1\n", " 0.0\n", - " 1023\n", + " 88\n", " \n", " \n", " 6\n", - " 184\n", - " CBF1\n", + " 15\n", + " REB1\n", " 0.0\n", - " 754\n", + " 100_242\n", " \n", " \n", " 7\n", - " 252\n", - " MAC1\n", + " 162\n", + " XBP1\n", " 0.0\n", - " 1103\n", + " 24\n", " \n", " \n", " 8\n", - " 200\n", - " PHD1\n", + " 240\n", + " YAP1\n", " 0.0\n", - " 890\n", + " 182\n", " \n", " \n", " 9\n", - " 251\n", - " MAC1\n", + " 150\n", + " STP2\n", " 0.0\n", - " 1110\n", + " 604\n", " \n", " \n", "\n", @@ -2640,19 +2720,19 @@ ], "text/plain": [ " harbison_sample_id regulator_symbol dto_empirical_pvalue hackett_sample_id\n", - "0 289 DAL82 0.0 1208\n", - "1 251 MAC1 0.0 
1103\n", - "2 321 DIG1 0.0 1372\n", - "3 238 YAP1 0.0 996\n", - "4 303 CIN5 0.0 1365\n", - "5 245 ARG81 0.0 1023\n", - "6 184 CBF1 0.0 754\n", - "7 252 MAC1 0.0 1103\n", - "8 200 PHD1 0.0 890\n", - "9 251 MAC1 0.0 1110" + "0 15 REB1 0.0 100_242\n", + "1 303 CIN5 0.0 1280\n", + "2 330 CUP9 0.0 256\n", + "3 114 AFT1 0.0 87\n", + "4 9 RTG3 0.0 57\n", + "5 118 HSF1 0.0 88\n", + "6 15 REB1 0.0 100_242\n", + "7 162 XBP1 0.0 24\n", + "8 240 YAP1 0.0 182\n", + "9 150 STP2 0.0 604" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -2690,7 +2770,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "f03e942a", "metadata": {}, "outputs": [ @@ -2729,12 +2809,12 @@ "type": "integer" } ], - "ref": "1185b490-3375-41d0-b61c-0f35dae2b815", + "ref": "0736a331-fb06-4ba3-abe4-dff7ac0e65a3", "rows": [ [ "0", "SWI1", - "15.0", + "5.0", "ZEV", "P", "3" @@ -2750,7 +2830,7 @@ [ "2", "SWI1", - "45.0", + "20.0", "ZEV", "P", "3" @@ -2758,7 +2838,7 @@ [ "3", "SWI1", - "5.0", + "10.0", "ZEV", "P", "3" @@ -2766,7 +2846,7 @@ [ "4", "SWI1", - "0.0", + "90.0", "ZEV", "P", "3" @@ -2774,7 +2854,7 @@ [ "5", "SWI1", - "90.0", + "0.0", "ZEV", "P", "3" @@ -2782,7 +2862,7 @@ [ "6", "SWI1", - "10.0", + "15.0", "ZEV", "P", "3" @@ -2790,31 +2870,31 @@ [ "7", "SWI1", - "20.0", + "45.0", "ZEV", "P", "3" ], [ "8", - "GCN4", - "0.0", + "RDS2", + "10.0", "ZEV", "P", "2" ], [ "9", - "GCN4", - "30.0", - "ZEV", + "MAC1", + "90.0", + "GEV", "P", "2" ], [ "10", "MAC1", - "0.0", + "15.0", "GEV", "P", "2" @@ -2822,22 +2902,22 @@ [ "11", "RDS2", - "5.0", + "20.0", "ZEV", "P", "2" ], [ "12", - "RDS2", + "MAC1", "45.0", - "ZEV", + "GEV", "P", "2" ], [ "13", - "Z3EV", + "RDS2", "30.0", "ZEV", "P", @@ -2846,46 +2926,46 @@ [ "14", "GCN4", - "90.0", + "15.0", "ZEV", "P", "2" ], [ "15", - "Z3EV", - "15.0", - "ZEV", + "MAC1", + "30.0", + "GEV", "P", "2" ], [ "16", - "GCN4", - "45.0", - "ZEV", + "MAC1", + "5.0", + "GEV", "P", "2" ], [ "17", 
- "MAC1", - "5.0", - "GEV", + "GCN4", + "45.0", + "ZEV", "P", "2" ], [ "18", - "MAC1", + "GCN4", "90.0", - "GEV", + "ZEV", "P", "2" ], [ "19", - "Z3EV", + "RDS2", "45.0", "ZEV", "P", @@ -2894,38 +2974,38 @@ [ "20", "RDS2", - "10.0", + "0.0", "ZEV", "P", "2" ], [ "21", - "GCN4", - "15.0", + "RDS2", + "90.0", "ZEV", "P", "2" ], [ "22", - "RDS2", - "90.0", + "GCN4", + "30.0", "ZEV", "P", "2" ], [ "23", - "RDS2", + "MAC1", "0.0", - "ZEV", + "GEV", "P", "2" ], [ "24", - "Z3EV", + "RDS2", "5.0", "ZEV", "P", @@ -2933,88 +3013,24 @@ ], [ "25", - "Z3EV", - "90.0", - "ZEV", - "P", - "2" - ], - [ - "26", - "Z3EV", - "20.0", - "ZEV", - "P", - "2" - ], - [ - "27", - "RDS2", - "30.0", - "ZEV", - "P", - "2" - ], - [ - "28", - "Z3EV", + "GCN4", "0.0", "ZEV", "P", "2" ], [ - "29", + "26", "RDS2", "15.0", "ZEV", "P", "2" - ], - [ - "30", - "Z3EV", - "10.0", - "ZEV", - "P", - "2" - ], - [ - "31", - "RDS2", - "20.0", - "ZEV", - "P", - "2" - ], - [ - "32", - "MAC1", - "45.0", - "GEV", - "P", - "2" - ], - [ - "33", - "MAC1", - "15.0", - "GEV", - "P", - "2" - ], - [ - "34", - "MAC1", - "30.0", - "GEV", - "P", - "2" ] ], "shape": { "columns": 5, - "rows": 35 + "rows": 27 } }, "text/html": [ @@ -3047,7 +3063,7 @@ " \n", " 0\n", " SWI1\n", - " 15.0\n", + " 5.0\n", " ZEV\n", " P\n", " 3\n", @@ -3063,7 +3079,7 @@ " \n", " 2\n", " SWI1\n", - " 45.0\n", + " 20.0\n", " ZEV\n", " P\n", " 3\n", @@ -3071,7 +3087,7 @@ " \n", " 3\n", " SWI1\n", - " 5.0\n", + " 10.0\n", " ZEV\n", " P\n", " 3\n", @@ -3079,7 +3095,7 @@ " \n", " 4\n", " SWI1\n", - " 0.0\n", + " 90.0\n", " ZEV\n", " P\n", " 3\n", @@ -3087,7 +3103,7 @@ " \n", " 5\n", " SWI1\n", - " 90.0\n", + " 0.0\n", " ZEV\n", " P\n", " 3\n", @@ -3095,7 +3111,7 @@ " \n", " 6\n", " SWI1\n", - " 10.0\n", + " 15.0\n", " ZEV\n", " P\n", " 3\n", @@ -3103,31 +3119,31 @@ " \n", " 7\n", " SWI1\n", - " 20.0\n", + " 45.0\n", " ZEV\n", " P\n", " 3\n", " \n", " \n", " 8\n", - " GCN4\n", - " 0.0\n", + " RDS2\n", + " 10.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 
9\n", - " GCN4\n", - " 30.0\n", - " ZEV\n", + " MAC1\n", + " 90.0\n", + " GEV\n", " P\n", " 2\n", " \n", " \n", " 10\n", " MAC1\n", - " 0.0\n", + " 15.0\n", " GEV\n", " P\n", " 2\n", @@ -3135,22 +3151,22 @@ " \n", " 11\n", " RDS2\n", - " 5.0\n", + " 20.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 12\n", - " RDS2\n", + " MAC1\n", " 45.0\n", - " ZEV\n", + " GEV\n", " P\n", " 2\n", " \n", " \n", " 13\n", - " Z3EV\n", + " RDS2\n", " 30.0\n", " ZEV\n", " P\n", @@ -3159,46 +3175,46 @@ " \n", " 14\n", " GCN4\n", - " 90.0\n", + " 15.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 15\n", - " Z3EV\n", - " 15.0\n", - " ZEV\n", + " MAC1\n", + " 30.0\n", + " GEV\n", " P\n", " 2\n", " \n", " \n", " 16\n", - " GCN4\n", - " 45.0\n", - " ZEV\n", + " MAC1\n", + " 5.0\n", + " GEV\n", " P\n", " 2\n", " \n", " \n", " 17\n", - " MAC1\n", - " 5.0\n", - " GEV\n", + " GCN4\n", + " 45.0\n", + " ZEV\n", " P\n", " 2\n", " \n", " \n", " 18\n", - " MAC1\n", + " GCN4\n", " 90.0\n", - " GEV\n", + " ZEV\n", " P\n", " 2\n", " \n", " \n", " 19\n", - " Z3EV\n", + " RDS2\n", " 45.0\n", " ZEV\n", " P\n", @@ -3207,38 +3223,38 @@ " \n", " 20\n", " RDS2\n", - " 10.0\n", + " 0.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 21\n", - " GCN4\n", - " 15.0\n", + " RDS2\n", + " 90.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 22\n", - " RDS2\n", - " 90.0\n", + " GCN4\n", + " 30.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", " 23\n", - " RDS2\n", + " MAC1\n", " 0.0\n", - " ZEV\n", + " GEV\n", " P\n", " 2\n", " \n", " \n", " 24\n", - " Z3EV\n", + " RDS2\n", " 5.0\n", " ZEV\n", " P\n", @@ -3246,128 +3262,56 @@ " \n", " \n", " 25\n", - " Z3EV\n", - " 90.0\n", - " ZEV\n", - " P\n", - " 2\n", - " \n", - " \n", - " 26\n", - " Z3EV\n", - " 20.0\n", - " ZEV\n", - " P\n", - " 2\n", - " \n", - " \n", - " 27\n", - " RDS2\n", - " 30.0\n", - " ZEV\n", - " P\n", - " 2\n", - " \n", - " \n", - " 28\n", - " Z3EV\n", + " GCN4\n", " 0.0\n", " ZEV\n", " P\n", " 2\n", " \n", " \n", - " 29\n", + " 26\n", " RDS2\n", " 15.0\n", " 
ZEV\n", " P\n", " 2\n", " \n", - " \n", - " 30\n", - " Z3EV\n", - " 10.0\n", - " ZEV\n", - " P\n", - " 2\n", - " \n", - " \n", - " 31\n", - " RDS2\n", - " 20.0\n", - " ZEV\n", - " P\n", - " 2\n", - " \n", - " \n", - " 32\n", - " MAC1\n", - " 45.0\n", - " GEV\n", - " P\n", - " 2\n", - " \n", - " \n", - " 33\n", - " MAC1\n", - " 15.0\n", - " GEV\n", - " P\n", - " 2\n", - " \n", - " \n", - " 34\n", - " MAC1\n", - " 30.0\n", - " GEV\n", - " P\n", - " 2\n", - " \n", " \n", "\n", "" ], "text/plain": [ " regulator_symbol time mechanism restriction n\n", - "0 SWI1 15.0 ZEV P 3\n", + "0 SWI1 5.0 ZEV P 3\n", "1 SWI1 30.0 ZEV P 3\n", - "2 SWI1 45.0 ZEV P 3\n", - "3 SWI1 5.0 ZEV P 3\n", - "4 SWI1 0.0 ZEV P 3\n", - "5 SWI1 90.0 ZEV P 3\n", - "6 SWI1 10.0 ZEV P 3\n", - "7 SWI1 20.0 ZEV P 3\n", - "8 GCN4 0.0 ZEV P 2\n", - "9 GCN4 30.0 ZEV P 2\n", - "10 MAC1 0.0 GEV P 2\n", - "11 RDS2 5.0 ZEV P 2\n", - "12 RDS2 45.0 ZEV P 2\n", - "13 Z3EV 30.0 ZEV P 2\n", - "14 GCN4 90.0 ZEV P 2\n", - "15 Z3EV 15.0 ZEV P 2\n", - "16 GCN4 45.0 ZEV P 2\n", - "17 MAC1 5.0 GEV P 2\n", - "18 MAC1 90.0 GEV P 2\n", - "19 Z3EV 45.0 ZEV P 2\n", - "20 RDS2 10.0 ZEV P 2\n", - "21 GCN4 15.0 ZEV P 2\n", - "22 RDS2 90.0 ZEV P 2\n", - "23 RDS2 0.0 ZEV P 2\n", - "24 Z3EV 5.0 ZEV P 2\n", - "25 Z3EV 90.0 ZEV P 2\n", - "26 Z3EV 20.0 ZEV P 2\n", - "27 RDS2 30.0 ZEV P 2\n", - "28 Z3EV 0.0 ZEV P 2\n", - "29 RDS2 15.0 ZEV P 2\n", - "30 Z3EV 10.0 ZEV P 2\n", - "31 RDS2 20.0 ZEV P 2\n", - "32 MAC1 45.0 GEV P 2\n", - "33 MAC1 15.0 GEV P 2\n", - "34 MAC1 30.0 GEV P 2" + "2 SWI1 20.0 ZEV P 3\n", + "3 SWI1 10.0 ZEV P 3\n", + "4 SWI1 90.0 ZEV P 3\n", + "5 SWI1 0.0 ZEV P 3\n", + "6 SWI1 15.0 ZEV P 3\n", + "7 SWI1 45.0 ZEV P 3\n", + "8 RDS2 10.0 ZEV P 2\n", + "9 MAC1 90.0 GEV P 2\n", + "10 MAC1 15.0 GEV P 2\n", + "11 RDS2 20.0 ZEV P 2\n", + "12 MAC1 45.0 GEV P 2\n", + "13 RDS2 30.0 ZEV P 2\n", + "14 GCN4 15.0 ZEV P 2\n", + "15 MAC1 30.0 GEV P 2\n", + "16 MAC1 5.0 GEV P 2\n", + "17 GCN4 45.0 ZEV P 2\n", + "18 GCN4 90.0 ZEV P 
2\n", + "19 RDS2 45.0 ZEV P 2\n", + "20 RDS2 0.0 ZEV P 2\n", + "21 RDS2 90.0 ZEV P 2\n", + "22 GCN4 30.0 ZEV P 2\n", + "23 MAC1 0.0 GEV P 2\n", + "24 RDS2 5.0 ZEV P 2\n", + "25 GCN4 0.0 ZEV P 2\n", + "26 RDS2 15.0 ZEV P 2" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -3386,7 +3330,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "4d869036", "metadata": {}, "outputs": [ @@ -3405,32 +3349,27 @@ "type": "integer" }, { - "name": "regulator_locus_tag", + "name": "date", "rawType": "object", "type": "string" }, { - "name": "regulator_symbol", + "name": "mechanism", "rawType": "object", "type": "string" }, { - "name": "time", - "rawType": "float64", - "type": "float" - }, - { - "name": "mechanism", + "name": "regulator_locus_tag", "rawType": "object", "type": "string" }, { - "name": "restriction", + "name": "regulator_symbol", "rawType": "object", "type": "string" }, { - "name": "date", + "name": "restriction", "rawType": "object", "type": "string" }, @@ -3439,6 +3378,11 @@ "rawType": "object", "type": "string" }, + { + "name": "time", + "rawType": "float64", + "type": "float" + }, { "name": "carbon_source", "rawType": "object", @@ -3450,44 +3394,44 @@ "type": "float" } ], - "ref": "440ab0a2-f84a-4505-8380-e218512394f7", + "ref": "0f36c45d-0bab-4761-98f1-0e2a625be2df", "rows": [ [ "0", - "1620", + "1636", + "20161117", + "ZEV", "YPL016W", "SWI1", - "20.0", - "ZEV", "P", - "20161117", - "SMY2266a", + "SMY2266c", + "20.0", "glucose", "30.0" ], [ "1", "1628", + "20161117", + "ZEV", "YPL016W", "SWI1", - "20.0", - "ZEV", "P", - "20161117", "SMY2266b", + "20.0", "glucose", "30.0" ], [ "2", - "1636", + "1620", + "20161117", + "ZEV", "YPL016W", "SWI1", - "20.0", - "ZEV", "P", - "20161117", - "SMY2266c", + "SMY2266a", + "20.0", "glucose", "30.0" ] @@ -3517,13 +3461,13 @@ " \n", " \n", " sample_id\n", + " date\n", + " mechanism\n", " regulator_locus_tag\n", " regulator_symbol\n", - 
" time\n", - " mechanism\n", " restriction\n", - " date\n", " strain\n", + " time\n", " carbon_source\n", " temperature_celsius\n", " \n", @@ -3531,40 +3475,40 @@ " \n", " \n", " 0\n", - " 1620\n", + " 1636\n", + " 20161117\n", + " ZEV\n", " YPL016W\n", " SWI1\n", - " 20.0\n", - " ZEV\n", " P\n", - " 20161117\n", - " SMY2266a\n", + " SMY2266c\n", + " 20.0\n", " glucose\n", " 30.0\n", " \n", " \n", " 1\n", " 1628\n", + " 20161117\n", + " ZEV\n", " YPL016W\n", " SWI1\n", - " 20.0\n", - " ZEV\n", " P\n", - " 20161117\n", " SMY2266b\n", + " 20.0\n", " glucose\n", " 30.0\n", " \n", " \n", " 2\n", - " 1636\n", + " 1620\n", + " 20161117\n", + " ZEV\n", " YPL016W\n", " SWI1\n", - " 20.0\n", - " ZEV\n", " P\n", - " 20161117\n", - " SMY2266c\n", + " SMY2266a\n", + " 20.0\n", " glucose\n", " 30.0\n", " \n", @@ -3573,18 +3517,18 @@ "" ], "text/plain": [ - " sample_id regulator_locus_tag regulator_symbol time mechanism restriction \\\n", - "0 1620 YPL016W SWI1 20.0 ZEV P \n", - "1 1628 YPL016W SWI1 20.0 ZEV P \n", - "2 1636 YPL016W SWI1 20.0 ZEV P \n", + " sample_id date mechanism regulator_locus_tag regulator_symbol \\\n", + "0 1636 20161117 ZEV YPL016W SWI1 \n", + "1 1628 20161117 ZEV YPL016W SWI1 \n", + "2 1620 20161117 ZEV YPL016W SWI1 \n", "\n", - " date strain carbon_source temperature_celsius \n", - "0 20161117 SMY2266a glucose 30.0 \n", - "1 20161117 SMY2266b glucose 30.0 \n", - "2 20161117 SMY2266c glucose 30.0 " + " restriction strain time carbon_source temperature_celsius \n", + "0 P SMY2266c 20.0 glucose 30.0 \n", + "1 P SMY2266b 20.0 glucose 30.0 \n", + "2 P SMY2266a 20.0 glucose 30.0 " ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -3602,7 +3546,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "89408d2b", "metadata": {}, "outputs": [ @@ -3610,7 +3554,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "['GCN4', 'MAC1', 'SWI1', 'Z3EV', 'RDS2']\n" + "['MAC1', 
'SWI1', 'GCN4', 'RDS2']\n" ] } ], @@ -3630,7 +3574,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "5a3b802b", "metadata": {}, "outputs": [ @@ -3638,7 +3582,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "['GCN4', 'MAC1', 'SWI1', 'Z3EV', 'RDS2', 'GEV']\n" + "['MAC1', 'SWI1', 'GCN4', 'RDS2', 'GEV']\n" ] } ], @@ -3650,7 +3594,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "abed8bc2", "metadata": {}, "outputs": [ @@ -3703,6 +3647,11 @@ "rawType": "float64", "type": "float" }, + { + "name": "pr_ranking_column", + "rawType": "object", + "type": "string" + }, { "name": "binding_repo_dataset", "rawType": "object", @@ -3734,862 +3683,912 @@ "type": "string" } ], - "ref": "b9dead21-45e7-491d-82d4-a2358af05efe", + "ref": "b0a3d538-3af3-4f72-8610-7722a73a7a4f", "rows": [ [ "0", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;85", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.004", + "BrentLab/harbison_2004;harbison_2004;105", + "BrentLab/hughes_2006;overexpression;10", + "11.0", + "206.0", + "12.0", + "206.0", + "0.041292917490562644", + "0.017", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "105", "harbison", - "85", - "BrentLab/Hackett_2020;hackett_2020" + "10", + "BrentLab/hughes_2006;overexpression" ], [ "1", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;83", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;108", + "BrentLab/hughes_2006;overexpression;11", + "60.0", + "67.0", + "60.0", + "67.0", + "0.05428351009647073", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "108", "harbison", - "83", - "BrentLab/Hackett_2020;hackett_2020" + "11", + "BrentLab/hughes_2006;overexpression" ], [ "2", - 
"BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;84", - "2.0", - "1.0", - "3.0", - "1.0", - "0.0", - "0.011", + "BrentLab/harbison_2004;harbison_2004;109", + "BrentLab/hughes_2006;overexpression;11", + "27.0", + "1265.0", + "27.0", + "1265.0", + "0.12321364371741866", + "0.057", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "109", "harbison", - "84", - "BrentLab/Hackett_2020;hackett_2020" + "11", + "BrentLab/hughes_2006;overexpression" ], [ "3", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;78", - "487.0", - "96.0", - "479.0", - "92.0", - "0.4121918908550328", - "0.576", + "BrentLab/harbison_2004;harbison_2004;112", + "BrentLab/hughes_2006;overexpression;12", + "532.0", + "1093.0", + "532.0", + "1093.0", + "0.4363046674390623", + "0.092", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "112", "harbison", - "78", - "BrentLab/Hackett_2020;hackett_2020" + "12", + "BrentLab/hughes_2006;overexpression" ], [ "4", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;81", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;113", + "BrentLab/hughes_2006;overexpression;12", + "10.0", + "556.0", + "10.0", + "556.0", + "0.01756663927480034", + "0.002", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "113", "harbison", - "81", - "BrentLab/Hackett_2020;hackett_2020" + "12", + "BrentLab/hughes_2006;overexpression" ], [ "5", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;33", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;118", + "BrentLab/hughes_2006;overexpression;13", + "574.0", + "354.0", + "574.0", + "354.0", + "0.13894295437217577", + "0.0", + "log2fc", 
"harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "118", "harbison", - "33", - "BrentLab/Hackett_2020;hackett_2020" + "13", + "BrentLab/hughes_2006;overexpression" ], [ "6", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;73", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;119", + "BrentLab/hughes_2006;overexpression;13", + "251.0", + "492.0", + "251.0", + "492.0", + "0.11808548603694578", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "119", "harbison", - "73", - "BrentLab/Hackett_2020;hackett_2020" + "13", + "BrentLab/hughes_2006;overexpression" ], [ "7", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;47", - "407.0", - "310.0", - "378.0", - "306.0", - "0.2038622347205313", - "0.441", + "BrentLab/harbison_2004;harbison_2004;120", + "BrentLab/hughes_2006;overexpression;13", + "14.0", + "2954.0", + "14.0", + "2954.0", + "0.1616346595561947", + "1.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "120", "harbison", - "47", - "BrentLab/Hackett_2020;hackett_2020" + "13", + "BrentLab/hughes_2006;overexpression" ], [ "8", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;46", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;121", + "BrentLab/hughes_2006;overexpression;13", + "422.0", + "544.0", + "423.0", + "544.0", + "0.401585299611564", + "0.001", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "121", "harbison", - "46", - "BrentLab/Hackett_2020;hackett_2020" + "13", + "BrentLab/hughes_2006;overexpression" ], [ "9", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;45", - null, - null, - null, - 
null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;122", + "BrentLab/hughes_2006;overexpression;14", + "842.0", + "152.0", + "842.0", + "152.0", + "0.37750827352885596", + "0.106", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "122", "harbison", - "45", - "BrentLab/Hackett_2020;hackett_2020" + "14", + "BrentLab/hughes_2006;overexpression" ], [ "10", - "BrentLab/harbison_2004;harbison_2004;8", - "BrentLab/Hackett_2020;hackett_2020;48", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;124", + "BrentLab/hughes_2006;overexpression;15", + "402.0", + "1417.0", + "402.0", + "1417.0", + "0.279937313245534", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "8", + "hughes_2006-overexpression", + "124", "harbison", - "48", - "BrentLab/Hackett_2020;hackett_2020" + "15", + "BrentLab/hughes_2006;overexpression" ], [ "11", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;34", - "198.0", - "26.0", - "193.0", - "24.0", - "0.7367526600236447", - "0.512", + "BrentLab/harbison_2004;harbison_2004;137", + "BrentLab/hughes_2006;overexpression;17", + "29.0", + "5.0", + "29.0", + "5.0", + "0.005954520941937803", + "0.043", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "137", "harbison", - "34", - "BrentLab/Hackett_2020;hackett_2020" + "17", + "BrentLab/hughes_2006;overexpression" ], [ "12", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;88", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;141", + "BrentLab/hughes_2006;overexpression;18", + "653.0", + "1620.0", + "654.0", + "1620.0", + "0.442997844156436", + "0.812", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "141", "harbison", - "88", - 
"BrentLab/Hackett_2020;hackett_2020" + "18", + "BrentLab/hughes_2006;overexpression" ], [ "13", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;79", - "278.0", - "82.0", - "275.0", - "76.0", - "0.3669436052366566", - "0.531", + "BrentLab/harbison_2004;harbison_2004;142", + "BrentLab/hughes_2006;overexpression;18", + "497.0", + "25.0", + "497.0", + "25.0", + "0.3308129606327521", + "0.921", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "142", "harbison", - "79", - "BrentLab/Hackett_2020;hackett_2020" + "18", + "BrentLab/hughes_2006;overexpression" ], [ "14", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;74", - "386.0", - "2.0", - "381.0", - "2.0", - "0.0478033736153071", - "0.596", + "BrentLab/harbison_2004;harbison_2004;150", + "BrentLab/hughes_2006;overexpression;19", + "91.0", + "1948.0", + "91.0", + "1948.0", + "0.2949755757517485", + "0.578", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "150", "harbison", - "74", - "BrentLab/Hackett_2020;hackett_2020" + "19", + "BrentLab/hughes_2006;overexpression" ], [ "15", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;87", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.01", + "BrentLab/harbison_2004;harbison_2004;151", + "BrentLab/hughes_2006;overexpression;21", + "57.0", + "386.0", + "57.0", + "386.0", + "0.0656826352687399", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "151", "harbison", - "87", - "BrentLab/Hackett_2020;hackett_2020" + "21", + "BrentLab/hughes_2006;overexpression" ], [ "16", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;82", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.005", + 
"BrentLab/harbison_2004;harbison_2004;152", + "BrentLab/hughes_2006;overexpression;21", + "272.0", + "526.0", + "272.0", + "526.0", + "0.2405177062735934", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "152", "harbison", - "82", - "BrentLab/Hackett_2020;hackett_2020" + "21", + "BrentLab/hughes_2006;overexpression" ], [ "17", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;40", - "233.0", - "887.0", - "228.0", - "853.0", - "0.4419109947643979", - "0.306", + "BrentLab/harbison_2004;harbison_2004;153", + "BrentLab/hughes_2006;overexpression;21", + "186.0", + "1060.0", + "186.0", + "1060.0", + "0.20770457061222172", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "153", "harbison", - "40", - "BrentLab/Hackett_2020;hackett_2020" + "21", + "BrentLab/hughes_2006;overexpression" ], [ "18", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;37", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;154", + "BrentLab/hughes_2006;overexpression;21", + "65.0", + "398.0", + "65.0", + "398.0", + "0.10461443622068167", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "154", "harbison", - "37", - "BrentLab/Hackett_2020;hackett_2020" + "21", + "BrentLab/hughes_2006;overexpression" ], [ "19", - "BrentLab/harbison_2004;harbison_2004;3", - "BrentLab/Hackett_2020;hackett_2020;86", - "2.0", - "2.0", - "3.0", - "2.0", - "0.0002250900360144", - "0.014", + "BrentLab/harbison_2004;harbison_2004;157", + "BrentLab/hughes_2006;overexpression;22", + "482.0", + "176.0", + "482.0", + "176.0", + "0.14485664209958654", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "3", + "hughes_2006-overexpression", + "157", "harbison", - "86", - 
"BrentLab/Hackett_2020;hackett_2020" + "22", + "BrentLab/hughes_2006;overexpression" ], [ "20", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;75", - "386.0", - "4.0", - "381.0", - "4.0", - "0.1752790365894595", - "0.871", + "BrentLab/harbison_2004;harbison_2004;158", + "BrentLab/hughes_2006;overexpression;22", + "354.0", + "215.0", + "354.0", + "215.0", + "0.12060713643717419", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "158", "harbison", - "75", - "BrentLab/Hackett_2020;hackett_2020" + "22", + "BrentLab/hughes_2006;overexpression" ], [ "21", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;77", - "487.0", - "15.0", - "479.0", - "13.0", - "0.1591137965760322", - "0.23", + "BrentLab/harbison_2004;harbison_2004;159", + "BrentLab/hughes_2006;overexpression;22", + "550.0", + "611.0", + "550.0", + "611.0", + "0.2924649934604871", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "159", "harbison", - "77", - "BrentLab/Hackett_2020;hackett_2020" + "22", + "BrentLab/hughes_2006;overexpression" ], [ "22", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;38", - "28.0", - "394.0", - "29.0", - "375.0", - "0.1464068569498395", - "0.309", + "BrentLab/harbison_2004;harbison_2004;160", + "BrentLab/hughes_2006;overexpression;22", + "77.0", + "625.0", + "77.0", + "625.0", + "0.1062495373846105", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "160", "harbison", - "38", - "BrentLab/Hackett_2020;hackett_2020" + "22", + "BrentLab/hughes_2006;overexpression" ], [ "23", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;36", - "242.0", - "239.0", - "237.0", - "230.0", - "0.4474384543548884", - "0.644", + 
"BrentLab/harbison_2004;harbison_2004;161", + "BrentLab/hughes_2006;overexpression;23", + "37.0", + "3236.0", + "37.0", + "3236.0", + "0.014875454821573575", + "0.456", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "161", "harbison", - "36", - "BrentLab/Hackett_2020;hackett_2020" + "23", + "BrentLab/hughes_2006;overexpression" ], [ "24", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;35", - "12.0", - "136.0", - "12.0", - "129.0", - "0.1014820131734504", - "0.411", + "BrentLab/harbison_2004;harbison_2004;162", + "BrentLab/hughes_2006;overexpression;24", + "417.0", + "1082.0", + "417.0", + "1082.0", + "0.22690440962955793", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "162", "harbison", - "35", - "BrentLab/Hackett_2020;hackett_2020" + "24", + "BrentLab/hughes_2006;overexpression" ], [ "25", - "BrentLab/harbison_2004;harbison_2004;2", - "BrentLab/Hackett_2020;hackett_2020;39", - "236.0", - "462.0", - "231.0", - "442.0", - "0.4406392501266677", - "0.536", + "BrentLab/harbison_2004;harbison_2004;163", + "BrentLab/hughes_2006;overexpression;24", + "896.0", + "710.0", + "896.0", + "710.0", + "0.41161010647006896", + "0.002", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "2", + "hughes_2006-overexpression", + "163", "harbison", - "39", - "BrentLab/Hackett_2020;hackett_2020" + "24", + "BrentLab/hughes_2006;overexpression" ], [ "26", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;65", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;174", + "BrentLab/hughes_2006;overexpression;26", + "55.0", + "2135.0", + "55.0", + "2135.0", + "0.08879402276624998", + "0.006", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "174", "harbison", - 
"65", - "BrentLab/Hackett_2020;hackett_2020" + "26", + "BrentLab/hughes_2006;overexpression" ], [ "27", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;80", - "386.0", - "12.0", - "381.0", - "11.0", - "0.1530190500167841", - "0.26", + "BrentLab/harbison_2004;harbison_2004;175", + "BrentLab/hughes_2006;overexpression;27", + "79.0", + "354.0", + "79.0", + "354.0", + "0.36280804176948345", + "0.485", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "175", "harbison", - "80", - "BrentLab/Hackett_2020;hackett_2020" + "27", + "BrentLab/hughes_2006;overexpression" ], [ "28", - "BrentLab/harbison_2004;harbison_2004;4", - "BrentLab/Hackett_2020;hackett_2020;76", - "386.0", - "13.0", - "381.0", - "13.0", - "0.3335221550855992", - "0.723", + "BrentLab/harbison_2004;harbison_2004;176", + "BrentLab/hughes_2006;overexpression;27", + "1.0", + "604.0", + "1.0", + "604.0", + "0.0", + "0.981", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "4", + "hughes_2006-overexpression", + "176", "harbison", - "76", - "BrentLab/Hackett_2020;hackett_2020" + "27", + "BrentLab/hughes_2006;overexpression" ], [ "29", - "BrentLab/harbison_2004;harbison_2004;10", - "BrentLab/Hackett_2020;hackett_2020;48", - "467.0", - "60.0", - "454.0", - "60.0", - "0.1983655120981107", - "0.035", + "BrentLab/harbison_2004;harbison_2004;177", + "BrentLab/hughes_2006;overexpression;28", + "10.0", + "3654.0", + "10.0", + "3654.0", + "0.0", + "1.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "10", + "hughes_2006-overexpression", + "177", "harbison", - "48", - "BrentLab/Hackett_2020;hackett_2020" + "28", + "BrentLab/hughes_2006;overexpression" ], [ "30", - "BrentLab/harbison_2004;harbison_2004;10", - "BrentLab/Hackett_2020;hackett_2020;47", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;178", + 
"BrentLab/hughes_2006;overexpression;28", + "20.0", + "61.0", + "22.0", + "61.0", + "0.10253010965306489", + "0.707", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "10", + "hughes_2006-overexpression", + "178", "harbison", - "47", - "BrentLab/Hackett_2020;hackett_2020" + "28", + "BrentLab/hughes_2006;overexpression" ], [ "31", - "BrentLab/harbison_2004;harbison_2004;10", - "BrentLab/Hackett_2020;hackett_2020;46", - "284.0", - "47.0", - "278.0", - "46.0", - "0.0992715955737997", - "0.003", + "BrentLab/harbison_2004;harbison_2004;179", + "BrentLab/hughes_2006;overexpression;28", + "6.0", + "1128.0", + "6.0", + "1128.0", + "0.15157064533525078", + "0.968", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "10", + "hughes_2006-overexpression", + "179", "harbison", - "46", - "BrentLab/Hackett_2020;hackett_2020" + "28", + "BrentLab/hughes_2006;overexpression" ], [ "32", - "BrentLab/harbison_2004;harbison_2004;11", - "BrentLab/Hackett_2020;hackett_2020;48", - "472.0", - "1.0", - "459.0", - "1.0", - "0.0", - "0.915", + "BrentLab/harbison_2004;harbison_2004;191", + "BrentLab/hughes_2006;overexpression;29", + "342.0", + "174.0", + "342.0", + "174.0", + "0.42452813230271436", + "0.452", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "11", + "hughes_2006-overexpression", + "191", "harbison", - "48", - "BrentLab/Hackett_2020;hackett_2020" + "29", + "BrentLab/hughes_2006;overexpression" ], [ "33", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;41", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;192", + "BrentLab/hughes_2006;overexpression;30", + "132.0", + "227.0", + "132.0", + "227.0", + "0.22362783869614716", + "0.002", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "192", "harbison", - "41", - "BrentLab/Hackett_2020;hackett_2020" + "30", + 
"BrentLab/hughes_2006;overexpression" ], [ "34", - "BrentLab/harbison_2004;harbison_2004;16", - "BrentLab/Hackett_2020;hackett_2020;89", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;193", + "BrentLab/hughes_2006;overexpression;30", + "322.0", + "442.0", + "322.0", + "442.0", + "0.40950351528951207", + "0.021", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "16", + "hughes_2006-overexpression", + "193", "harbison", - "89", - "BrentLab/Hackett_2020;hackett_2020" + "30", + "BrentLab/hughes_2006;overexpression" ], [ "35", - "BrentLab/harbison_2004;harbison_2004;8", - "BrentLab/Hackett_2020;hackett_2020;41", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;194", + "BrentLab/hughes_2006;overexpression;30", + "76.0", + "43.0", + "76.0", + "43.0", + "0.12124752831206184", + "0.395", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "8", + "hughes_2006-overexpression", + "194", "harbison", - "41", - "BrentLab/Hackett_2020;hackett_2020" + "30", + "BrentLab/hughes_2006;overexpression" ], [ "36", - "BrentLab/harbison_2004;harbison_2004;7", - "BrentLab/Hackett_2020;hackett_2020;43", - "2.0", - "330.0", - "2.0", - "318.0", - "0.0", - "0.195", + "BrentLab/harbison_2004;harbison_2004;201", + "BrentLab/hughes_2006;overexpression;31", + "136.0", + "1104.0", + "136.0", + "1104.0", + "0.2752121157648751", + "0.001", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "201", "harbison", - "43", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "37", - "BrentLab/harbison_2004;harbison_2004;16", - "BrentLab/Hackett_2020;hackett_2020;91", - "9.0", - "1.0", - "9.0", - "1.0", + "BrentLab/harbison_2004;harbison_2004;202", + "BrentLab/hughes_2006;overexpression;31", + "287.0", + "36.0", + "287.0", + "36.0", + "0.06401671759841812", "0.0", - "0.019", + 
"log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "16", + "hughes_2006-overexpression", + "202", "harbison", - "91", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "38", - "BrentLab/harbison_2004;harbison_2004;17", - "BrentLab/Hackett_2020;hackett_2020;91", - "2.0", - "1.0", - "2.0", - "1.0", - "0.0", - "0.008", + "BrentLab/harbison_2004;harbison_2004;203", + "BrentLab/hughes_2006;overexpression;31", + "88.0", + "41.0", + "88.0", + "41.0", + "0.06563294471122981", + "0.003", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "17", + "hughes_2006-overexpression", + "203", "harbison", - "91", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "39", - "BrentLab/harbison_2004;harbison_2004;8", - "BrentLab/Hackett_2020;hackett_2020;43", - "290.0", - "412.0", - "278.0", - "386.0", - "0.4521656634210855", - "0.208", + "BrentLab/harbison_2004;harbison_2004;204", + "BrentLab/hughes_2006;overexpression;31", + "318.0", + "1948.0", + "319.0", + "1948.0", + "0.380107954958676", + "0.57", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "8", + "hughes_2006-overexpression", + "204", "harbison", - "43", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "40", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;66", - "398.0", - "16.0", - "390.0", - "15.0", - "0.2406042358803986", - "0.431", + "BrentLab/harbison_2004;harbison_2004;205", + "BrentLab/hughes_2006;overexpression;31", + "467.0", + "646.0", + "467.0", + "646.0", + "0.42659723019346846", + "0.006", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "205", "harbison", - "66", - "BrentLab/Hackett_2020;hackett_2020" + "31", + "BrentLab/hughes_2006;overexpression" ], [ "41", - "BrentLab/harbison_2004;harbison_2004;7", - 
"BrentLab/Hackett_2020;hackett_2020;42", - "122.0", - "212.0", - "120.0", - "206.0", - "0.3447911486822476", - "0.49", + "BrentLab/harbison_2004;harbison_2004;207", + "BrentLab/hughes_2006;overexpression;32", + "55.0", + "230.0", + "56.0", + "230.0", + "0.3233042722751513", + "0.796", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "7", + "hughes_2006-overexpression", + "207", "harbison", - "42", - "BrentLab/Hackett_2020;hackett_2020" + "32", + "BrentLab/hughes_2006;overexpression" ], [ "42", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;72", - "346.0", - "18.0", - "338.0", - "16.0", - "0.22671996124031", - "0.528", + "BrentLab/harbison_2004;harbison_2004;208", + "BrentLab/hughes_2006;overexpression;32", + "25.0", + "126.0", + "25.0", + "126.0", + "0.0489281862304512", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "208", "harbison", - "72", - "BrentLab/Hackett_2020;hackett_2020" + "32", + "BrentLab/hughes_2006;overexpression" ], [ "43", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;69", - "118.0", - "120.0", - "115.0", - "113.0", - "0.3139880952380952", - "0.454", + "BrentLab/harbison_2004;harbison_2004;209", + "BrentLab/hughes_2006;overexpression;32", + "122.0", + "688.0", + "122.0", + "688.0", + "0.10777396924484826", + "0.0", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "209", "harbison", - "69", - "BrentLab/Hackett_2020;hackett_2020" + "32", + "BrentLab/hughes_2006;overexpression" ], [ "44", - "BrentLab/harbison_2004;harbison_2004;20", - "BrentLab/Hackett_2020;hackett_2020;99", - "3.0", - "1.0", - "4.0", - "1.0", - "0.0", - "0.006", + "BrentLab/harbison_2004;harbison_2004;210", + "BrentLab/hughes_2006;overexpression;33", + "97.0", + "2113.0", + "97.0", + "2113.0", + "0.30052307036231024", + "0.807", + "log2fc", 
"harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "20", + "hughes_2006-overexpression", + "210", "harbison", - "99", - "BrentLab/Hackett_2020;hackett_2020" + "33", + "BrentLab/hughes_2006;overexpression" ], [ "45", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;70", - "260.0", - "17.0", - "256.0", - "17.0", - "0.1850671373200443", - "0.455", + "BrentLab/harbison_2004;harbison_2004;219", + "BrentLab/hughes_2006;overexpression;34", + "172.0", + "245.0", + "172.0", + "245.0", + "0.41551695727724847", + "0.505", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "219", "harbison", - "70", - "BrentLab/Hackett_2020;hackett_2020" + "34", + "BrentLab/hughes_2006;overexpression" ], [ "46", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;67", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;225", + "BrentLab/hughes_2006;overexpression;35", + "314.0", + "12.0", + "314.0", + "12.0", + "0.15336823656300558", + "0.877", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "225", "harbison", - "67", - "BrentLab/Hackett_2020;hackett_2020" + "35", + "BrentLab/hughes_2006;overexpression" ], [ "47", - "BrentLab/harbison_2004;harbison_2004;20", - "BrentLab/Hackett_2020;hackett_2020;97", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;228", + "BrentLab/hughes_2006;overexpression;36", + "358.0", + "2316.0", + "358.0", + "2316.0", + "0.33853600995025945", + "0.804", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "20", + "hughes_2006-overexpression", + "228", "harbison", - "97", - "BrentLab/Hackett_2020;hackett_2020" + "36", + "BrentLab/hughes_2006;overexpression" ], [ "48", - "BrentLab/harbison_2004;harbison_2004;5", - "BrentLab/Hackett_2020;hackett_2020;68", - "260.0", - 
"2.0", - "256.0", - "2.0", - "0.0317379568106312", - "0.647", + "BrentLab/harbison_2004;harbison_2004;231", + "BrentLab/hughes_2006;overexpression;38", + "77.0", + "362.0", + "77.0", + "362.0", + "0.32227814728264126", + "0.36", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "5", + "hughes_2006-overexpression", + "231", "harbison", - "68", - "BrentLab/Hackett_2020;hackett_2020" + "38", + "BrentLab/hughes_2006;overexpression" ], [ "49", - "BrentLab/harbison_2004;harbison_2004;10", - "BrentLab/Hackett_2020;hackett_2020;41", - null, - null, - null, - null, - null, - null, + "BrentLab/harbison_2004;harbison_2004;232", + "BrentLab/hughes_2006;overexpression;38", + "40.0", + "3302.0", + "41.0", + "3302.0", + "0.01832419557792558", + "0.593", + "log2fc", "harbison_2004-harbison_2004", - "Hackett_2020-hackett_2020", - "10", + "hughes_2006-overexpression", + "232", "harbison", - "41", - "BrentLab/Hackett_2020;hackett_2020" + "38", + "BrentLab/hughes_2006;overexpression" ] ], "shape": { - "columns": 14, - "rows": 9604 + "columns": 15, + "rows": 29804 } }, "text/html": [ @@ -4619,6 +4618,7 @@ " perturbation_set_size\n", " dto_fdr\n", " dto_empirical_pvalue\n", + " pr_ranking_column\n", " binding_repo_dataset\n", " perturbation_repo_dataset\n", " binding_id_id\n", @@ -4630,88 +4630,93 @@ " \n", " \n", " 0\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;85\n", - " 2.0\n", - " 2.0\n", - " 3.0\n", - " 2.0\n", - " 0.000225\n", - " 0.004\n", + " BrentLab/harbison_2004;harbison_2004;105\n", + " BrentLab/hughes_2006;overexpression;10\n", + " 11.0\n", + " 206.0\n", + " 12.0\n", + " 206.0\n", + " 0.041293\n", + " 0.017\n", + " log2fc\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 105\n", " harbison\n", - " 85\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 10\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 1\n", - " 
BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;83\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " BrentLab/harbison_2004;harbison_2004;108\n", + " BrentLab/hughes_2006;overexpression;11\n", + " 60.0\n", + " 67.0\n", + " 60.0\n", + " 67.0\n", + " 0.054284\n", + " 0.000\n", + " log2fc\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 108\n", " harbison\n", - " 83\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 11\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 2\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " BrentLab/Hackett_2020;hackett_2020;84\n", - " 2.0\n", - " 1.0\n", - " 3.0\n", - " 1.0\n", - " 0.000000\n", - " 0.011\n", + " BrentLab/harbison_2004;harbison_2004;109\n", + " BrentLab/hughes_2006;overexpression;11\n", + " 27.0\n", + " 1265.0\n", + " 27.0\n", + " 1265.0\n", + " 0.123214\n", + " 0.057\n", + " log2fc\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 109\n", " harbison\n", - " 84\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 11\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 3\n", - " BrentLab/harbison_2004;harbison_2004;4\n", - " BrentLab/Hackett_2020;hackett_2020;78\n", - " 487.0\n", - " 96.0\n", - " 479.0\n", - " 92.0\n", - " 0.412192\n", - " 0.576\n", + " BrentLab/harbison_2004;harbison_2004;112\n", + " BrentLab/hughes_2006;overexpression;12\n", + " 532.0\n", + " 1093.0\n", + " 532.0\n", + " 1093.0\n", + " 0.436305\n", + " 0.092\n", + " log2fc\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 4\n", + " hughes_2006-overexpression\n", + " 112\n", " harbison\n", - " 78\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 12\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " 4\n", - " BrentLab/harbison_2004;harbison_2004;3\n", - " 
BrentLab/Hackett_2020;hackett_2020;81\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " BrentLab/harbison_2004;harbison_2004;113\n", + " BrentLab/hughes_2006;overexpression;12\n", + " 10.0\n", + " 556.0\n", + " 10.0\n", + " 556.0\n", + " 0.017567\n", + " 0.002\n", + " log2fc\n", " harbison_2004-harbison_2004\n", - " Hackett_2020-hackett_2020\n", - " 3\n", + " hughes_2006-overexpression\n", + " 113\n", " harbison\n", - " 81\n", - " BrentLab/Hackett_2020;hackett_2020\n", + " 12\n", + " BrentLab/hughes_2006;overexpression\n", " \n", " \n", " ...\n", @@ -4729,193 +4734,199 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", - " 9599\n", - " BrentLab/callingcards;annotated_features;804\n", - " BrentLab/kemmeren_2014;kemmeren_2014;901\n", - " 14.0\n", - " 39.0\n", - " 13.0\n", - " 39.0\n", - " 0.000879\n", + " 29799\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;784\n", + " 154.0\n", + " 905.0\n", + " 154.0\n", + " 905.0\n", + " 0.090665\n", " 0.000\n", - " callingcards-annotated_features\n", + " pvalue\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 804\n", - " BrentLab/callingcards;annotated_features\n", - " 901\n", + " 724-692-688\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 784\n", " kemmeren\n", " \n", " \n", - " 9600\n", - " BrentLab/callingcards;annotated_features;805\n", - " BrentLab/kemmeren_2014;kemmeren_2014;1053\n", - " 18.0\n", - " 278.0\n", - " 17.0\n", - " 171.0\n", - " 0.001455\n", - " 0.000\n", - " callingcards-annotated_features\n", + " 29800\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;666\n", + " 215.0\n", + " 108.0\n", + " 215.0\n", + " 108.0\n", + " 0.075036\n", + " 0.005\n", + " pvalue\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 805\n", - " BrentLab/callingcards;annotated_features\n", - " 
1053\n", + " 725-435-395\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 666\n", " kemmeren\n", " \n", " \n", - " 9601\n", - " BrentLab/callingcards;annotated_features;808\n", - " BrentLab/kemmeren_2014;kemmeren_2014;218\n", - " 20.0\n", - " 57.0\n", - " 19.0\n", - " 27.0\n", - " 0.003116\n", - " 0.000\n", - " callingcards-annotated_features\n", + " 29801\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;271\n", + " 221.0\n", + " 925.0\n", + " 221.0\n", + " 925.0\n", + " 0.403484\n", + " 0.126\n", + " pvalue\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 808\n", - " BrentLab/callingcards;annotated_features\n", - " 218\n", + " 726-445-424\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 271\n", " kemmeren\n", " \n", " \n", - " 9602\n", - " BrentLab/callingcards;annotated_features;806\n", - " BrentLab/kemmeren_2014;kemmeren_2014;1023\n", - " 10.0\n", - " 9.0\n", - " 11.0\n", - " 9.0\n", - " 0.000000\n", - " 0.000\n", - " callingcards-annotated_features\n", + " 29802\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;1077\n", + " 281.0\n", + " 73.0\n", + " 283.0\n", + " 77.0\n", + " 0.095948\n", + " 0.174\n", + " pvalue\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 806\n", - " BrentLab/callingcards;annotated_features\n", - " 1023\n", + " 79-33\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 1077\n", " kemmeren\n", " \n", " \n", - " 9603\n", - " BrentLab/callingcards;annotated_features;809\n", - " BrentLab/kemmeren_2014;kemmeren_2014;913\n", - " 150.0\n", - " 221.0\n", - " 140.0\n", - " 206.0\n", - " 0.116890\n", + " 29803\n", + " BrentLab/callingcards;annotated_features_combi...\n", + " BrentLab/kemmeren_2014;kemmeren_2014;963\n", + " 526.0\n", + " 227.0\n", + " 527.0\n", + " 227.0\n", + " 0.064919\n", " 0.000\n", - " 
callingcards-annotated_features\n", + " pvalue\n", + " callingcards-annotated_features_combined\n", " kemmeren_2014-kemmeren_2014\n", - " 809\n", - " BrentLab/callingcards;annotated_features\n", - " 913\n", + " 96-49\n", + " BrentLab/callingcards;annotated_features_combined\n", + " 963\n", " kemmeren\n", " \n", " \n", "\n", - "

9604 rows × 14 columns

\n", + "

29804 rows × 15 columns

\n", "" ], "text/plain": [ - " binding_id \\\n", - "0 BrentLab/harbison_2004;harbison_2004;3 \n", - "1 BrentLab/harbison_2004;harbison_2004;3 \n", - "2 BrentLab/harbison_2004;harbison_2004;3 \n", - "3 BrentLab/harbison_2004;harbison_2004;4 \n", - "4 BrentLab/harbison_2004;harbison_2004;3 \n", - "... ... \n", - "9599 BrentLab/callingcards;annotated_features;804 \n", - "9600 BrentLab/callingcards;annotated_features;805 \n", - "9601 BrentLab/callingcards;annotated_features;808 \n", - "9602 BrentLab/callingcards;annotated_features;806 \n", - "9603 BrentLab/callingcards;annotated_features;809 \n", + " binding_id \\\n", + "0 BrentLab/harbison_2004;harbison_2004;105 \n", + "1 BrentLab/harbison_2004;harbison_2004;108 \n", + "2 BrentLab/harbison_2004;harbison_2004;109 \n", + "3 BrentLab/harbison_2004;harbison_2004;112 \n", + "4 BrentLab/harbison_2004;harbison_2004;113 \n", + "... ... \n", + "29799 BrentLab/callingcards;annotated_features_combi... \n", + "29800 BrentLab/callingcards;annotated_features_combi... \n", + "29801 BrentLab/callingcards;annotated_features_combi... \n", + "29802 BrentLab/callingcards;annotated_features_combi... \n", + "29803 BrentLab/callingcards;annotated_features_combi... \n", "\n", - " perturbation_id binding_rank_threshold \\\n", - "0 BrentLab/Hackett_2020;hackett_2020;85 2.0 \n", - "1 BrentLab/Hackett_2020;hackett_2020;83 NaN \n", - "2 BrentLab/Hackett_2020;hackett_2020;84 2.0 \n", - "3 BrentLab/Hackett_2020;hackett_2020;78 487.0 \n", - "4 BrentLab/Hackett_2020;hackett_2020;81 NaN \n", - "... ... ... 
\n", - "9599 BrentLab/kemmeren_2014;kemmeren_2014;901 14.0 \n", - "9600 BrentLab/kemmeren_2014;kemmeren_2014;1053 18.0 \n", - "9601 BrentLab/kemmeren_2014;kemmeren_2014;218 20.0 \n", - "9602 BrentLab/kemmeren_2014;kemmeren_2014;1023 10.0 \n", - "9603 BrentLab/kemmeren_2014;kemmeren_2014;913 150.0 \n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/hughes_2006;overexpression;10 11.0 \n", + "1 BrentLab/hughes_2006;overexpression;11 60.0 \n", + "2 BrentLab/hughes_2006;overexpression;11 27.0 \n", + "3 BrentLab/hughes_2006;overexpression;12 532.0 \n", + "4 BrentLab/hughes_2006;overexpression;12 10.0 \n", + "... ... ... \n", + "29799 BrentLab/kemmeren_2014;kemmeren_2014;784 154.0 \n", + "29800 BrentLab/kemmeren_2014;kemmeren_2014;666 215.0 \n", + "29801 BrentLab/kemmeren_2014;kemmeren_2014;271 221.0 \n", + "29802 BrentLab/kemmeren_2014;kemmeren_2014;1077 281.0 \n", + "29803 BrentLab/kemmeren_2014;kemmeren_2014;963 526.0 \n", "\n", - " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", - "0 2.0 3.0 2.0 \n", - "1 NaN NaN NaN \n", - "2 1.0 3.0 1.0 \n", - "3 96.0 479.0 92.0 \n", - "4 NaN NaN NaN \n", - "... ... ... ... \n", - "9599 39.0 13.0 39.0 \n", - "9600 278.0 17.0 171.0 \n", - "9601 57.0 19.0 27.0 \n", - "9602 9.0 11.0 9.0 \n", - "9603 221.0 140.0 206.0 \n", + " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", + "0 206.0 12.0 206.0 \n", + "1 67.0 60.0 67.0 \n", + "2 1265.0 27.0 1265.0 \n", + "3 1093.0 532.0 1093.0 \n", + "4 556.0 10.0 556.0 \n", + "... ... ... ... 
\n", + "29799 905.0 154.0 905.0 \n", + "29800 108.0 215.0 108.0 \n", + "29801 925.0 221.0 925.0 \n", + "29802 73.0 283.0 77.0 \n", + "29803 227.0 527.0 227.0 \n", "\n", - " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n", - "0 0.000225 0.004 harbison_2004-harbison_2004 \n", - "1 NaN NaN harbison_2004-harbison_2004 \n", - "2 0.000000 0.011 harbison_2004-harbison_2004 \n", - "3 0.412192 0.576 harbison_2004-harbison_2004 \n", - "4 NaN NaN harbison_2004-harbison_2004 \n", - "... ... ... ... \n", - "9599 0.000879 0.000 callingcards-annotated_features \n", - "9600 0.001455 0.000 callingcards-annotated_features \n", - "9601 0.003116 0.000 callingcards-annotated_features \n", - "9602 0.000000 0.000 callingcards-annotated_features \n", - "9603 0.116890 0.000 callingcards-annotated_features \n", + " dto_fdr dto_empirical_pvalue pr_ranking_column \\\n", + "0 0.041293 0.017 log2fc \n", + "1 0.054284 0.000 log2fc \n", + "2 0.123214 0.057 log2fc \n", + "3 0.436305 0.092 log2fc \n", + "4 0.017567 0.002 log2fc \n", + "... ... ... ... \n", + "29799 0.090665 0.000 pvalue \n", + "29800 0.075036 0.005 pvalue \n", + "29801 0.403484 0.126 pvalue \n", + "29802 0.095948 0.174 pvalue \n", + "29803 0.064919 0.000 pvalue \n", "\n", - " perturbation_repo_dataset binding_id_id \\\n", - "0 Hackett_2020-hackett_2020 3 \n", - "1 Hackett_2020-hackett_2020 3 \n", - "2 Hackett_2020-hackett_2020 3 \n", - "3 Hackett_2020-hackett_2020 4 \n", - "4 Hackett_2020-hackett_2020 3 \n", - "... ... ... 
\n", - "9599 kemmeren_2014-kemmeren_2014 804 \n", - "9600 kemmeren_2014-kemmeren_2014 805 \n", - "9601 kemmeren_2014-kemmeren_2014 808 \n", - "9602 kemmeren_2014-kemmeren_2014 806 \n", - "9603 kemmeren_2014-kemmeren_2014 809 \n", + " binding_repo_dataset perturbation_repo_dataset \\\n", + "0 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "1 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "2 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "3 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "4 harbison_2004-harbison_2004 hughes_2006-overexpression \n", + "... ... ... \n", + "29799 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29800 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29801 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29802 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", + "29803 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n", "\n", - " binding_id_source perturbation_id_id \\\n", - "0 harbison 85 \n", - "1 harbison 83 \n", - "2 harbison 84 \n", - "3 harbison 78 \n", - "4 harbison 81 \n", - "... ... ... \n", - "9599 BrentLab/callingcards;annotated_features 901 \n", - "9600 BrentLab/callingcards;annotated_features 1053 \n", - "9601 BrentLab/callingcards;annotated_features 218 \n", - "9602 BrentLab/callingcards;annotated_features 1023 \n", - "9603 BrentLab/callingcards;annotated_features 913 \n", + " binding_id_id binding_id_source \\\n", + "0 105 harbison \n", + "1 108 harbison \n", + "2 109 harbison \n", + "3 112 harbison \n", + "4 113 harbison \n", + "... ... ... 
\n", + "29799 724-692-688 BrentLab/callingcards;annotated_features_combined \n", + "29800 725-435-395 BrentLab/callingcards;annotated_features_combined \n", + "29801 726-445-424 BrentLab/callingcards;annotated_features_combined \n", + "29802 79-33 BrentLab/callingcards;annotated_features_combined \n", + "29803 96-49 BrentLab/callingcards;annotated_features_combined \n", "\n", - " perturbation_id_source \n", - "0 BrentLab/Hackett_2020;hackett_2020 \n", - "1 BrentLab/Hackett_2020;hackett_2020 \n", - "2 BrentLab/Hackett_2020;hackett_2020 \n", - "3 BrentLab/Hackett_2020;hackett_2020 \n", - "4 BrentLab/Hackett_2020;hackett_2020 \n", - "... ... \n", - "9599 kemmeren \n", - "9600 kemmeren \n", - "9601 kemmeren \n", - "9602 kemmeren \n", - "9603 kemmeren \n", + " perturbation_id_id perturbation_id_source \n", + "0 10 BrentLab/hughes_2006;overexpression \n", + "1 11 BrentLab/hughes_2006;overexpression \n", + "2 11 BrentLab/hughes_2006;overexpression \n", + "3 12 BrentLab/hughes_2006;overexpression \n", + "4 12 BrentLab/hughes_2006;overexpression \n", + "... ... ... 
\n", + "29799 784 kemmeren \n", + "29800 666 kemmeren \n", + "29801 271 kemmeren \n", + "29802 1077 kemmeren \n", + "29803 963 kemmeren \n", "\n", - "[9604 rows x 14 columns]" + "[29804 rows x 15 columns]" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -4926,7 +4937,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "id": "cell-25", "metadata": {}, "outputs": [ @@ -4942,53 +4953,53 @@ "4 448 ACA1 15.0 ZEV \n", "\n", " binding_id \\\n", - "0 BrentLab/callingcards;annotated_features;803 \n", + "0 BrentLab/callingcards;annotated_features;146 \n", "1 BrentLab/callingcards;annotated_features;156 \n", - "2 BrentLab/callingcards;annotated_features;126 \n", - "3 BrentLab/callingcards;annotated_features;189 \n", - "4 BrentLab/callingcards;annotated_features;146 \n", + "2 BrentLab/harbison_2004;harbison_2004;88 \n", + "3 BrentLab/callingcards;annotated_features;146 \n", + "4 BrentLab/callingcards;annotated_features;803 \n", "\n", " perturbation_id binding_rank_threshold \\\n", - "0 BrentLab/Hackett_2020;hackett_2020;448 112.0 \n", - "1 BrentLab/Hackett_2020;hackett_2020;448 31.0 \n", - "2 BrentLab/Hackett_2020;hackett_2020;448 21.0 \n", - "3 BrentLab/Hackett_2020;hackett_2020;448 164.0 \n", - "4 BrentLab/Hackett_2020;hackett_2020;448 23.0 \n", + "0 BrentLab/hackett_2020;hackett_2020;448 452.0 \n", + "1 BrentLab/hackett_2020;hackett_2020;448 296.0 \n", + "2 BrentLab/hackett_2020;hackett_2020;448 122.0 \n", + "3 BrentLab/hackett_2020;hackett_2020;448 35.0 \n", + "4 BrentLab/hackett_2020;hackett_2020;448 544.0 \n", "\n", " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", - "0 98.0 108.0 90.0 \n", - "1 98.0 26.0 90.0 \n", - "2 98.0 17.0 90.0 \n", - "3 154.0 150.0 144.0 \n", - "4 98.0 18.0 90.0 \n", + "0 1.0 454.0 5591.0 \n", + "1 346.0 297.0 346.0 \n", + "2 218.0 122.0 218.0 \n", + "3 407.0 35.0 407.0 \n", + "4 1.0 544.0 5591.0 \n", "\n", - " dto_fdr 
dto_empirical_pvalue binding_repo_dataset \\\n", - "0 0.187319 0.074 callingcards-annotated_features \n", - "1 0.072561 0.047 callingcards-annotated_features \n", - "2 0.061941 0.071 callingcards-annotated_features \n", - "3 0.213716 0.011 callingcards-annotated_features \n", - "4 0.066616 0.171 callingcards-annotated_features \n", + " dto_fdr dto_empirical_pvalue pr_ranking_column \\\n", + "0 0.000000 1.000 pvalue \n", + "1 0.277211 0.000 log2fc \n", + "2 0.612736 0.917 log2fc \n", + "3 0.116834 0.000 log2fc \n", + "4 0.000000 1.000 pvalue \n", "\n", - " perturbation_repo_dataset binding_id_id \\\n", - "0 Hackett_2020-hackett_2020 803 \n", - "1 Hackett_2020-hackett_2020 156 \n", - "2 Hackett_2020-hackett_2020 126 \n", - "3 Hackett_2020-hackett_2020 189 \n", - "4 Hackett_2020-hackett_2020 146 \n", + " binding_repo_dataset perturbation_repo_dataset binding_id_id \\\n", + "0 callingcards-annotated_features hackett_2020-hackett_2020 146 \n", + "1 callingcards-annotated_features hackett_2020-hackett_2020 156 \n", + "2 harbison_2004-harbison_2004 hackett_2020-hackett_2020 88 \n", + "3 callingcards-annotated_features hackett_2020-hackett_2020 146 \n", + "4 callingcards-annotated_features hackett_2020-hackett_2020 803 \n", "\n", " binding_id_source perturbation_id_id \\\n", "0 BrentLab/callingcards;annotated_features 448 \n", "1 BrentLab/callingcards;annotated_features 448 \n", - "2 BrentLab/callingcards;annotated_features 448 \n", + "2 harbison 448 \n", "3 BrentLab/callingcards;annotated_features 448 \n", "4 BrentLab/callingcards;annotated_features 448 \n", "\n", - " perturbation_id_source \n", - "0 BrentLab/Hackett_2020;hackett_2020 \n", - "1 BrentLab/Hackett_2020;hackett_2020 \n", - "2 BrentLab/Hackett_2020;hackett_2020 \n", - "3 BrentLab/Hackett_2020;hackett_2020 \n", - "4 BrentLab/Hackett_2020;hackett_2020 \n" + " perturbation_id_source \n", + "0 hackett \n", + "1 hackett \n", + "2 hackett \n", + "3 hackett \n", + "4 hackett \n" ] } ], @@ -5016,7 +5027,7 @@ }, { 
"cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "cell-26", "metadata": {}, "outputs": [], diff --git a/docs/virtual_db.md b/docs/virtual_db.md index 3ec4b45..9062618 100644 --- a/docs/virtual_db.md +++ b/docs/virtual_db.md @@ -23,6 +23,66 @@ For comparative analysis datasets, VirtualDB creates: See the [configuration guide](virtual_db_configuration.md) for setup details and the [tutorial](tutorials/virtual_db_tutorial.ipynb) for usage examples. +## Advanced Usage + +After any public method is called (e.g. `vdb.tables()`), the underlying DuckDB +connection is available as `vdb._db`. You can use `_db` to execute any SQL +on the database, eg creating more views, or creating a table in memory + +Custom **views** created this way appear in `tables()`, `describe()`, and +`get_fields()` automatically because those methods query DuckDB's +`information_schema`. Custom **tables** do not appear in `tables()` (which +only lists views), but are fully queryable via `vdb.query()`. + +Call at least one public method first to ensure the connection is initialized +before accessing `_db` directly. + +Example -- create a materialized analysis table:: + + # Trigger view registration + vdb.tables() + + # Create a persistent in-memory table from a complex query. + # This example selects one "best" Hackett-2020 sample per regulator + # using a priority system: ZEV+P > GEV+P > GEV+M. 
+ vdb._db.execute(""" + CREATE OR REPLACE TABLE hackett_analysis_set AS + WITH regulator_tiers AS ( + SELECT + regulator_locus_tag, + CASE + WHEN BOOL_OR(mechanism = 'ZEV' AND restriction = 'P') THEN 1 + WHEN BOOL_OR(mechanism = 'GEV' AND restriction = 'P') THEN 2 + ELSE 3 + END AS tier + FROM hackett_meta + WHERE regulator_locus_tag NOT IN ('Z3EV', 'GEV') + GROUP BY regulator_locus_tag + ), + tier_filter AS ( + SELECT + h.sample_id, h.regulator_locus_tag, h.regulator_symbol, + h.mechanism, h.restriction, h.date, h.strain, t.tier + FROM hackett_meta h + JOIN regulator_tiers t USING (regulator_locus_tag) + WHERE + (t.tier = 1 AND h.mechanism = 'ZEV' AND h.restriction = 'P') + OR (t.tier = 2 AND h.mechanism = 'GEV' AND h.restriction = 'P') + OR (t.tier = 3 AND h.mechanism = 'GEV' AND h.restriction = 'M') + ) + SELECT DISTINCT + sample_id, regulator_locus_tag, regulator_symbol, + mechanism, restriction, date, strain + FROM tier_filter + WHERE regulator_symbol NOT IN ('GCN4', 'RDS2', 'SWI1', 'MAC1') + ORDER BY regulator_locus_tag, sample_id + """) + + df = vdb.query("SELECT * FROM hackett_analysis_set") + +Tables and views created this way are in-memory only and do not persist across +VirtualDB instances. They exist for the lifetime of the DuckDB connection. + ## API Reference ::: tfbpapi.virtual_db.VirtualDB diff --git a/docs/virtual_db_configuration.md b/docs/virtual_db_configuration.md index 45320d4..42316d5 100644 --- a/docs/virtual_db_configuration.md +++ b/docs/virtual_db_configuration.md @@ -10,8 +10,10 @@ levels. repositories: # Each repository defines a "table" in the virtual database BrentLab/harbison_2004: - # REQUIRED: Specify which field is the sample identifier. At this level, it means - # that all datasets have a field `sample_id` that uniquely identifies samples. + # REQUIRED: Specify which column is the sample identifier. The `field` + # value is the actual column name in the parquet data. 
At the repo level, + # it applies to all datasets in this repository. If not specified at + # either level, the default column name "sample_id" is assumed. sample_id: field: sample_id # Repository-wide properties (apply to all datasets in this repository) @@ -47,8 +49,9 @@ repositories: kemmeren_2014: # optional -- see the note for `db_name` in harbison above db_name: kemmeren - # REQUIRED: If `sample_id` isn't defined at the repo level, then it must be - # defined at the dataset level for each dataset in the repo + # REQUIRED: If `sample_id` isn't defined at the repo level, it must be + # defined at the dataset level. The `field` value is the actual column + # name in the parquet data (does not need to be literally "sample_id"). sample_id: field: sample_id # Same logical fields, different physical paths @@ -144,6 +147,62 @@ during metadata extraction and query filtering. 2. **Type consistency**: When source data might be extracted with incorrect type 3. **Performance**: Helps with query optimization and prevents type mismatches +## Tags + +Tags are arbitrary string key/value pairs for annotating datasets. They follow +the same hierarchy as property mappings: repo-level tags apply to all datasets +in the repository, dataset-level tags apply only to that dataset, and +dataset-level tags override repo-level tags with the same key. 
+ +```yaml +repositories: + BrentLab/harbison_2004: + # Repo-level tags apply to all datasets in this repository + tags: + assay: binding + organism: yeast + dataset: + harbison_2004: + sample_id: + field: sample_id + # Dataset-level tags override repo-level tags with the same key + tags: + assay: chip-chip + + BrentLab/kemmeren_2014: + tags: + assay: perturbation + organism: yeast + dataset: + kemmeren_2014: + sample_id: + field: sample_id +``` + +Access merged tags via `vdb.get_tags(db_name)`, identifying datasets by +their name as it appears in `vdb.tables()`: + +```python +from tfbpapi.virtual_db import VirtualDB + +vdb = VirtualDB("datasets.yaml") + +# Returns {"assay": "chip-chip", "organism": "yeast"} +# (dataset-level assay overrides repo-level) +vdb.get_tags("harbison") + +# Returns {"assay": "perturbation", "organism": "yeast"} +vdb.get_tags("kemmeren") +``` + +The underlying `MetadataConfig` (available as `vdb.config`) exposes the same +data via `(repo_id, config_name)` pairs for programmatic or developer use: + +```python +# Equivalent to vdb.get_tags("harbison") above +vdb.config.get_tags("BrentLab/harbison_2004", "harbison_2004") +``` + ## Comparative Datasets Comparative datasets differ from other dataset types in that they represent @@ -152,9 +211,10 @@ Each row relates 2+ samples from other datasets. ### Structure -Comparative datasets use `source_sample` fields instead of a single `sample_id`: +Comparative datasets use `source_sample` fields instead of a single sample +identifier column: - Multiple fields with `role: source_sample` -- Each contains composite identifier: `"repo_id;config_name;sample_id"` +- Each contains composite identifier: `"repo_id;config_name;sample_id_value"` - Example: `binding_id = "BrentLab/harbison_2004;harbison_2004;42"` ### Fields @@ -206,10 +266,11 @@ build on each other. Using `harbison` as an example primary dataset and **1. Metadata view** -One row per unique `sample_id`. 
Derived columns from the configuration -(e.g., `carbon_source`, `temperature_celsius`) are resolved here using -datacard definitions, factor aliases, and missing value labels. This is -the primary view for querying sample-level metadata. +One row per unique sample identifier (the column configured via +`sample_id: {field: <column_name>}`). Derived columns from the +configuration (e.g., `carbon_source`, `temperature_celsius`) are resolved +here using datacard definitions, factor aliases, and missing value labels. +This is the primary view for querying sample-level metadata. **2. Raw data view** @@ -239,7 +300,7 @@ or filter by source dataset without parsing composite IDs in SQL. ``` __harbison_parquet (raw parquet, not directly exposed) | - +-> harbison_meta (deduplicated, one row per sample_id, + +-> harbison_meta (deduplicated, one row per sample identifier, | with derived columns from config) | +-> harbison (full parquet joined to harbison_meta) diff --git a/tfbpapi/datacard.py b/tfbpapi/datacard.py index abf94dd..734a5f3 100644 --- a/tfbpapi/datacard.py +++ b/tfbpapi/datacard.py @@ -17,6 +17,7 @@ """ import logging +from dataclasses import dataclass from typing import Any from pydantic import ValidationError @@ -36,6 +37,34 @@ ) +@dataclass +class DatasetSchema: + """ + Complete schema summary for a data configuration. + + Derived entirely from the DataCard YAML -- no DuckDB introspection needed. Used by + VirtualDB to determine column partitioning between data and metadata parquets. + + :ivar data_columns: Column names present in the data parquet. + :ivar metadata_columns: Column names that are metadata. + :ivar join_columns: Columns common to both data and metadata parquets (used as JOIN + keys for external metadata). Empty for embedded metadata (same parquet, no JOIN + needed). + :ivar metadata_source: One of ``"embedded"``, ``"external"``, or ``"none"``. 
+ :ivar external_metadata_config: Config name of the external metadata config, or + ``None`` if metadata is embedded or absent. + :ivar is_partitioned: Whether the data parquet is partitioned. + + """ + + data_columns: set[str] + metadata_columns: set[str] + join_columns: set[str] + metadata_source: str + external_metadata_config: str | None + is_partitioned: bool + + class DataCard: """ Parser and explorer for HuggingFace dataset metadata. @@ -91,6 +120,7 @@ def __init__(self, repo_id: str, token: str | None = None): # Cache for parsed card self._dataset_card: DatasetCard | None = None self._metadata_cache: dict[str, list[ExtractedMetadata]] = {} + self._metadata_fields_map: dict[str, list[str]] = {} @property def dataset_card(self) -> DatasetCard: @@ -115,6 +145,7 @@ def _load_and_validate_card(self) -> None: # Validate using Pydantic model self._dataset_card = DatasetCard(**card_data) + self._build_metadata_fields_map() self.logger.debug(f"Successfully validated dataset card for {self.repo_id}") except ValidationError as e: @@ -241,6 +272,186 @@ def get_metadata_relationships( return relationships + def _build_metadata_fields_map(self) -> None: + """ + Build a mapping from data config names to their metadata fields. + + Called during card loading. For each data config, resolves metadata + fields from two sources: + + 1. Embedded: the data config has ``metadata_fields`` listing which + of its own columns are metadata. + 2. External: a separate metadata-type config has ``applies_to`` + including this config name. The metadata fields are the feature + names from that metadata config. + + Embedded takes priority. For external, the first matching metadata + config wins. 
+ + """ + assert self._dataset_card is not None + self._metadata_fields_map = {} + meta_configs = self._dataset_card.get_metadata_configs() + + for data_cfg in self._dataset_card.get_data_configs(): + name = data_cfg.config_name + # Embedded case + if data_cfg.metadata_fields: + self._metadata_fields_map[name] = list(data_cfg.metadata_fields) + continue + # External case: find metadata config with applies_to + for meta_cfg in meta_configs: + if meta_cfg.applies_to and name in meta_cfg.applies_to: + self._metadata_fields_map[name] = [ + f.name for f in meta_cfg.dataset_info.features + ] + break + else: + self.logger.warning( + "No metadata fields found for data config '%s' " + "in repo '%s' -- no embedded metadata_fields and " + "no metadata config with applies_to", + name, + self.repo_id, + ) + + def get_metadata_fields(self, config_name: str) -> list[str] | None: + """ + Get metadata field names for a data configuration. + + Returns pre-computed metadata fields resolved during card loading. + Handles both embedded metadata (``metadata_fields`` on the data + config) and external metadata (separate metadata config with + ``applies_to``). + + :param config_name: Name of the data configuration + :return: List of metadata field names, or None if no metadata + + """ + # Ensure card is loaded (triggers _build_metadata_fields_map) + _ = self.dataset_card + return self._metadata_fields_map.get(config_name) + + def get_data_col_names(self, config_name: str) -> set[str]: + """ + Return the column names from the data config's feature list. + + These are the columns present in the data parquet file, derived directly from + the DataCard feature definitions without any DuckDB introspection. 
+ + :param config_name: Name of the data configuration + :return: Set of column names, empty if config not found + + """ + _ = self.dataset_card # ensure loaded + config = self.get_config(config_name) + if not config: + return set() + return {f.name for f in config.dataset_info.features} + + def get_metadata_config_name(self, config_name: str) -> str | None: + """ + Return the config_name of the external metadata config, if any. + + If the data config has embedded ``metadata_fields``, or if no + metadata config with ``applies_to`` references this config, + returns None. + + :param config_name: Name of the data configuration + :return: The metadata config name, or None + + """ + _ = self.dataset_card # ensure loaded + data_cfg = self.get_config(config_name) + if not data_cfg: + return None + # Embedded metadata -- no external config needed + if data_cfg.metadata_fields: + return None + # Find external metadata config with applies_to + for meta_cfg in self.dataset_card.get_metadata_configs(): + if meta_cfg.applies_to and config_name in meta_cfg.applies_to: + return meta_cfg.config_name + return None + + def get_dataset_schema(self, config_name: str) -> DatasetSchema | None: + """ + Return schema summary for a data configuration. + + Determines whether metadata is embedded or external, which + columns belong to data vs metadata parquets, and which columns + are shared between them (join keys for external metadata). + All information is derived from the DataCard YAML -- no DuckDB + introspection is needed. 
+ + :param config_name: Name of the data configuration + :return: DatasetSchema instance, or None if config not found + + Example -- embedded metadata:: + + schema = card.get_dataset_schema("harbison_2004") + # schema.metadata_source == "embedded" + # schema.join_columns == set() (same parquet, no JOIN) + + Example -- external metadata:: + + schema = card.get_dataset_schema("annotated_features") + # schema.metadata_source == "external" + # schema.external_metadata_config == "annotated_features_meta" + # schema.join_columns == {"id"} (common to both parquets) + + """ + _ = self.dataset_card # ensure loaded + config = self.get_config(config_name) + if not config: + return None + + is_partitioned = bool( + config.dataset_info.partitioning + and config.dataset_info.partitioning.enabled + ) + + # Embedded: metadata_fields lists which of the config's own + # columns are metadata; all live in the same parquet + if config.metadata_fields: + all_cols = {f.name for f in config.dataset_info.features} + meta_cols = set(config.metadata_fields) + data_cols = all_cols - meta_cols + return DatasetSchema( + data_columns=data_cols, + metadata_columns=meta_cols, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=is_partitioned, + ) + + # External: find metadata config with applies_to + for meta_cfg in self.dataset_card.get_metadata_configs(): + if meta_cfg.applies_to and config_name in meta_cfg.applies_to: + data_cols = {f.name for f in config.dataset_info.features} + meta_cols = {f.name for f in meta_cfg.dataset_info.features} + join_cols = data_cols & meta_cols + return DatasetSchema( + data_columns=data_cols, + metadata_columns=meta_cols, + join_columns=join_cols, + metadata_source="external", + external_metadata_config=meta_cfg.config_name, + is_partitioned=is_partitioned, + ) + + # No metadata relationship -- treat all columns as data + all_cols = {f.name for f in config.dataset_info.features} + return DatasetSchema( + 
data_columns=all_cols, + metadata_columns=set(), + join_columns=set(), + metadata_source="none", + external_metadata_config=None, + is_partitioned=is_partitioned, + ) + def get_repository_info(self) -> dict[str, Any]: """Get general repository information.""" card = self.dataset_card @@ -315,12 +526,13 @@ def extract_metadata_schema(self, config_name: str) -> dict[str, Any]: raise DataCardError(f"Configuration '{config_name}' not found") schema: dict[str, Any] = { - "regulator_fields": [], # Fields with role=regulator_identifier - "target_fields": [], # Fields with role=target_identifier - "condition_fields": [], # Fields with role=experimental_condition - "condition_definitions": {}, # Field-level condition details - "top_level_conditions": None, # Repo-level conditions - "config_level_conditions": None, # Config-level conditions + "regulator_fields": [], + "target_fields": [], + "condition_fields": [], + "condition_definitions": {}, + "metadata_fields": None, + "top_level_conditions": None, + "config_level_conditions": None, } for feature in config.dataset_info.features: @@ -333,15 +545,32 @@ def extract_metadata_schema(self, config_name: str) -> dict[str, Any]: if feature.definitions: schema["condition_definitions"][feature.name] = feature.definitions + # Include features from external metadata config + meta_fields = self.get_metadata_fields(config_name) + schema["metadata_fields"] = meta_fields + if meta_fields is not None and not config.metadata_fields: + for meta_cfg in self.dataset_card.get_metadata_configs(): + if meta_cfg.applies_to and config_name in meta_cfg.applies_to: + for feature in meta_cfg.dataset_info.features: + if feature.role == "regulator_identifier": + schema["regulator_fields"].append(feature.name) + elif feature.role == "target_identifier": + schema["target_fields"].append(feature.name) + elif feature.role == "experimental_condition": + schema["condition_fields"].append(feature.name) + if feature.definitions: + 
schema["condition_definitions"][ + feature.name + ] = feature.definitions + break + # Add top-level conditions (applies to all configs/samples) - # Stored in model_extra as dict if self.dataset_card.model_extra: top_level = self.dataset_card.model_extra.get("experimental_conditions") if top_level: schema["top_level_conditions"] = top_level # Add config-level conditions (applies to this config's samples) - # Stored in model_extra as dict if config.model_extra: config_level = config.model_extra.get("experimental_conditions") if config_level: diff --git a/tfbpapi/models.py b/tfbpapi/models.py index a8660a1..4d77f02 100644 --- a/tfbpapi/models.py +++ b/tfbpapi/models.py @@ -458,6 +458,10 @@ class DatasetVirtualDBConfig(BaseModel): description="For comparative datasets: map link_field -> " "[repo_id, config_name] pairs", ) + tags: dict[str, str] = Field( + default_factory=dict, + description="Arbitrary key/value annotations for this dataset", + ) model_config = ConfigDict(extra="allow") @@ -526,7 +530,7 @@ def parse_property_mappings(cls, data: Any) -> dict[str, Any]: result = {} for key, value in data.items(): # Known typed fields - let Pydantic handle them - if key in ("sample_id", "links", "db_name"): + if key in ("sample_id", "links", "db_name", "tags"): result[key] = value # Dict values should be PropertyMappings elif isinstance(value, dict): @@ -591,6 +595,10 @@ class RepositoryConfig(BaseModel): dataset: dict[str, DatasetVirtualDBConfig] | None = Field( None, description="Dataset-specific configurations" ) + tags: dict[str, str] = Field( + default_factory=dict, + description="Arbitrary key/value annotations for all datasets in this repo", + ) @model_validator(mode="before") @classmethod @@ -628,10 +636,10 @@ def parse_structure(cls, data: Any) -> dict[str, Any]: f"Invalid configuration for dataset '{dataset_name}': {e}" ) from e - # Parse repo-wide properties (all keys except 'dataset') + # Parse repo-wide properties (all keys except 'dataset' and 'tags') 
parsed_properties = {} for key, value in data.items(): - if key == "dataset": + if key in ("dataset", "tags"): continue try: @@ -639,7 +647,11 @@ def parse_structure(cls, data: Any) -> dict[str, Any]: except Exception as e: raise ValueError(f"Invalid repo-wide property '{key}': {e}") from e - return {"properties": parsed_properties, "dataset": parsed_datasets} + return { + "properties": parsed_properties, + "dataset": parsed_datasets, + "tags": data.get("tags") or {}, + } class MetadataConfig(BaseModel): @@ -876,3 +888,55 @@ def get_property_mappings( mappings.update(dataset_config.property_mappings) return mappings + + def get_tags(self, repo_id: str, config_name: str) -> dict[str, str]: + """ + Get merged tags for a repo/dataset combination. + + Merges repo-level and dataset-level tags, with dataset-level tags taking + precedence for the same key. + + :param repo_id: Repository ID + :param config_name: Dataset/config name + :return: Dict of merged tags + + """ + repo_config = self.get_repository_config(repo_id) + if not repo_config: + return {} + + merged: dict[str, str] = dict(repo_config.tags) + + if repo_config.dataset and config_name in repo_config.dataset: + merged.update(repo_config.dataset[config_name].tags) + + return merged + + def get_sample_id_field(self, repo_id: str, config_name: str) -> str: + """ + Resolve the actual column name for the sample identifier. + + Checks dataset-level ``sample_id`` first, then repo-level, + falling back to ``"sample_id"`` if neither is configured. 
+ + :param repo_id: Repository ID + :param config_name: Dataset/config name + :return: Column name for the sample identifier + + """ + repo_cfg = self.get_repository_config(repo_id) + if not repo_cfg: + return "sample_id" + + # Dataset-level takes precedence + if repo_cfg.dataset and config_name in repo_cfg.dataset: + ds_cfg = repo_cfg.dataset[config_name] + if ds_cfg.sample_id is not None and ds_cfg.sample_id.field: + return ds_cfg.sample_id.field + + # Repo-level fallback + repo_sample_id = repo_cfg.properties.get("sample_id") + if repo_sample_id is not None and repo_sample_id.field is not None: + return repo_sample_id.field + + return "sample_id" diff --git a/tfbpapi/tests/test_datacard.py b/tfbpapi/tests/test_datacard.py index 5f098de..b9228d1 100644 --- a/tfbpapi/tests/test_datacard.py +++ b/tfbpapi/tests/test_datacard.py @@ -5,10 +5,80 @@ import pytest from tfbpapi import DataCard +from tfbpapi.datacard import DatasetSchema from tfbpapi.errors import DataCardError, DataCardValidationError, HfDataFetchError from tfbpapi.models import DatasetType +def _external_metadata_card_data(): + """Card data with external metadata (no embedded metadata_fields).""" + return { + "configs": [ + { + "config_name": "coverage_data", + "description": "Coverage measurements", + "dataset_type": "genome_map", + "default": True, + "data_files": [{"split": "train", "path": "coverage.parquet"}], + "dataset_info": { + "features": [ + { + "name": "sample_id", + "dtype": "integer", + "description": "Sample ID", + }, + { + "name": "chr", + "dtype": "string", + "description": "Chromosome", + "role": "genomic_coordinate", + }, + { + "name": "coverage", + "dtype": "float32", + "description": "Coverage value", + "role": "quantitative_measure", + }, + ] + }, + }, + { + "config_name": "sample_metadata", + "description": "Sample metadata", + "dataset_type": "metadata", + "applies_to": ["coverage_data"], + "data_files": [{"split": "train", "path": "metadata.parquet"}], + "dataset_info": { + 
"features": [ + { + "name": "sample_id", + "dtype": "integer", + "description": "Sample ID", + }, + { + "name": "batch", + "dtype": "string", + "description": "Batch ID", + }, + { + "name": "regulator_locus_tag", + "dtype": "string", + "description": "TF locus tag", + "role": "regulator_identifier", + }, + { + "name": "regulator_symbol", + "dtype": "string", + "description": "TF symbol", + "role": "regulator_identifier", + }, + ] + }, + }, + ], + } + + class TestDataCard: """Test suite for DataCard class.""" @@ -30,6 +100,7 @@ def test_init( assert datacard.token == test_token assert datacard._dataset_card is None assert datacard._metadata_cache == {} + assert datacard._metadata_fields_map == {} # Check that fetchers were initialized mock_card_fetcher.assert_called_once_with(token=test_token) @@ -447,3 +518,393 @@ def test_extract_partition_values_fetch_error( # Should return empty set on error assert values == set() + + +class TestGetMetadataFields: + """Tests for DataCard.get_metadata_fields().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_embedded_metadata_fields( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Embedded metadata_fields on the data config are returned.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("binding_data") + + assert result == ["regulator_symbol", "experimental_condition"] + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_fields( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + 
"""External metadata via applies_to returns feature names.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("coverage_data") + + assert result == [ + "sample_id", + "batch", + "regulator_locus_tag", + "regulator_symbol", + ] + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_no_metadata_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Config with no metadata returns None.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("genomic_features") + + assert result is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Unknown config name returns None.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_fields("nonexistent") + + assert result is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_extract_schema_includes_external_features( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + 
"""extract_metadata_schema includes roles from external metadata.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + schema = datacard.extract_metadata_schema("coverage_data") + + # External metadata features with role=regulator_identifier + assert "regulator_locus_tag" in schema["regulator_fields"] + assert "regulator_symbol" in schema["regulator_fields"] + # metadata_fields key populated + assert schema["metadata_fields"] is not None + assert "sample_id" in schema["metadata_fields"] + + +class TestGetMetadataConfigName: + """Tests for DataCard.get_metadata_config_name().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_returns_config_name( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """Returns metadata config name when applies_to matches.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_config_name("coverage_data") + + assert result == "sample_metadata" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_embedded_metadata_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns None when metadata is embedded.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = 
datacard.get_metadata_config_name("binding_data") + + assert result is None + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns None for unknown config name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_metadata_config_name("nonexistent") + + assert result is None + + +class TestGetDataColNames: + """Tests for DataCard.get_data_col_names().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_returns_feature_names( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns column names from the data config's features.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_data_col_names("binding_data") + + # binding_data features: regulator_symbol, target_gene, + # experimental_condition, binding_score + assert isinstance(result, set) + assert result == { + "regulator_symbol", + "target_gene", + "experimental_condition", + "binding_score", + } + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_config_returns_data_features( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """For external metadata, 
returns data config features only.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + result = datacard.get_data_col_names("coverage_data") + + # coverage_data features: sample_id, chr, coverage + assert result == {"sample_id", "chr", "coverage"} + # Must NOT include metadata-only columns + assert "batch" not in result + assert "regulator_locus_tag" not in result + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_empty_set( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns empty set for unknown config name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_data_col_names("nonexistent") + + assert result == set() + + +class TestGetDatasetSchema: + """Tests for DataCard.get_dataset_schema().""" + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_embedded_metadata_returns_correct_schema( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Embedded metadata produces correct data/metadata column split.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + # binding_data has metadata_fields: [regulator_symbol, + # experimental_condition] and features: regulator_symbol, + # target_gene, 
experimental_condition, binding_score + result = datacard.get_dataset_schema("binding_data") + + assert result is not None + assert isinstance(result, DatasetSchema) + assert result.metadata_source == "embedded" + assert result.external_metadata_config is None + assert result.join_columns == set() + assert result.metadata_columns == { + "regulator_symbol", + "experimental_condition", + } + # data_columns = all features minus metadata_columns + assert result.data_columns == { + "target_gene", + "binding_score", + } + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_external_metadata_returns_correct_schema( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + ): + """External metadata produces correct split and join columns.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = _external_metadata_card_data() + + datacard = DataCard(test_repo_id) + # coverage_data features: sample_id, chr, coverage + # sample_metadata features: sample_id, batch, regulator_locus_tag, + # regulator_symbol + # join_columns = intersection = {sample_id} + result = datacard.get_dataset_schema("coverage_data") + + assert result is not None + assert result.metadata_source == "external" + assert result.external_metadata_config == "sample_metadata" + assert result.data_columns == {"sample_id", "chr", "coverage"} + assert result.metadata_columns == { + "sample_id", + "batch", + "regulator_locus_tag", + "regulator_symbol", + } + assert result.join_columns == {"sample_id"} + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_no_metadata_returns_all_cols_as_data( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + 
sample_dataset_card_data, + ): + """Config with no metadata relationship has all cols as data.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + # genomic_features has no metadata_fields and no applies_to + result = datacard.get_dataset_schema("genomic_features") + + assert result is not None + assert result.metadata_source == "none" + assert result.external_metadata_config is None + assert result.metadata_columns == set() + assert result.join_columns == set() + assert result.data_columns == { + "gene_id", + "gene_symbol", + "chromosome", + "start", + "end", + } + + @patch("tfbpapi.datacard.HfDataCardFetcher") + @patch("tfbpapi.datacard.HfRepoStructureFetcher") + @patch("tfbpapi.datacard.HfSizeInfoFetcher") + def test_unknown_config_returns_none( + self, + mock_size_fetcher, + mock_structure_fetcher, + mock_card_fetcher, + test_repo_id, + sample_dataset_card_data, + ): + """Returns None for an unknown config name.""" + mock_fetcher_instance = Mock() + mock_card_fetcher.return_value = mock_fetcher_instance + mock_fetcher_instance.fetch.return_value = sample_dataset_card_data + + datacard = DataCard(test_repo_id) + result = datacard.get_dataset_schema("nonexistent") + + assert result is None diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py index e62b840..cb64592 100644 --- a/tfbpapi/tests/test_virtual_db.py +++ b/tfbpapi/tests/test_virtual_db.py @@ -14,6 +14,8 @@ import pytest import yaml # type: ignore +from tfbpapi.datacard import DatasetSchema +from tfbpapi.models import MetadataConfig from tfbpapi.virtual_db import VirtualDB # ------------------------------------------------------------------ @@ -313,27 +315,87 @@ def _make_mock_datacard(repo_id): card.get_config.return_value = config_mock card.get_field_definitions.return_value = HARBISON_CONDITION_DEFS 
card.get_experimental_conditions.return_value = {} + card.get_metadata_fields.return_value = METADATA_FIELDS["harbison_2004"] + card.get_metadata_config_name.return_value = None + # Harbison: embedded metadata, condition is data col used for + # derived properties; metadata_cols are the three metadata fields + harbison_meta_cols = set(METADATA_FIELDS["harbison_2004"]) + harbison_data_cols = { + "sample_id", + "condition", + "target_locus_tag", + "effect", + "pvalue", + } - harbison_meta_cols + card.get_data_col_names.return_value = { + "sample_id", + "regulator_locus_tag", + "regulator_symbol", + "condition", + "target_locus_tag", + "effect", + "pvalue", + } + card.get_dataset_schema.return_value = DatasetSchema( + data_columns=harbison_data_cols + | { + "sample_id", + "condition", + "target_locus_tag", + "effect", + "pvalue", + }, + metadata_columns=harbison_meta_cols, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) elif repo_id == "BrentLab/kemmeren": config_mock = MagicMock() config_mock.metadata_fields = METADATA_FIELDS["kemmeren_2014"] - # model_extra at config level (no experimental_conditions - # at this level for kemmeren) config_mock.model_extra = {} card.get_config.return_value = config_mock card.get_field_definitions.return_value = {} - # model_extra at top level with experimental_conditions - # wrapper -- matches real DataCard structure dataset_card_mock = MagicMock() dataset_card_mock.model_extra = { "experimental_conditions": KEMMEREN_EXP_CONDITIONS, } card.dataset_card = dataset_card_mock + card.get_metadata_fields.return_value = METADATA_FIELDS["kemmeren_2014"] + card.get_metadata_config_name.return_value = None + kemmeren_meta_cols = set(METADATA_FIELDS["kemmeren_2014"]) + card.get_data_col_names.return_value = { + "sample_id", + "regulator_locus_tag", + "regulator_symbol", + "target_locus_tag", + "effect", + "pvalue", + } + card.get_dataset_schema.return_value = DatasetSchema( + 
data_columns={ + "sample_id", + "target_locus_tag", + "effect", + "pvalue", + }, + metadata_columns=kemmeren_meta_cols, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) else: config_mock = MagicMock() config_mock.metadata_fields = None card.get_config.return_value = config_mock card.get_field_definitions.return_value = {} card.get_experimental_conditions.return_value = {} + card.get_metadata_fields.return_value = None + card.get_metadata_config_name.return_value = None + card.get_data_col_names.return_value = set() + card.get_dataset_schema.return_value = None return card @@ -408,6 +470,201 @@ def test_db_name_map(self, config_path): ) +# ------------------------------------------------------------------ +# Tests: Tags +# ------------------------------------------------------------------ + + +class TestTags: + """Tests for get_tags() hierarchical merging.""" + + def _make_config(self, yaml_str: str) -> MetadataConfig: + import yaml as _yaml + + return MetadataConfig.model_validate(_yaml.safe_load(yaml_str)) + + def test_repo_level_tags_only(self): + """Repo-level tags propagate when dataset has none.""" + config = self._make_config( + """ + repositories: + BrentLab/harbison: + tags: + assay: binding + organism: yeast + dataset: + harbison_2004: + sample_id: + field: sample_id + """ + ) + tags = config.get_tags("BrentLab/harbison", "harbison_2004") + assert tags == {"assay": "binding", "organism": "yeast"} + + def test_dataset_level_tags_only(self): + """Dataset-level tags are returned when repo has none.""" + config = self._make_config( + """ + repositories: + BrentLab/harbison: + dataset: + harbison_2004: + sample_id: + field: sample_id + tags: + assay: chip-chip + """ + ) + tags = config.get_tags("BrentLab/harbison", "harbison_2004") + assert tags == {"assay": "chip-chip"} + + def test_dataset_overrides_repo_tags(self): + """Dataset-level tags override repo-level for the same key.""" + config = 
self._make_config( + """ + repositories: + BrentLab/harbison: + tags: + assay: binding + organism: yeast + dataset: + harbison_2004: + sample_id: + field: sample_id + tags: + assay: chip-chip + """ + ) + tags = config.get_tags("BrentLab/harbison", "harbison_2004") + assert tags["assay"] == "chip-chip" + assert tags["organism"] == "yeast" + + def test_no_tags(self): + """Returns empty dict when neither level has tags.""" + config = self._make_config( + """ + repositories: + BrentLab/harbison: + dataset: + harbison_2004: + sample_id: + field: sample_id + """ + ) + tags = config.get_tags("BrentLab/harbison", "harbison_2004") + assert tags == {} + + def test_unknown_repo_returns_empty(self): + """Unknown repo_id returns empty dict.""" + config = self._make_config( + """ + repositories: + BrentLab/harbison: + dataset: + harbison_2004: + sample_id: + field: sample_id + """ + ) + assert config.get_tags("BrentLab/nonexistent", "harbison_2004") == {} + + def test_yaml_round_trip(self): + """Tags parsed from YAML produce correct merged result.""" + config = self._make_config( + """ + repositories: + BrentLab/repo_a: + tags: + type: primary + organism: yeast + dataset: + dataset_a: + sample_id: + field: sample_id + tags: + type: binding + version: "2024" + BrentLab/repo_b: + tags: + type: perturbation + dataset: + dataset_b: + sample_id: + field: sample_id + """ + ) + tags_a = config.get_tags("BrentLab/repo_a", "dataset_a") + assert tags_a == {"type": "binding", "organism": "yeast", "version": "2024"} + + tags_b = config.get_tags("BrentLab/repo_b", "dataset_b") + assert tags_b == {"type": "perturbation"} + + def _make_vdb(self, yaml_str: str, tmp_path) -> VirtualDB: + + p = tmp_path / "config.yaml" + p.write_text(yaml_str) + return VirtualDB(str(p)) + + def test_vdb_get_tags_returns_merged(self, tmp_path): + """VirtualDB.get_tags() returns merged repo+dataset tags by db_name.""" + vdb = self._make_vdb( + """ + repositories: + BrentLab/harbison: + tags: + assay: binding + 
organism: yeast + dataset: + harbison_2004: + db_name: harbison + sample_id: + field: sample_id + tags: + assay: chip-chip + """, + tmp_path, + ) + tags = vdb.get_tags("harbison") + assert tags == {"assay": "chip-chip", "organism": "yeast"} + + def test_vdb_get_tags_unknown_name_returns_empty(self, tmp_path): + """VirtualDB.get_tags() returns empty dict for unknown db_name.""" + vdb = self._make_vdb( + """ + repositories: + BrentLab/harbison: + dataset: + harbison_2004: + db_name: harbison + sample_id: + field: sample_id + """, + tmp_path, + ) + assert vdb.get_tags("nonexistent") == {} + + def test_vdb_get_tags_no_views_needed(self, tmp_path): + """VirtualDB.get_tags() works before any views are registered.""" + vdb = self._make_vdb( + """ + repositories: + BrentLab/harbison: + tags: + assay: binding + dataset: + harbison_2004: + db_name: harbison + sample_id: + field: sample_id + """, + tmp_path, + ) + assert not vdb._views_registered + tags = vdb.get_tags("harbison") + assert tags == {"assay": "binding"} + assert not vdb._views_registered + + # ------------------------------------------------------------------ # Tests: View registration # ------------------------------------------------------------------ @@ -780,3 +1037,283 @@ def test_lazy_init(self, config_path): v = VirtualDB(config_path) assert v._conn is None assert not v._views_registered + + +# ------------------------------------------------------------------ +# Tests: dynamic sample_id column +# ------------------------------------------------------------------ + + +class TestDynamicSampleId: + """Tests that the sample identifier column is resolved from config.""" + + def test_non_default_sample_id(self, tmp_path, monkeypatch): + """Views work when sample_id maps to a non-default column.""" + import tfbpapi.virtual_db as vdb_module + + # Config uses experiment_id as the sample identifier + config = { + "repositories": { + "TestOrg/custom_id": { + "dataset": { + "custom_data": { + "db_name": "custom", + 
"sample_id": { + "field": "experiment_id", + }, + "regulator": { + "field": "regulator", + }, + } + } + } + } + } + config_path = tmp_path / "config.yaml" + with open(config_path, "w") as f: + yaml.dump(config, f) + + # Parquet uses experiment_id (not sample_id) + df = pd.DataFrame( + { + "experiment_id": [100, 100, 200, 200], + "regulator": ["TF1", "TF1", "TF2", "TF2"], + "target": ["G1", "G2", "G1", "G2"], + "score": [1.5, 0.8, 2.1, 0.3], + } + ) + parquet_path = tmp_path / "custom.parquet" + files = { + ("TestOrg/custom_id", "custom_data"): [_write_parquet(parquet_path, df)], + } + + # Mock datacard + mock_card = MagicMock() + mock_card.get_metadata_fields.return_value = [ + "regulator", + ] + mock_card.get_field_definitions.return_value = {} + mock_card.get_experimental_conditions.return_value = {} + mock_card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"experiment_id", "target", "score"}, + metadata_columns={"regulator"}, + join_columns=set(), + metadata_source="embedded", + external_metadata_config=None, + is_partitioned=False, + ) + + v = VirtualDB(config_path) + + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cn: files.get((repo_id, cn), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: mock_card, + ) + + # Meta view should have experiment_id + regulator + meta_df = v.query("SELECT * FROM custom_meta") + assert "experiment_id" in meta_df.columns + assert len(meta_df) == 2 # 2 distinct samples + + # Enriched raw view should JOIN on experiment_id + raw_df = v.query("SELECT * FROM custom") + assert "experiment_id" in raw_df.columns + assert len(raw_df) == 4 # all rows + + def test_get_sample_id_field_dataset_level(self): + """Dataset-level sample_id takes precedence.""" + config = MetadataConfig.model_validate( + { + "repositories": { + "Org/repo": { + "dataset": { + "ds": { + "sample_id": { + "field": "my_id", + }, + } + } + } + } + } + ) + assert 
config.get_sample_id_field("Org/repo", "ds") == "my_id" + + def test_get_sample_id_field_repo_level(self): + """Repo-level sample_id used when dataset has none.""" + config = MetadataConfig.model_validate( + { + "repositories": { + "Org/repo": { + "sample_id": {"field": "repo_sid"}, + "dataset": {"ds": {}}, + } + } + } + ) + assert config.get_sample_id_field("Org/repo", "ds") == "repo_sid" + + def test_get_sample_id_field_default(self): + """Falls back to 'sample_id' when not configured.""" + config = MetadataConfig.model_validate( + {"repositories": {"Org/repo": {"dataset": {"ds": {}}}}} + ) + assert config.get_sample_id_field("Org/repo", "ds") == "sample_id" + + def test_get_sample_id_field_dataset_overrides_repo(self): + """Dataset-level overrides repo-level.""" + config = MetadataConfig.model_validate( + { + "repositories": { + "Org/repo": { + "sample_id": {"field": "repo_id_col"}, + "dataset": { + "ds": { + "sample_id": { + "field": "ds_id_col", + }, + } + }, + } + } + } + ) + assert config.get_sample_id_field("Org/repo", "ds") == "ds_id_col" + + +class TestExternalMetadata: + """Tests for datasets with external metadata parquet files.""" + + def test_external_metadata_join(self, tmp_path, monkeypatch): + """Meta view JOINs data and metadata parquet when metadata is in a separate + config.""" + import tfbpapi.virtual_db as vdb_module + + # Data parquet: measurements with sample_id but no + # metadata columns like db_id or batch + data_df = pd.DataFrame( + { + "sample_id": [1, 1, 2, 2], + "target_locus_tag": [ + "YAL001C", + "YAL002W", + "YAL001C", + "YAL002W", + ], + "effect": [1.5, 0.8, 2.1, 0.3], + } + ) + # Metadata parquet: sample-level metadata + meta_df = pd.DataFrame( + { + "sample_id": [1, 2], + "db_id": [101, 102], + "regulator_locus_tag": ["YBR049C", "YDR463W"], + "background_hops": [500, 600], + } + ) + + data_path = _write_parquet(tmp_path / "data.parquet", data_df) + meta_path = _write_parquet(tmp_path / "meta.parquet", meta_df) + + parquet_files 
= { + ("TestOrg/repo", "chip_data"): [data_path], + ("TestOrg/repo", "sample_metadata"): [meta_path], + } + + config = { + "repositories": { + "TestOrg/repo": { + "sample_id": {"field": "sample_id"}, + "dataset": { + "chip_data": { + "db_name": "chip", + "regulator_locus_tag": { + "field": "regulator_locus_tag", + }, + } + }, + } + } + } + config_file = tmp_path / "config.yaml" + with open(config_file, "w") as f: + yaml.dump(config, f) + + # Mock DataCard: external metadata via applies_to + card = MagicMock() + config_mock = MagicMock() + config_mock.metadata_fields = None # no embedded + card.get_config.return_value = config_mock + card.get_metadata_fields.return_value = [ + "sample_id", + "db_id", + "regulator_locus_tag", + "background_hops", + ] + card.get_metadata_config_name.return_value = "sample_metadata" + # Data parquet columns (from chip_data features) + card.get_data_col_names.return_value = { + "sample_id", + "target_locus_tag", + "effect", + } + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + # External metadata schema: data cols in data parquet, + # metadata cols in metadata parquet, joined on sample_id + card.get_dataset_schema.return_value = DatasetSchema( + data_columns={"sample_id", "target_locus_tag", "effect"}, + metadata_columns={ + "sample_id", + "db_id", + "regulator_locus_tag", + "background_hops", + }, + join_columns={"sample_id"}, + metadata_source="external", + external_metadata_config="sample_metadata", + is_partitioned=False, + ) + + v = VirtualDB(config_file) + monkeypatch.setattr( + VirtualDB, + "_resolve_parquet_files", + lambda self, repo_id, cfg: parquet_files.get((repo_id, cfg), []), + ) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: card, + ) + + # Trigger view registration + tables = v.tables() + assert "chip" in tables + assert "chip_meta" in tables + + # Meta view should have columns from both parquets + meta_result = v.query("SELECT 
* FROM chip_meta ORDER BY sample_id") + meta_cols = set(meta_result.columns) + assert "sample_id" in meta_cols + assert "db_id" in meta_cols + assert "regulator_locus_tag" in meta_cols + assert "background_hops" in meta_cols + + # Verify data is correct (joined properly) + assert len(meta_result) == 2 + row1 = meta_result[meta_result["sample_id"] == 1].iloc[0] + assert row1["db_id"] == 101 + assert row1["regulator_locus_tag"] == "YBR049C" + + # Enriched raw view should also work + raw_result = v.query("SELECT * FROM chip ORDER BY sample_id") + assert "db_id" in raw_result.columns + assert len(raw_result) == 4 # 4 data rows diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py index 96097a3..1ac968c 100644 --- a/tfbpapi/virtual_db.py +++ b/tfbpapi/virtual_db.py @@ -11,8 +11,8 @@ views (one row per sample with derived columns) and full data views (measurement-level data joined to metadata). For comparative analysis datasets, VirtualDB creates expanded views that parse composite ID fields into ``_source`` (aliased to the configured -db_name) and ``_id`` (sample_id) columns. The expectation is that a developer will -use this interface to write SQL queries against the views to provide an API to +db_name) and ``_id`` (sample identifier) columns. The expectation is that a developer +will use this interface to write SQL queries against the views to provide an API to downstream users and applications. Example Usage:: @@ -49,8 +49,9 @@ import duckdb import pandas as pd +from duckdb import BinderException -from tfbpapi.datacard import DataCard +from tfbpapi.datacard import DataCard, DatasetSchema from tfbpapi.models import MetadataConfig logger = logging.getLogger(__name__) @@ -389,6 +390,27 @@ def get_common_fields(self) -> list[str]: common = set.intersection(*sets) return sorted(common) + def get_tags(self, db_name: str) -> dict[str, str]: + """ + Return the merged tags for a dataset. + + Tags are defined in the configuration at the repository and/or + dataset level. 
Dataset-level tags override repository-level tags + with the same key. See the ``tags`` section of the configuration + guide for details. + + :param db_name: Dataset name as it appears in :meth:`tables` (the + resolved ``db_name`` from the configuration, or the + ``config_name`` if ``db_name`` was not explicitly set). + :return: Dict of merged tags, or empty dict if the dataset has no + tags or the name is not found. + + """ + if db_name not in self._db_name_map: + return {} + repo_id, config_name = self._db_name_map[db_name] + return self.config.get_tags(repo_id, config_name) + # ------------------------------------------------------------------ # Lazy initialisation # ------------------------------------------------------------------ @@ -414,6 +436,62 @@ def _register_all_views(self) -> None: parquet_only=comparative, ) + # 1b. Resolve external metadata parquet views. + # When a data config's metadata lives in a separate HF config + # (applies_to), register its parquet as ___metadata_parquet. + # All information is derived from DataCard YAML -- no DuckDB + # introspection needed. 
+ self._dataset_schemas: dict[str, DatasetSchema] = {} + self._external_meta_views: dict[str, str] = {} + for db_name, (repo_id, config_name) in self._db_name_map.items(): + if self._is_comparative(repo_id, config_name): + continue + try: + card = _cached_datacard(repo_id, token=self.token) + schema = card.get_dataset_schema(config_name) + except Exception as exc: + logger.warning( + "Could not get dataset schema for %s/%s: %s", + repo_id, + config_name, + exc, + ) + continue + if schema is not None: + self._dataset_schemas[db_name] = schema + if ( + schema is None + or schema.metadata_source != "external" + or not schema.external_metadata_config + ): + continue + meta_view = f"__{db_name}_metadata_parquet" + files = self._resolve_parquet_files( + repo_id, schema.external_metadata_config + ) + if not files: + logger.warning( + "No parquet files for external metadata config " + "'%s' in repo '%s'", + schema.external_metadata_config, + repo_id, + ) + continue + files_sql = ", ".join(f"'{f}'" for f in files) + try: + self._db.execute( + f"CREATE OR REPLACE VIEW {meta_view} AS " + f"SELECT * FROM read_parquet([{files_sql}])" + ) + except Exception as exc: + logger.warning( + "Failed to create external metadata view '%s': %s", + meta_view, + exc, + ) + continue + self._external_meta_views[db_name] = meta_view + # 2. Metadata views for primary datasets (_meta) # This is based on the metadata defined in the datacard, # and includes any additional derived columns based on the @@ -567,62 +645,150 @@ def _register_raw_view( def _register_meta_view(self, db_name: str, repo_id: str, config_name: str) -> None: """ - Register a ``_meta`` view with one row per sample_id. + Register a ``_meta`` view with one row per sample. + + Includes metadata columns from the DataCard plus any derived columns + from config property mappings (resolved against DataCard definitions + with factor aliases applied). 
- Includes raw metadata columns from the DataCard plus any derived columns from - config property mappings (resolved against DataCard definitions with factor - aliases applied). + For datasets with external metadata (a separate HF config with + ``applies_to``), JOINs the data parquet to the metadata parquet + on the configured sample_id column. The actual columns in the metadata + parquet are determined by DuckDB introspection (``DESCRIBE``) rather + than the DataCard feature list, because DataCard feature lists are + conceptual schemas that may include columns not physically present + in the parquet files. :param db_name: Base view name for the primary dataset :param repo_id: Repository ID :param config_name: Configuration name + raises ValueError: If no metadata fields are found. + raises BinderException: If view creation fails, with SQL details. + """ parquet_view = f"__{db_name}_parquet" if not self._view_exists(parquet_view): return - meta_cols = self._resolve_metadata_fields(repo_id, config_name) - prop_result = self._resolve_property_columns(repo_id, config_name) + sample_col = self._get_sample_id_col(db_name) - if prop_result is not None: - derived_exprs, prop_raw_cols = prop_result - # Raw cols = metadata_fields + any source fields needed - # by property mappings - if meta_cols is not None: - raw = list(dict.fromkeys(["sample_id"] + meta_cols + prop_raw_cols)) - else: - raw = list(dict.fromkeys(["sample_id"] + prop_raw_cols)) + # Pull ext_meta_view early -- needed for both meta_cols and + # FROM clause construction. 
+ schema: DatasetSchema | None = getattr(self, "_dataset_schemas", {}).get( + db_name + ) + ext_meta_view: str | None = getattr(self, "_external_meta_views", {}).get( + db_name + ) - raw_sql = ", ".join(raw) + is_external = ( + ext_meta_view is not None + and schema is not None + and schema.metadata_source == "external" + ) - # Outer SELECT: raw cols + derived expressions - outer_parts = list(raw) + derived_exprs - outer_sql = ", ".join(outer_parts) + if is_external: + # DataCard feature lists are conceptual -- columns listed there + # may not be physically present in the parquet file. Use DuckDB + # introspection to get the actual columns in the metadata parquet. + assert ext_meta_view is not None + actual_meta_cols: set[str] = set(self._get_view_columns(ext_meta_view)) + meta_cols: list[str] = sorted(actual_meta_cols) + elif schema is not None: + actual_meta_cols = schema.metadata_columns + meta_cols = sorted(actual_meta_cols) + else: + meta_cols = self._resolve_metadata_fields(repo_id, config_name) or [] + actual_meta_cols = set(meta_cols) - self._db.execute( - f"CREATE OR REPLACE VIEW {db_name}_meta AS " - f"SELECT DISTINCT {outer_sql} " - f"FROM (" - f"SELECT DISTINCT {raw_sql} " - f"FROM {parquet_view}" - f") AS __raw" + if not meta_cols: + raise ValueError( + f"No metadata fields found for {repo_id}/{config_name}. " + f"Cannot create meta view '{db_name}_meta'." ) - elif meta_cols is not None: - # Fallback: metadata_fields only, no property mappings - cols = list(dict.fromkeys(["sample_id"] + meta_cols)) - cols_sql = ", ".join(cols) - self._db.execute( - f"CREATE OR REPLACE VIEW {db_name}_meta AS " - f"SELECT DISTINCT {cols_sql} " - f"FROM {parquet_view}" + + # FROM clause: JOIN data + metadata parquets when external, + # plain parquet view otherwise. + if is_external: + assert ext_meta_view is not None + # Use the configured sample_id column as the join key. 
+ # The DataCard feature intersection (schema.join_columns) + # is unreliable because a data config's feature list may + # document columns that are physically only in the metadata + # parquet (present conceptually after a join, not in the + # physical data parquet file). + from_clause = ( + f"{parquet_view} d " f"JOIN {ext_meta_view} m " f"USING ({sample_col})" ) + is_join = True else: - # No metadata_fields at all -- all columns are metadata - self._db.execute( - f"CREATE OR REPLACE VIEW {db_name}_meta AS " - f"SELECT DISTINCT * FROM {parquet_view}" - ) + from_clause = parquet_view + is_join = False + + def qualify(col: str) -> str: + """Return qualified column name for JOIN context.""" + if not is_join: + return col + if col == sample_col: + return col # USING makes join key unqualified + # Use the actual metadata parquet columns (from DuckDB + # introspection) to decide qualification, not the DataCard + # feature list which may be inaccurate. + if col in actual_meta_cols: + return f"m.{col}" + return f"d.{col}" + + # Build SELECT: sample_id + metadata cols (deduplicated) + seen: set[str] = set() + select_parts: list[str] = [] + + def add_col(col: str) -> None: + if col not in seen: + seen.add(col) + select_parts.append(qualify(col)) + + add_col(sample_col) + for col in meta_cols: + add_col(col) + + # Add derived property expressions from the VirtualDB config + prop_result = self._resolve_property_columns(repo_id, config_name) + if prop_result is not None: + derived_exprs, prop_raw_cols = prop_result + # Ensure source columns needed by expressions are selected + for col in prop_raw_cols: + add_col(col) + # Qualify source column references inside CASE WHEN expressions + if is_join: + qualified_exprs = [] + for expr in derived_exprs: + for raw_col in prop_raw_cols: + q = qualify(raw_col) + if q != raw_col: + # Replace bare column name in CASE WHEN patterns + expr = expr.replace( + f"CASE {raw_col} ", f"CASE {q} " + ).replace(f" {raw_col} = ", f" {q} = ") + 
qualified_exprs.append(expr) + derived_exprs = qualified_exprs + select_parts.extend(derived_exprs) + + cols_sql = ", ".join(select_parts) + sql = ( + f"CREATE OR REPLACE VIEW {db_name}_meta AS " + f"SELECT DISTINCT {cols_sql} FROM {from_clause}" + ) + try: + self._db.execute(sql) + except BinderException as exc: + raise BinderException( + f"Failed to create meta view '{db_name}_meta'.\n" + f" schema: {schema}\n" + f" from_clause: {from_clause}\n" + f" SQL: {sql}\n" + f" error: {exc}" + ) from exc def _enrich_raw_view(self, db_name: str) -> None: """ @@ -648,40 +814,58 @@ def _enrich_raw_view(self, db_name: str) -> None: if not extra_cols: return + sample_col = self._get_sample_id_col(db_name) extra_select = ", ".join(f"m.{c}" for c in sorted(extra_cols)) self._db.execute( f"CREATE OR REPLACE VIEW {db_name} AS " f"SELECT r.*, {extra_select} " f"FROM {parquet_name} r " - f"JOIN {meta_name} m USING (sample_id)" + f"JOIN {meta_name} m USING ({sample_col})" ) def _get_view_columns(self, view: str) -> list[str]: - """Return column names for a view.""" - df = self._db.execute( - f"SELECT column_name FROM information_schema.columns " - f"WHERE table_name = '{view}'" - ).fetchdf() + """ + Return column names for a view. + + Uses ``DESCRIBE`` rather than ``information_schema`` to force + eager schema resolution for ``read_parquet``-backed views, + which DuckDB may evaluate lazily. + + """ + df = self._db.execute(f"DESCRIBE {view}").fetchdf() return df["column_name"].tolist() + def _get_sample_id_col(self, db_name: str) -> str: + """ + Resolve the sample identifier column name for a dataset. + + :param db_name: Resolved database view name + :return: Actual column name for the sample identifier + + """ + repo_id, config_name = self._db_name_map[db_name] + return self.config.get_sample_id_field(repo_id, config_name) + def _resolve_metadata_fields( self, repo_id: str, config_name: str ) -> list[str] | None: """ - Get the metadata_fields list from the DataCard config. 
+ Get metadata field names from the DataCard. + + Delegates to ``DataCard.get_metadata_fields()`` which handles + both embedded metadata_fields and external metadata configs + (via applies_to). :param repo_id: Repository ID :param config_name: Configuration name - :return: List of metadata field names, or None if not specified + :return: List of metadata field names, or None if not found """ try: card = _cached_datacard(repo_id, token=self.token) - config = card.get_config(config_name) - if config and config.metadata_fields: - return list(config.metadata_fields) + return card.get_metadata_fields(config_name) except Exception: - logger.debug( + logger.error( "Could not resolve metadata_fields for %s/%s", repo_id, config_name, @@ -975,7 +1159,7 @@ def _register_comparative_expanded_view( - ``_source`` -- the ``repo_id;config_name`` prefix, aliased to the configured ``db_name`` when available. - - ``_id`` -- the sample_id component. + - ``_id`` -- the sample identifier component. :param db_name: Base view name for the comparative dataset :param ds_cfg: DatasetVirtualDBConfig with ``links``