diff --git a/docs/tutorials/virtual_db_tutorial.ipynb b/docs/tutorials/virtual_db_tutorial.ipynb
index 7305146..bb07e75 100644
--- a/docs/tutorials/virtual_db_tutorial.ipynb
+++ b/docs/tutorials/virtual_db_tutorial.ipynb
@@ -33,7 +33,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Config saved to: /tmp/tmpf610qghb/vdb_config.yaml\n"
+ "Config saved to: /tmp/tmp_krovt13/vdb_config.yaml\n"
]
}
],
@@ -41,6 +41,10 @@
"config_yaml = \"\"\"\n",
"repositories:\n",
" BrentLab/harbison_2004:\n",
+ " tags:\n",
+ " assay: binding\n",
+ " method: chip-chip\n",
+ " organism: yeast\n",
" dataset:\n",
" harbison_2004:\n",
" db_name: harbison\n",
@@ -59,6 +63,10 @@
" field: regulator_symbol\n",
"\n",
" BrentLab/kemmeren_2014:\n",
+ " tags:\n",
+ " assay: perturbation\n",
+ " method: microarray\n",
+ " organism: yeast\n",
" dataset:\n",
" kemmeren_2014:\n",
" db_name: kemmeren\n",
@@ -75,8 +83,17 @@
" field: regulator_symbol\n",
"\n",
" BrentLab/hackett_2020:\n",
+ " # Repo-level tags apply to all datasets in this repository\n",
+ " tags:\n",
+ " method: test_overwrite\n",
+ " organism: yeast\n",
" dataset:\n",
" hackett_2020:\n",
+ " # Dataset-level tags: 'assay' is new,\n",
+ " # 'method' overrides the repo-level value\n",
+ " tags:\n",
+ " assay: perturbation\n",
+ " method: overexpression\n",
" db_name: hackett\n",
" sample_id:\n",
" field: sample_id\n",
@@ -169,6 +186,56 @@
"print(repr(vdb))"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "0f10c138",
+ "metadata": {},
+ "source": [
+ "## Tags\n",
+ "\n",
+ "Tags are arbitrary key/value annotations defined in the configuration. They\n",
+ "follow the same hierarchy as property mappings: repo-level tags apply to all\n",
+ "datasets in that repository, and dataset-level tags override repo-level tags\n",
+ "with the same key.\n",
+ "\n",
+ "Use `vdb.get_tags(db_name)` to retrieve the merged tags for any dataset;\n",
+ "`config.get_tags(repo_id, config_name)` is the config-level equivalent."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "f7d73db0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "harbison tags: {'assay': 'binding', 'method': 'chip-chip', 'organism': 'yeast'}\n",
+ "kemmeren tags: {'assay': 'perturbation', 'method': 'microarray', 'organism': 'yeast'}\n",
+ "hackett tags: {'method': 'overexpression', 'organism': 'yeast', 'assay': 'perturbation'}\n",
+ "dto tags: {}\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "# Tags are accessible directly from the VirtualDB instance using the db_name.\n",
+ "# No need to import MetadataConfig or specify repo_id.\n",
+ "print(\"harbison tags:\", vdb.get_tags(\"harbison\"))\n",
+ "print(\"kemmeren tags:\", vdb.get_tags(\"kemmeren\"))\n",
+ "\n",
+ "# Hackett has tags at both levels:\n",
+ "# 'organism' comes from the repo level only,\n",
+ "# 'assay' is added at the dataset level only,\n",
+ "# 'method' is defined at both levels -- the dataset value wins.\n",
+ "print(\"hackett tags:\", vdb.get_tags(\"hackett\"))\n",
+ "\n",
+ "# Dataset with no tags returns empty dict\n",
+ "print(\"dto tags:\", vdb.get_tags(\"dto\"))"
+ ]
+ },
{
"cell_type": "markdown",
"id": "cell-5",
@@ -187,7 +254,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"id": "cell-6",
"metadata": {},
"outputs": [
@@ -202,10 +269,11 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6374.32it/s]\n",
- "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 12264.05it/s]\n",
- "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 9731.56it/s]\n",
- "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 21883.33it/s]\n",
+ "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10407.70it/s]\n",
+ "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 1770.50it/s]\n",
+ "Fetching 1 files: 100%|██████████| 1/1 [00:20<00:00, 20.31s/it]\n",
+ "No metadata fields found for data config 'dto' in repo 'BrentLab/yeast_comparative_analysis' -- no embedded metadata_fields and no metadata config with applies_to\n",
+ "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 55091.56it/s]\n",
"Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n",
"Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n",
"Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n",
@@ -235,7 +303,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"id": "pdebujnqb9q",
"metadata": {},
"outputs": [
@@ -284,7 +352,7 @@
"type": "unknown"
}
],
- "ref": "8720a362-ea0c-4293-9656-ba6725dcaa3d",
+ "ref": "955566a4-2a55-483f-a0d4-11f1757f6a28",
"rows": [
[
"0",
@@ -299,7 +367,7 @@
[
"1",
"harbison_meta",
- "regulator_locus_tag",
+ "condition",
"VARCHAR",
"YES",
null,
@@ -309,7 +377,7 @@
[
"2",
"harbison_meta",
- "regulator_symbol",
+ "regulator_locus_tag",
"VARCHAR",
"YES",
null,
@@ -319,7 +387,7 @@
[
"3",
"harbison_meta",
- "condition",
+ "regulator_symbol",
"VARCHAR",
"YES",
null,
@@ -394,7 +462,7 @@
"
\n",
" | 1 | \n",
" harbison_meta | \n",
- " regulator_locus_tag | \n",
+ " condition | \n",
" VARCHAR | \n",
" YES | \n",
" None | \n",
@@ -404,7 +472,7 @@
"
\n",
" | 2 | \n",
" harbison_meta | \n",
- " regulator_symbol | \n",
+ " regulator_locus_tag | \n",
" VARCHAR | \n",
" YES | \n",
" None | \n",
@@ -414,7 +482,7 @@
"
\n",
" | 3 | \n",
" harbison_meta | \n",
- " condition | \n",
+ " regulator_symbol | \n",
" VARCHAR | \n",
" YES | \n",
" None | \n",
@@ -448,14 +516,14 @@
"text/plain": [
" table column_name column_type null key default extra\n",
"0 harbison_meta sample_id INTEGER YES None None None\n",
- "1 harbison_meta regulator_locus_tag VARCHAR YES None None None\n",
- "2 harbison_meta regulator_symbol VARCHAR YES None None None\n",
- "3 harbison_meta condition VARCHAR YES None None None\n",
+ "1 harbison_meta condition VARCHAR YES None None None\n",
+ "2 harbison_meta regulator_locus_tag VARCHAR YES None None None\n",
+ "3 harbison_meta regulator_symbol VARCHAR YES None None None\n",
"4 harbison_meta carbon_source VARCHAR YES None None None\n",
"5 harbison_meta temperature_celsius DOUBLE YES None None None"
]
},
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -468,7 +536,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"id": "9deee334",
"metadata": {},
"outputs": [
@@ -517,7 +585,7 @@
"type": "unknown"
}
],
- "ref": "001db2c7-a5c2-4561-9b12-35733ce1b2e6",
+ "ref": "012ff714-cded-469d-9c53-642872a5d487",
"rows": [
[
"0",
@@ -793,7 +861,7 @@
"10 harbison temperature_celsius DOUBLE YES None None None"
]
},
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -805,7 +873,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"id": "cell-9",
"metadata": {},
"outputs": [
@@ -839,7 +907,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"id": "1a705f1c",
"metadata": {},
"outputs": [
@@ -858,17 +926,17 @@
"type": "integer"
},
{
- "name": "regulator_locus_tag",
+ "name": "condition",
"rawType": "object",
"type": "string"
},
{
- "name": "regulator_symbol",
+ "name": "regulator_locus_tag",
"rawType": "object",
"type": "string"
},
{
- "name": "condition",
+ "name": "regulator_symbol",
"rawType": "object",
"type": "string"
},
@@ -883,50 +951,50 @@
"type": "float"
}
],
- "ref": "e5bb4909-b231-44d7-85b8-5219b51f4a4b",
+ "ref": "b5c797e5-c834-4f9d-a8fc-89789ef0cc68",
"rows": [
[
"0",
- "166",
- "YIL131C",
- "FKH1",
+ "300",
"YPD",
+ "YOL116W",
+ "MSN1",
"glucose",
"30.0"
],
[
"1",
- "3",
- "YBL005W",
- "PDR3",
+ "113",
"YPD",
+ "YGL035C",
+ "MIG1",
"glucose",
"30.0"
],
[
"2",
- "173",
- "YIR023W",
- "DAL81",
- "YPD",
+ "81",
+ "RAPA",
+ "YEL009C",
+ "GCN4",
"glucose",
"30.0"
],
[
"3",
- "220",
- "YLR014C",
- "PPR1",
+ "279",
"YPD",
+ "YNL139C",
+ "THO2",
"glucose",
"30.0"
],
[
"4",
- "83",
- "YEL009C",
- "GCN4",
- "YPD",
+ "73",
+ "H2O2Hi",
+ "YDR423C",
+ "CAD1",
"glucose",
"30.0"
]
@@ -956,9 +1024,9 @@
"
\n",
" | \n",
" sample_id | \n",
+ " condition | \n",
" regulator_locus_tag | \n",
" regulator_symbol | \n",
- " condition | \n",
" carbon_source | \n",
" temperature_celsius | \n",
"
\n",
@@ -966,46 +1034,46 @@
" \n",
" \n",
" | 0 | \n",
- " 166 | \n",
- " YIL131C | \n",
- " FKH1 | \n",
+ " 300 | \n",
" YPD | \n",
+ " YOL116W | \n",
+ " MSN1 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 3 | \n",
- " YBL005W | \n",
- " PDR3 | \n",
+ " 113 | \n",
" YPD | \n",
+ " YGL035C | \n",
+ " MIG1 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 173 | \n",
- " YIR023W | \n",
- " DAL81 | \n",
- " YPD | \n",
+ " 81 | \n",
+ " RAPA | \n",
+ " YEL009C | \n",
+ " GCN4 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 220 | \n",
- " YLR014C | \n",
- " PPR1 | \n",
+ " 279 | \n",
" YPD | \n",
+ " YNL139C | \n",
+ " THO2 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 83 | \n",
- " YEL009C | \n",
- " GCN4 | \n",
- " YPD | \n",
+ " 73 | \n",
+ " H2O2Hi | \n",
+ " YDR423C | \n",
+ " CAD1 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
@@ -1014,12 +1082,12 @@
""
],
"text/plain": [
- " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n",
- "0 166 YIL131C FKH1 YPD glucose \n",
- "1 3 YBL005W PDR3 YPD glucose \n",
- "2 173 YIR023W DAL81 YPD glucose \n",
- "3 220 YLR014C PPR1 YPD glucose \n",
- "4 83 YEL009C GCN4 YPD glucose \n",
+ " sample_id condition regulator_locus_tag regulator_symbol carbon_source \\\n",
+ "0 300 YPD YOL116W MSN1 glucose \n",
+ "1 113 YPD YGL035C MIG1 glucose \n",
+ "2 81 RAPA YEL009C GCN4 glucose \n",
+ "3 279 YPD YNL139C THO2 glucose \n",
+ "4 73 H2O2Hi YDR423C CAD1 glucose \n",
"\n",
" temperature_celsius \n",
"0 30.0 \n",
@@ -1029,7 +1097,7 @@
"4 30.0 "
]
},
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -1055,7 +1123,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"id": "cell-17",
"metadata": {},
"outputs": [
@@ -1124,75 +1192,75 @@
"type": "float"
}
],
- "ref": "a6cb8a91-c1c2-4bc8-af51-12e900d7a4bf",
+ "ref": "06d7e391-9665-4a5f-9276-359ff8e71c3e",
"rows": [
[
"0",
- "14",
- "13.0",
+ "15",
+ "14.0",
"YBR049C",
"REB1",
- "H2O2Lo",
+ "YPD",
"YPR204W",
"YPR204W",
- "0.78449615",
- "0.53566521",
+ "0.85288861",
+ "0.76943045",
"glucose",
"30.0"
],
[
"1",
- "14",
- "13.0",
+ "15",
+ "14.0",
"YBR049C",
"REB1",
- "H2O2Lo",
+ "YPD",
"YPR203W",
"YPR203W",
- "1.4509147",
- "0.95955603",
+ "1.2490028",
+ "0.11237602",
"glucose",
"30.0"
],
[
"2",
- "14",
- "13.0",
+ "15",
+ "14.0",
"YBR049C",
"REB1",
- "H2O2Lo",
+ "YPD",
"YPR202W",
"YPR202W",
- "1.4509147",
- "0.95955603",
+ "1.2490028",
+ "0.11237602",
"glucose",
"30.0"
],
[
"3",
- "14",
- "13.0",
+ "15",
+ "14.0",
"YBR049C",
"REB1",
- "H2O2Lo",
+ "YPD",
"YPR201W",
"ARR3",
- "0.92586339",
- "0.45367192",
+ "1.5137073",
+ "0.1681333",
"glucose",
"30.0"
],
[
"4",
- "14",
- "13.0",
+ "15",
+ "14.0",
"YBR049C",
"REB1",
- "H2O2Lo",
+ "YPD",
"YPR200C",
"ARR2",
- "0.92586339",
- "0.45367192",
+ "1.5137073",
+ "0.1681333",
"glucose",
"30.0"
]
@@ -1237,71 +1305,71 @@
" \n",
" \n",
" | 0 | \n",
- " 14 | \n",
- " 13.0 | \n",
+ " 15 | \n",
+ " 14.0 | \n",
" YBR049C | \n",
" REB1 | \n",
- " H2O2Lo | \n",
+ " YPD | \n",
" YPR204W | \n",
" YPR204W | \n",
- " 0.784496 | \n",
- " 0.535665 | \n",
+ " 0.852889 | \n",
+ " 0.769430 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 14 | \n",
- " 13.0 | \n",
+ " 15 | \n",
+ " 14.0 | \n",
" YBR049C | \n",
" REB1 | \n",
- " H2O2Lo | \n",
+ " YPD | \n",
" YPR203W | \n",
" YPR203W | \n",
- " 1.450915 | \n",
- " 0.959556 | \n",
+ " 1.249003 | \n",
+ " 0.112376 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 14 | \n",
- " 13.0 | \n",
+ " 15 | \n",
+ " 14.0 | \n",
" YBR049C | \n",
" REB1 | \n",
- " H2O2Lo | \n",
+ " YPD | \n",
" YPR202W | \n",
" YPR202W | \n",
- " 1.450915 | \n",
- " 0.959556 | \n",
+ " 1.249003 | \n",
+ " 0.112376 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 14 | \n",
- " 13.0 | \n",
+ " 15 | \n",
+ " 14.0 | \n",
" YBR049C | \n",
" REB1 | \n",
- " H2O2Lo | \n",
+ " YPD | \n",
" YPR201W | \n",
" ARR3 | \n",
- " 0.925863 | \n",
- " 0.453672 | \n",
+ " 1.513707 | \n",
+ " 0.168133 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 14 | \n",
- " 13.0 | \n",
+ " 15 | \n",
+ " 14.0 | \n",
" YBR049C | \n",
" REB1 | \n",
- " H2O2Lo | \n",
+ " YPD | \n",
" YPR200C | \n",
" ARR2 | \n",
- " 0.925863 | \n",
- " 0.453672 | \n",
+ " 1.513707 | \n",
+ " 0.168133 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
@@ -1311,18 +1379,18 @@
],
"text/plain": [
" sample_id db_id regulator_locus_tag regulator_symbol condition \\\n",
- "0 14 13.0 YBR049C REB1 H2O2Lo \n",
- "1 14 13.0 YBR049C REB1 H2O2Lo \n",
- "2 14 13.0 YBR049C REB1 H2O2Lo \n",
- "3 14 13.0 YBR049C REB1 H2O2Lo \n",
- "4 14 13.0 YBR049C REB1 H2O2Lo \n",
+ "0 15 14.0 YBR049C REB1 YPD \n",
+ "1 15 14.0 YBR049C REB1 YPD \n",
+ "2 15 14.0 YBR049C REB1 YPD \n",
+ "3 15 14.0 YBR049C REB1 YPD \n",
+ "4 15 14.0 YBR049C REB1 YPD \n",
"\n",
" target_locus_tag target_symbol effect pvalue carbon_source \\\n",
- "0 YPR204W YPR204W 0.784496 0.535665 glucose \n",
- "1 YPR203W YPR203W 1.450915 0.959556 glucose \n",
- "2 YPR202W YPR202W 1.450915 0.959556 glucose \n",
- "3 YPR201W ARR3 0.925863 0.453672 glucose \n",
- "4 YPR200C ARR2 0.925863 0.453672 glucose \n",
+ "0 YPR204W YPR204W 0.852889 0.769430 glucose \n",
+ "1 YPR203W YPR203W 1.249003 0.112376 glucose \n",
+ "2 YPR202W YPR202W 1.249003 0.112376 glucose \n",
+ "3 YPR201W ARR3 1.513707 0.168133 glucose \n",
+ "4 YPR200C ARR2 1.513707 0.168133 glucose \n",
"\n",
" temperature_celsius \n",
"0 30.0 \n",
@@ -1332,7 +1400,7 @@
"4 30.0 "
]
},
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -1361,7 +1429,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
"id": "cell-19",
"metadata": {},
"outputs": [
@@ -1385,7 +1453,7 @@
"type": "integer"
}
],
- "ref": "9234aaf4-a313-42c2-838a-a13568eed01d",
+ "ref": "6d8b4d37-3b6b-40f1-833d-aa6711694bcb",
"rows": [
[
"0",
@@ -1399,17 +1467,17 @@
],
[
"2",
- "HSF1",
+ "STE12",
"4"
],
[
"3",
- "STE12",
+ "RTG3",
"4"
],
[
"4",
- "RTG3",
+ "DIG1",
"4"
],
[
@@ -1419,62 +1487,62 @@
],
[
"6",
- "SKN7",
+ "HSF1",
"4"
],
[
"7",
- "DIG1",
+ "SKN7",
"4"
],
[
"8",
- "GAT1",
+ "RPN4",
"3"
],
[
"9",
- "RPN4",
+ "GAT1",
"3"
],
[
"10",
- "YAP7",
+ "AFT2",
"3"
],
[
"11",
- "TEC1",
+ "YAP7",
"3"
],
[
"12",
- "AFT1",
+ "TEC1",
"3"
],
[
"13",
- "MAL33",
+ "MOT3",
"3"
],
[
"14",
- "PHO2",
+ "ROX1",
"3"
],
[
"15",
- "MBP1",
+ "GZF3",
"3"
],
[
"16",
- "KSS1",
+ "PHO2",
"3"
],
[
"17",
- "SFP1",
+ "MAL33",
"3"
],
[
@@ -1484,37 +1552,37 @@
],
[
"19",
- "YJL206C",
+ "SFP1",
"3"
],
[
"20",
- "GZF3",
+ "KSS1",
"3"
],
[
"21",
- "MOT3",
+ "YAP6",
"3"
],
[
"22",
- "FHL1",
+ "RPH1",
"3"
],
[
"23",
- "ROX1",
+ "NRG1",
"3"
],
[
"24",
- "FKH2",
+ "PHD1",
"3"
],
[
"25",
- "AFT2",
+ "FHL1",
"3"
],
[
@@ -1524,117 +1592,117 @@
],
[
"27",
- "RIM101",
+ "FKH2",
"3"
],
[
"28",
- "YAP6",
+ "MBP1",
"3"
],
[
"29",
- "RPH1",
+ "RIM101",
"3"
],
[
"30",
- "PHD1",
+ "YJL206C",
"3"
],
[
"31",
- "NRG1",
+ "AFT1",
"3"
],
[
"32",
- "MGA1",
+ "RLM1",
"2"
],
[
"33",
- "UME1",
+ "XBP1",
"2"
],
[
"34",
- "YAP3",
+ "IME4",
"2"
],
[
"35",
- "XBP1",
+ "MCM1",
"2"
],
[
"36",
- "RDS1",
+ "DAL80",
"2"
],
[
"37",
- "MSS11",
+ "YAP3",
"2"
],
[
"38",
- "HAP2",
+ "YAP5",
"2"
],
[
"39",
- "MCM1",
+ "MAC1",
"2"
],
[
"40",
- "ADR1",
+ "UME6",
"2"
],
[
"41",
- "GCN4",
+ "PDR1",
"2"
],
[
"42",
- "MIG2",
+ "UME1",
"2"
],
[
"43",
- "SOK2",
+ "CAD1",
"2"
],
[
"44",
- "RTG1",
+ "MGA1",
"2"
],
[
"45",
- "MOT2",
+ "HAP4",
"2"
],
[
"46",
- "UGA3",
+ "MIG2",
"2"
],
[
"47",
- "PUT3",
+ "GCN4",
"2"
],
[
"48",
- "YAP5",
+ "RTG1",
"2"
],
[
"49",
- "UME6",
+ "PUT3",
"2"
]
],
@@ -1679,17 +1747,17 @@
" \n",
" \n",
" | 2 | \n",
- " HSF1 | \n",
+ " STE12 | \n",
" 4 | \n",
"
\n",
" \n",
" | 3 | \n",
- " STE12 | \n",
+ " RTG3 | \n",
" 4 | \n",
"
\n",
" \n",
" | 4 | \n",
- " RTG3 | \n",
+ " DIG1 | \n",
" 4 | \n",
"
\n",
" \n",
@@ -1699,27 +1767,27 @@
"
\n",
" \n",
" | 58 | \n",
- " DAL82 | \n",
+ " IME1 | \n",
" 2 | \n",
"
\n",
" \n",
" | 59 | \n",
- " DAL80 | \n",
+ " RDS1 | \n",
" 2 | \n",
"
\n",
" \n",
" | 60 | \n",
- " HAP4 | \n",
+ " MSS11 | \n",
" 2 | \n",
"
\n",
" \n",
" | 61 | \n",
- " PDR1 | \n",
+ " HAP2 | \n",
" 2 | \n",
"
\n",
" \n",
" | 62 | \n",
- " RLM1 | \n",
+ " ARR1 | \n",
" 2 | \n",
"
\n",
" \n",
@@ -1731,20 +1799,20 @@
" regulator_symbol n\n",
"0 MSN2 6\n",
"1 MSN4 5\n",
- "2 HSF1 4\n",
- "3 STE12 4\n",
- "4 RTG3 4\n",
+ "2 STE12 4\n",
+ "3 RTG3 4\n",
+ "4 DIG1 4\n",
".. ... ..\n",
- "58 DAL82 2\n",
- "59 DAL80 2\n",
- "60 HAP4 2\n",
- "61 PDR1 2\n",
- "62 RLM1 2\n",
+ "58 IME1 2\n",
+ "59 RDS1 2\n",
+ "60 MSS11 2\n",
+ "61 HAP2 2\n",
+ "62 ARR1 2\n",
"\n",
"[63 rows x 2 columns]"
]
},
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -1785,7 +1853,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
"id": "cell-21",
"metadata": {},
"outputs": [
@@ -1838,6 +1906,11 @@
"rawType": "float64",
"type": "float"
},
+ {
+ "name": "pr_ranking_column",
+ "rawType": "object",
+ "type": "string"
+ },
{
"name": "binding_repo_dataset",
"rawType": "object",
@@ -1869,62 +1942,65 @@
"type": "string"
}
],
- "ref": "3464c093-78d3-4dde-9a28-850a7be5d032",
+ "ref": "1ce4dce9-5191-4116-b848-394fcdb3b5fc",
"rows": [
[
"0",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;85",
- "2.0",
- "2.0",
- "3.0",
- "2.0",
- "0.0002250900360144",
- "0.004",
+ "BrentLab/harbison_2004;harbison_2004;105",
+ "BrentLab/hughes_2006;overexpression;10",
+ "11.0",
+ "206.0",
+ "12.0",
+ "206.0",
+ "0.041292917490562644",
+ "0.017",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "105",
"harbison",
- "85",
- "BrentLab/Hackett_2020;hackett_2020"
+ "10",
+ "BrentLab/hughes_2006;overexpression"
],
[
"1",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;83",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;108",
+ "BrentLab/hughes_2006;overexpression;11",
+ "60.0",
+ "67.0",
+ "60.0",
+ "67.0",
+ "0.05428351009647073",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "108",
"harbison",
- "83",
- "BrentLab/Hackett_2020;hackett_2020"
+ "11",
+ "BrentLab/hughes_2006;overexpression"
],
[
"2",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;84",
- "2.0",
- "1.0",
- "3.0",
- "1.0",
- "0.0",
- "0.011",
+ "BrentLab/harbison_2004;harbison_2004;109",
+ "BrentLab/hughes_2006;overexpression;11",
+ "27.0",
+ "1265.0",
+ "27.0",
+ "1265.0",
+ "0.12321364371741866",
+ "0.057",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "109",
"harbison",
- "84",
- "BrentLab/Hackett_2020;hackett_2020"
+ "11",
+ "BrentLab/hughes_2006;overexpression"
]
],
"shape": {
- "columns": 14,
+ "columns": 15,
"rows": 3
}
},
@@ -1955,6 +2031,7 @@
" perturbation_set_size | \n",
" dto_fdr | \n",
" dto_empirical_pvalue | \n",
+ " pr_ranking_column | \n",
" binding_repo_dataset | \n",
" perturbation_repo_dataset | \n",
" binding_id_id | \n",
@@ -1966,92 +2043,95 @@
" \n",
" \n",
" | 0 | \n",
- " BrentLab/harbison_2004;harbison_2004;3 | \n",
- " BrentLab/Hackett_2020;hackett_2020;85 | \n",
- " 2.0 | \n",
- " 2.0 | \n",
- " 3.0 | \n",
- " 2.0 | \n",
- " 0.000225 | \n",
- " 0.004 | \n",
+ " BrentLab/harbison_2004;harbison_2004;105 | \n",
+ " BrentLab/hughes_2006;overexpression;10 | \n",
+ " 11.0 | \n",
+ " 206.0 | \n",
+ " 12.0 | \n",
+ " 206.0 | \n",
+ " 0.041293 | \n",
+ " 0.017 | \n",
+ " log2fc | \n",
" harbison_2004-harbison_2004 | \n",
- " Hackett_2020-hackett_2020 | \n",
- " 3 | \n",
+ " hughes_2006-overexpression | \n",
+ " 105 | \n",
" harbison | \n",
- " 85 | \n",
- " BrentLab/Hackett_2020;hackett_2020 | \n",
+ " 10 | \n",
+ " BrentLab/hughes_2006;overexpression | \n",
"
\n",
" \n",
" | 1 | \n",
- " BrentLab/harbison_2004;harbison_2004;3 | \n",
- " BrentLab/Hackett_2020;hackett_2020;83 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " BrentLab/harbison_2004;harbison_2004;108 | \n",
+ " BrentLab/hughes_2006;overexpression;11 | \n",
+ " 60.0 | \n",
+ " 67.0 | \n",
+ " 60.0 | \n",
+ " 67.0 | \n",
+ " 0.054284 | \n",
+ " 0.000 | \n",
+ " log2fc | \n",
" harbison_2004-harbison_2004 | \n",
- " Hackett_2020-hackett_2020 | \n",
- " 3 | \n",
+ " hughes_2006-overexpression | \n",
+ " 108 | \n",
" harbison | \n",
- " 83 | \n",
- " BrentLab/Hackett_2020;hackett_2020 | \n",
+ " 11 | \n",
+ " BrentLab/hughes_2006;overexpression | \n",
"
\n",
" \n",
" | 2 | \n",
- " BrentLab/harbison_2004;harbison_2004;3 | \n",
- " BrentLab/Hackett_2020;hackett_2020;84 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 3.0 | \n",
- " 1.0 | \n",
- " 0.000000 | \n",
- " 0.011 | \n",
+ " BrentLab/harbison_2004;harbison_2004;109 | \n",
+ " BrentLab/hughes_2006;overexpression;11 | \n",
+ " 27.0 | \n",
+ " 1265.0 | \n",
+ " 27.0 | \n",
+ " 1265.0 | \n",
+ " 0.123214 | \n",
+ " 0.057 | \n",
+ " log2fc | \n",
" harbison_2004-harbison_2004 | \n",
- " Hackett_2020-hackett_2020 | \n",
- " 3 | \n",
+ " hughes_2006-overexpression | \n",
+ " 109 | \n",
" harbison | \n",
- " 84 | \n",
- " BrentLab/Hackett_2020;hackett_2020 | \n",
+ " 11 | \n",
+ " BrentLab/hughes_2006;overexpression | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " binding_id \\\n",
- "0 BrentLab/harbison_2004;harbison_2004;3 \n",
- "1 BrentLab/harbison_2004;harbison_2004;3 \n",
- "2 BrentLab/harbison_2004;harbison_2004;3 \n",
+ " binding_id \\\n",
+ "0 BrentLab/harbison_2004;harbison_2004;105 \n",
+ "1 BrentLab/harbison_2004;harbison_2004;108 \n",
+ "2 BrentLab/harbison_2004;harbison_2004;109 \n",
"\n",
- " perturbation_id binding_rank_threshold \\\n",
- "0 BrentLab/Hackett_2020;hackett_2020;85 2.0 \n",
- "1 BrentLab/Hackett_2020;hackett_2020;83 NaN \n",
- "2 BrentLab/Hackett_2020;hackett_2020;84 2.0 \n",
+ " perturbation_id binding_rank_threshold \\\n",
+ "0 BrentLab/hughes_2006;overexpression;10 11.0 \n",
+ "1 BrentLab/hughes_2006;overexpression;11 60.0 \n",
+ "2 BrentLab/hughes_2006;overexpression;11 27.0 \n",
"\n",
" perturbation_rank_threshold binding_set_size perturbation_set_size \\\n",
- "0 2.0 3.0 2.0 \n",
- "1 NaN NaN NaN \n",
- "2 1.0 3.0 1.0 \n",
+ "0 206.0 12.0 206.0 \n",
+ "1 67.0 60.0 67.0 \n",
+ "2 1265.0 27.0 1265.0 \n",
"\n",
- " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n",
- "0 0.000225 0.004 harbison_2004-harbison_2004 \n",
- "1 NaN NaN harbison_2004-harbison_2004 \n",
- "2 0.000000 0.011 harbison_2004-harbison_2004 \n",
+ " dto_fdr dto_empirical_pvalue pr_ranking_column \\\n",
+ "0 0.041293 0.017 log2fc \n",
+ "1 0.054284 0.000 log2fc \n",
+ "2 0.123214 0.057 log2fc \n",
"\n",
- " perturbation_repo_dataset binding_id_id binding_id_source \\\n",
- "0 Hackett_2020-hackett_2020 3 harbison \n",
- "1 Hackett_2020-hackett_2020 3 harbison \n",
- "2 Hackett_2020-hackett_2020 3 harbison \n",
+ " binding_repo_dataset perturbation_repo_dataset binding_id_id \\\n",
+ "0 harbison_2004-harbison_2004 hughes_2006-overexpression 105 \n",
+ "1 harbison_2004-harbison_2004 hughes_2006-overexpression 108 \n",
+ "2 harbison_2004-harbison_2004 hughes_2006-overexpression 109 \n",
"\n",
- " perturbation_id_id perturbation_id_source \n",
- "0 85 BrentLab/Hackett_2020;hackett_2020 \n",
- "1 83 BrentLab/Hackett_2020;hackett_2020 \n",
- "2 84 BrentLab/Hackett_2020;hackett_2020 "
+ " binding_id_source perturbation_id_id perturbation_id_source \n",
+ "0 harbison 10 BrentLab/hughes_2006;overexpression \n",
+ "1 harbison 11 BrentLab/hughes_2006;overexpression \n",
+ "2 harbison 11 BrentLab/hughes_2006;overexpression "
]
},
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -2063,7 +2143,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"id": "cell-22",
"metadata": {},
"outputs": [
@@ -2082,17 +2162,17 @@
"type": "integer"
},
{
- "name": "regulator_locus_tag",
+ "name": "condition",
"rawType": "object",
"type": "string"
},
{
- "name": "regulator_symbol",
+ "name": "regulator_locus_tag",
"rawType": "object",
"type": "string"
},
{
- "name": "condition",
+ "name": "regulator_symbol",
"rawType": "object",
"type": "string"
},
@@ -2117,117 +2197,117 @@
"type": "float"
}
],
- "ref": "58c1f0ca-b0a7-4ce7-b29f-f4e789b74707",
+ "ref": "8e604dcf-efad-42a8-a049-7bf684faa9b6",
"rows": [
[
"0",
- "50",
- "YDR043C",
- "NRG1",
- "H2O2Lo",
+ "18",
+ "YPD",
+ "YBR083W",
+ "TEC1",
"glucose",
"30.0",
"0.0",
- "0.081863152643831"
+ "0.08188235294117648"
],
[
"1",
- "213",
- "YKL222C",
- "YKL222C",
- "YPD",
+ "157",
+ "H2O2Hi",
+ "YHR206W",
+ "SKN7",
"glucose",
"30.0",
"0.0",
- "0.0"
+ "0.13931986462735127"
],
[
"2",
- "18",
- "YBR083W",
- "TEC1",
+ "93",
"YPD",
+ "YER111C",
+ "SWI4",
"glucose",
"30.0",
"0.0",
- "0.0620669105826265"
+ "0.17005078106191404"
],
[
"3",
- "7",
- "YBL103C",
- "RTG3",
- "H2O2Hi",
+ "72",
+ "YPD",
+ "YDR421W",
+ "ARO80",
"glucose",
"30.0",
"0.0",
- "0.1577232390460343"
+ "0.00011392635800218739"
],
[
"4",
- "277",
- "YNL103W",
- "MET4",
- "YPD",
- "glucose",
+ "71",
+ "SM",
+ "YDR421W",
+ "ARO80",
+ "unspecified",
"30.0",
"0.0",
- "0.016281512605042"
+ "0.00011392635800218739"
],
[
"5",
- "281",
- "YNL199C",
- "GCR2",
- "SM",
- "unspecified",
+ "346",
+ "RAPA",
+ "YPR104C",
+ "FHL1",
+ "glucose",
"30.0",
"0.0",
- "0.0296346442259623"
+ "0.019746237283784218"
],
[
"6",
- "86",
- "YER040W",
- "GLN3",
- "SM",
- "unspecified",
+ "226",
+ "YPD",
+ "YLR182W",
+ "SWI6",
+ "glucose",
"30.0",
"0.0",
- "0.2298889521004841"
+ "0.07368989186287292"
],
[
"7",
- "225",
- "YLR176C",
- "RFX1",
+ "286",
"YPD",
+ "YNL309W",
+ "STB1",
"glucose",
"30.0",
"0.0",
- "0.0144559001906082"
+ "0.1821470588235294"
],
[
"8",
- "86",
- "YER040W",
- "GLN3",
+ "172",
"SM",
+ "YIR023W",
+ "DAL81",
"unspecified",
"30.0",
"0.0",
- "0.0961169019780866"
+ "0.21656240134694307"
],
[
"9",
- "225",
- "YLR176C",
- "RFX1",
+ "320",
"YPD",
+ "YPL038W",
+ "MET31",
"glucose",
"30.0",
"0.0",
- "0.0335260614428719"
+ "0.0661219662690251"
]
],
"shape": {
@@ -2255,9 +2335,9 @@
" \n",
" | \n",
" sample_id | \n",
+ " condition | \n",
" regulator_locus_tag | \n",
" regulator_symbol | \n",
- " condition | \n",
" carbon_source | \n",
" temperature_celsius | \n",
" dto_empirical_pvalue | \n",
@@ -2267,145 +2347,145 @@
"
\n",
" \n",
" | 0 | \n",
- " 50 | \n",
- " YDR043C | \n",
- " NRG1 | \n",
- " H2O2Lo | \n",
+ " 18 | \n",
+ " YPD | \n",
+ " YBR083W | \n",
+ " TEC1 | \n",
" glucose | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.081863 | \n",
+ " 0.081882 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 213 | \n",
- " YKL222C | \n",
- " YKL222C | \n",
- " YPD | \n",
+ " 157 | \n",
+ " H2O2Hi | \n",
+ " YHR206W | \n",
+ " SKN7 | \n",
" glucose | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.000000 | \n",
+ " 0.139320 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 18 | \n",
- " YBR083W | \n",
- " TEC1 | \n",
+ " 93 | \n",
" YPD | \n",
+ " YER111C | \n",
+ " SWI4 | \n",
" glucose | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.062067 | \n",
+ " 0.170051 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 7 | \n",
- " YBL103C | \n",
- " RTG3 | \n",
- " H2O2Hi | \n",
+ " 72 | \n",
+ " YPD | \n",
+ " YDR421W | \n",
+ " ARO80 | \n",
" glucose | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.157723 | \n",
+ " 0.000114 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 277 | \n",
- " YNL103W | \n",
- " MET4 | \n",
- " YPD | \n",
- " glucose | \n",
+ " 71 | \n",
+ " SM | \n",
+ " YDR421W | \n",
+ " ARO80 | \n",
+ " unspecified | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.016282 | \n",
+ " 0.000114 | \n",
"
\n",
" \n",
" | 5 | \n",
- " 281 | \n",
- " YNL199C | \n",
- " GCR2 | \n",
- " SM | \n",
- " unspecified | \n",
+ " 346 | \n",
+ " RAPA | \n",
+ " YPR104C | \n",
+ " FHL1 | \n",
+ " glucose | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.029635 | \n",
+ " 0.019746 | \n",
"
\n",
" \n",
" | 6 | \n",
- " 86 | \n",
- " YER040W | \n",
- " GLN3 | \n",
- " SM | \n",
- " unspecified | \n",
+ " 226 | \n",
+ " YPD | \n",
+ " YLR182W | \n",
+ " SWI6 | \n",
+ " glucose | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.229889 | \n",
+ " 0.073690 | \n",
"
\n",
" \n",
" | 7 | \n",
- " 225 | \n",
- " YLR176C | \n",
- " RFX1 | \n",
+ " 286 | \n",
" YPD | \n",
+ " YNL309W | \n",
+ " STB1 | \n",
" glucose | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.014456 | \n",
+ " 0.182147 | \n",
"
\n",
" \n",
" | 8 | \n",
- " 86 | \n",
- " YER040W | \n",
- " GLN3 | \n",
+ " 172 | \n",
" SM | \n",
+ " YIR023W | \n",
+ " DAL81 | \n",
" unspecified | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.096117 | \n",
+ " 0.216562 | \n",
"
\n",
" \n",
" | 9 | \n",
- " 225 | \n",
- " YLR176C | \n",
- " RFX1 | \n",
+ " 320 | \n",
" YPD | \n",
+ " YPL038W | \n",
+ " MET31 | \n",
" glucose | \n",
" 30.0 | \n",
" 0.0 | \n",
- " 0.033526 | \n",
+ " 0.066122 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n",
- "0 50 YDR043C NRG1 H2O2Lo glucose \n",
- "1 213 YKL222C YKL222C YPD glucose \n",
- "2 18 YBR083W TEC1 YPD glucose \n",
- "3 7 YBL103C RTG3 H2O2Hi glucose \n",
- "4 277 YNL103W MET4 YPD glucose \n",
- "5 281 YNL199C GCR2 SM unspecified \n",
- "6 86 YER040W GLN3 SM unspecified \n",
- "7 225 YLR176C RFX1 YPD glucose \n",
- "8 86 YER040W GLN3 SM unspecified \n",
- "9 225 YLR176C RFX1 YPD glucose \n",
+ " sample_id condition regulator_locus_tag regulator_symbol carbon_source \\\n",
+ "0 18 YPD YBR083W TEC1 glucose \n",
+ "1 157 H2O2Hi YHR206W SKN7 glucose \n",
+ "2 93 YPD YER111C SWI4 glucose \n",
+ "3 72 YPD YDR421W ARO80 glucose \n",
+ "4 71 SM YDR421W ARO80 unspecified \n",
+ "5 346 RAPA YPR104C FHL1 glucose \n",
+ "6 226 YPD YLR182W SWI6 glucose \n",
+ "7 286 YPD YNL309W STB1 glucose \n",
+ "8 172 SM YIR023W DAL81 unspecified \n",
+ "9 320 YPD YPL038W MET31 glucose \n",
"\n",
" temperature_celsius dto_empirical_pvalue dto_fdr \n",
- "0 30.0 0.0 0.081863 \n",
- "1 30.0 0.0 0.000000 \n",
- "2 30.0 0.0 0.062067 \n",
- "3 30.0 0.0 0.157723 \n",
- "4 30.0 0.0 0.016282 \n",
- "5 30.0 0.0 0.029635 \n",
- "6 30.0 0.0 0.229889 \n",
- "7 30.0 0.0 0.014456 \n",
- "8 30.0 0.0 0.096117 \n",
- "9 30.0 0.0 0.033526 "
+ "0 30.0 0.0 0.081882 \n",
+ "1 30.0 0.0 0.139320 \n",
+ "2 30.0 0.0 0.170051 \n",
+ "3 30.0 0.0 0.000114 \n",
+ "4 30.0 0.0 0.000114 \n",
+ "5 30.0 0.0 0.019746 \n",
+ "6 30.0 0.0 0.073690 \n",
+ "7 30.0 0.0 0.182147 \n",
+ "8 30.0 0.0 0.216562 \n",
+ "9 30.0 0.0 0.066122 "
]
},
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -2426,7 +2506,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 13,
"id": "cell-23",
"metadata": {},
"outputs": [
@@ -2460,77 +2540,77 @@
"type": "string"
}
],
- "ref": "b916ca80-75d1-448d-82a5-c82086ca1ed9",
+ "ref": "18363370-be4d-4693-8836-96409a2ac869",
"rows": [
[
"0",
- "289",
- "DAL82",
+ "15",
+ "REB1",
"0.0",
- "1208"
+ "100_242"
],
[
"1",
- "251",
- "MAC1",
+ "303",
+ "CIN5",
"0.0",
- "1103"
+ "1280"
],
[
"2",
- "321",
- "DIG1",
+ "330",
+ "CUP9",
"0.0",
- "1372"
+ "256"
],
[
"3",
- "238",
- "YAP1",
+ "114",
+ "AFT1",
"0.0",
- "996"
+ "87"
],
[
"4",
- "303",
- "CIN5",
+ "9",
+ "RTG3",
"0.0",
- "1365"
+ "57"
],
[
"5",
- "245",
- "ARG81",
+ "118",
+ "HSF1",
"0.0",
- "1023"
+ "88"
],
[
"6",
- "184",
- "CBF1",
+ "15",
+ "REB1",
"0.0",
- "754"
+ "100_242"
],
[
"7",
- "252",
- "MAC1",
+ "162",
+ "XBP1",
"0.0",
- "1103"
+ "24"
],
[
"8",
- "200",
- "PHD1",
+ "240",
+ "YAP1",
"0.0",
- "890"
+ "182"
],
[
"9",
- "251",
- "MAC1",
+ "150",
+ "STP2",
"0.0",
- "1110"
+ "604"
]
],
"shape": {
@@ -2566,73 +2646,73 @@
" \n",
" \n",
" | 0 | \n",
- " 289 | \n",
- " DAL82 | \n",
+ " 15 | \n",
+ " REB1 | \n",
" 0.0 | \n",
- " 1208 | \n",
+ " 100_242 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 251 | \n",
- " MAC1 | \n",
+ " 303 | \n",
+ " CIN5 | \n",
" 0.0 | \n",
- " 1103 | \n",
+ " 1280 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 321 | \n",
- " DIG1 | \n",
+ " 330 | \n",
+ " CUP9 | \n",
" 0.0 | \n",
- " 1372 | \n",
+ " 256 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 238 | \n",
- " YAP1 | \n",
+ " 114 | \n",
+ " AFT1 | \n",
" 0.0 | \n",
- " 996 | \n",
+ " 87 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 303 | \n",
- " CIN5 | \n",
+ " 9 | \n",
+ " RTG3 | \n",
" 0.0 | \n",
- " 1365 | \n",
+ " 57 | \n",
"
\n",
" \n",
" | 5 | \n",
- " 245 | \n",
- " ARG81 | \n",
+ " 118 | \n",
+ " HSF1 | \n",
" 0.0 | \n",
- " 1023 | \n",
+ " 88 | \n",
"
\n",
" \n",
" | 6 | \n",
- " 184 | \n",
- " CBF1 | \n",
+ " 15 | \n",
+ " REB1 | \n",
" 0.0 | \n",
- " 754 | \n",
+ " 100_242 | \n",
"
\n",
" \n",
" | 7 | \n",
- " 252 | \n",
- " MAC1 | \n",
+ " 162 | \n",
+ " XBP1 | \n",
" 0.0 | \n",
- " 1103 | \n",
+ " 24 | \n",
"
\n",
" \n",
" | 8 | \n",
- " 200 | \n",
- " PHD1 | \n",
+ " 240 | \n",
+ " YAP1 | \n",
" 0.0 | \n",
- " 890 | \n",
+ " 182 | \n",
"
\n",
" \n",
" | 9 | \n",
- " 251 | \n",
- " MAC1 | \n",
+ " 150 | \n",
+ " STP2 | \n",
" 0.0 | \n",
- " 1110 | \n",
+ " 604 | \n",
"
\n",
" \n",
"\n",
@@ -2640,19 +2720,19 @@
],
"text/plain": [
" harbison_sample_id regulator_symbol dto_empirical_pvalue hackett_sample_id\n",
- "0 289 DAL82 0.0 1208\n",
- "1 251 MAC1 0.0 1103\n",
- "2 321 DIG1 0.0 1372\n",
- "3 238 YAP1 0.0 996\n",
- "4 303 CIN5 0.0 1365\n",
- "5 245 ARG81 0.0 1023\n",
- "6 184 CBF1 0.0 754\n",
- "7 252 MAC1 0.0 1103\n",
- "8 200 PHD1 0.0 890\n",
- "9 251 MAC1 0.0 1110"
+ "0 15 REB1 0.0 100_242\n",
+ "1 303 CIN5 0.0 1280\n",
+ "2 330 CUP9 0.0 256\n",
+ "3 114 AFT1 0.0 87\n",
+ "4 9 RTG3 0.0 57\n",
+ "5 118 HSF1 0.0 88\n",
+ "6 15 REB1 0.0 100_242\n",
+ "7 162 XBP1 0.0 24\n",
+ "8 240 YAP1 0.0 182\n",
+ "9 150 STP2 0.0 604"
]
},
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -2690,7 +2770,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 14,
"id": "f03e942a",
"metadata": {},
"outputs": [
@@ -2729,12 +2809,12 @@
"type": "integer"
}
],
- "ref": "1185b490-3375-41d0-b61c-0f35dae2b815",
+ "ref": "0736a331-fb06-4ba3-abe4-dff7ac0e65a3",
"rows": [
[
"0",
"SWI1",
- "15.0",
+ "5.0",
"ZEV",
"P",
"3"
@@ -2750,7 +2830,7 @@
[
"2",
"SWI1",
- "45.0",
+ "20.0",
"ZEV",
"P",
"3"
@@ -2758,7 +2838,7 @@
[
"3",
"SWI1",
- "5.0",
+ "10.0",
"ZEV",
"P",
"3"
@@ -2766,7 +2846,7 @@
[
"4",
"SWI1",
- "0.0",
+ "90.0",
"ZEV",
"P",
"3"
@@ -2774,7 +2854,7 @@
[
"5",
"SWI1",
- "90.0",
+ "0.0",
"ZEV",
"P",
"3"
@@ -2782,7 +2862,7 @@
[
"6",
"SWI1",
- "10.0",
+ "15.0",
"ZEV",
"P",
"3"
@@ -2790,31 +2870,31 @@
[
"7",
"SWI1",
- "20.0",
+ "45.0",
"ZEV",
"P",
"3"
],
[
"8",
- "GCN4",
- "0.0",
+ "RDS2",
+ "10.0",
"ZEV",
"P",
"2"
],
[
"9",
- "GCN4",
- "30.0",
- "ZEV",
+ "MAC1",
+ "90.0",
+ "GEV",
"P",
"2"
],
[
"10",
"MAC1",
- "0.0",
+ "15.0",
"GEV",
"P",
"2"
@@ -2822,22 +2902,22 @@
[
"11",
"RDS2",
- "5.0",
+ "20.0",
"ZEV",
"P",
"2"
],
[
"12",
- "RDS2",
+ "MAC1",
"45.0",
- "ZEV",
+ "GEV",
"P",
"2"
],
[
"13",
- "Z3EV",
+ "RDS2",
"30.0",
"ZEV",
"P",
@@ -2846,46 +2926,46 @@
[
"14",
"GCN4",
- "90.0",
+ "15.0",
"ZEV",
"P",
"2"
],
[
"15",
- "Z3EV",
- "15.0",
- "ZEV",
+ "MAC1",
+ "30.0",
+ "GEV",
"P",
"2"
],
[
"16",
- "GCN4",
- "45.0",
- "ZEV",
+ "MAC1",
+ "5.0",
+ "GEV",
"P",
"2"
],
[
"17",
- "MAC1",
- "5.0",
- "GEV",
+ "GCN4",
+ "45.0",
+ "ZEV",
"P",
"2"
],
[
"18",
- "MAC1",
+ "GCN4",
"90.0",
- "GEV",
+ "ZEV",
"P",
"2"
],
[
"19",
- "Z3EV",
+ "RDS2",
"45.0",
"ZEV",
"P",
@@ -2894,38 +2974,38 @@
[
"20",
"RDS2",
- "10.0",
+ "0.0",
"ZEV",
"P",
"2"
],
[
"21",
- "GCN4",
- "15.0",
+ "RDS2",
+ "90.0",
"ZEV",
"P",
"2"
],
[
"22",
- "RDS2",
- "90.0",
+ "GCN4",
+ "30.0",
"ZEV",
"P",
"2"
],
[
"23",
- "RDS2",
+ "MAC1",
"0.0",
- "ZEV",
+ "GEV",
"P",
"2"
],
[
"24",
- "Z3EV",
+ "RDS2",
"5.0",
"ZEV",
"P",
@@ -2933,88 +3013,24 @@
],
[
"25",
- "Z3EV",
- "90.0",
- "ZEV",
- "P",
- "2"
- ],
- [
- "26",
- "Z3EV",
- "20.0",
- "ZEV",
- "P",
- "2"
- ],
- [
- "27",
- "RDS2",
- "30.0",
- "ZEV",
- "P",
- "2"
- ],
- [
- "28",
- "Z3EV",
+ "GCN4",
"0.0",
"ZEV",
"P",
"2"
],
[
- "29",
+ "26",
"RDS2",
"15.0",
"ZEV",
"P",
"2"
- ],
- [
- "30",
- "Z3EV",
- "10.0",
- "ZEV",
- "P",
- "2"
- ],
- [
- "31",
- "RDS2",
- "20.0",
- "ZEV",
- "P",
- "2"
- ],
- [
- "32",
- "MAC1",
- "45.0",
- "GEV",
- "P",
- "2"
- ],
- [
- "33",
- "MAC1",
- "15.0",
- "GEV",
- "P",
- "2"
- ],
- [
- "34",
- "MAC1",
- "30.0",
- "GEV",
- "P",
- "2"
]
],
"shape": {
"columns": 5,
- "rows": 35
+ "rows": 27
}
},
"text/html": [
@@ -3047,7 +3063,7 @@
" \n",
" | 0 | \n",
" SWI1 | \n",
- " 15.0 | \n",
+ " 5.0 | \n",
" ZEV | \n",
" P | \n",
" 3 | \n",
@@ -3063,7 +3079,7 @@
"
\n",
" | 2 | \n",
" SWI1 | \n",
- " 45.0 | \n",
+ " 20.0 | \n",
" ZEV | \n",
" P | \n",
" 3 | \n",
@@ -3071,7 +3087,7 @@
"
\n",
" | 3 | \n",
" SWI1 | \n",
- " 5.0 | \n",
+ " 10.0 | \n",
" ZEV | \n",
" P | \n",
" 3 | \n",
@@ -3079,7 +3095,7 @@
"
\n",
" | 4 | \n",
" SWI1 | \n",
- " 0.0 | \n",
+ " 90.0 | \n",
" ZEV | \n",
" P | \n",
" 3 | \n",
@@ -3087,7 +3103,7 @@
"
\n",
" | 5 | \n",
" SWI1 | \n",
- " 90.0 | \n",
+ " 0.0 | \n",
" ZEV | \n",
" P | \n",
" 3 | \n",
@@ -3095,7 +3111,7 @@
"
\n",
" | 6 | \n",
" SWI1 | \n",
- " 10.0 | \n",
+ " 15.0 | \n",
" ZEV | \n",
" P | \n",
" 3 | \n",
@@ -3103,31 +3119,31 @@
"
\n",
" | 7 | \n",
" SWI1 | \n",
- " 20.0 | \n",
+ " 45.0 | \n",
" ZEV | \n",
" P | \n",
" 3 | \n",
"
\n",
" \n",
" | 8 | \n",
- " GCN4 | \n",
- " 0.0 | \n",
+ " RDS2 | \n",
+ " 10.0 | \n",
" ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 9 | \n",
- " GCN4 | \n",
- " 30.0 | \n",
- " ZEV | \n",
+ " MAC1 | \n",
+ " 90.0 | \n",
+ " GEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 10 | \n",
" MAC1 | \n",
- " 0.0 | \n",
+ " 15.0 | \n",
" GEV | \n",
" P | \n",
" 2 | \n",
@@ -3135,22 +3151,22 @@
"
\n",
" | 11 | \n",
" RDS2 | \n",
- " 5.0 | \n",
+ " 20.0 | \n",
" ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 12 | \n",
- " RDS2 | \n",
+ " MAC1 | \n",
" 45.0 | \n",
- " ZEV | \n",
+ " GEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 13 | \n",
- " Z3EV | \n",
+ " RDS2 | \n",
" 30.0 | \n",
" ZEV | \n",
" P | \n",
@@ -3159,46 +3175,46 @@
"
\n",
" | 14 | \n",
" GCN4 | \n",
- " 90.0 | \n",
+ " 15.0 | \n",
" ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 15 | \n",
- " Z3EV | \n",
- " 15.0 | \n",
- " ZEV | \n",
+ " MAC1 | \n",
+ " 30.0 | \n",
+ " GEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 16 | \n",
- " GCN4 | \n",
- " 45.0 | \n",
- " ZEV | \n",
+ " MAC1 | \n",
+ " 5.0 | \n",
+ " GEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 17 | \n",
- " MAC1 | \n",
- " 5.0 | \n",
- " GEV | \n",
+ " GCN4 | \n",
+ " 45.0 | \n",
+ " ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 18 | \n",
- " MAC1 | \n",
+ " GCN4 | \n",
" 90.0 | \n",
- " GEV | \n",
+ " ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 19 | \n",
- " Z3EV | \n",
+ " RDS2 | \n",
" 45.0 | \n",
" ZEV | \n",
" P | \n",
@@ -3207,38 +3223,38 @@
"
\n",
" | 20 | \n",
" RDS2 | \n",
- " 10.0 | \n",
+ " 0.0 | \n",
" ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 21 | \n",
- " GCN4 | \n",
- " 15.0 | \n",
+ " RDS2 | \n",
+ " 90.0 | \n",
" ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 22 | \n",
- " RDS2 | \n",
- " 90.0 | \n",
+ " GCN4 | \n",
+ " 30.0 | \n",
" ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 23 | \n",
- " RDS2 | \n",
+ " MAC1 | \n",
" 0.0 | \n",
- " ZEV | \n",
+ " GEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
" | 24 | \n",
- " Z3EV | \n",
+ " RDS2 | \n",
" 5.0 | \n",
" ZEV | \n",
" P | \n",
@@ -3246,128 +3262,56 @@
"
\n",
" \n",
" | 25 | \n",
- " Z3EV | \n",
- " 90.0 | \n",
- " ZEV | \n",
- " P | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 26 | \n",
- " Z3EV | \n",
- " 20.0 | \n",
- " ZEV | \n",
- " P | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 27 | \n",
- " RDS2 | \n",
- " 30.0 | \n",
- " ZEV | \n",
- " P | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 28 | \n",
- " Z3EV | \n",
+ " GCN4 | \n",
" 0.0 | \n",
" ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
" \n",
- " | 29 | \n",
+ " 26 | \n",
" RDS2 | \n",
" 15.0 | \n",
" ZEV | \n",
" P | \n",
" 2 | \n",
"
\n",
- " \n",
- " | 30 | \n",
- " Z3EV | \n",
- " 10.0 | \n",
- " ZEV | \n",
- " P | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 31 | \n",
- " RDS2 | \n",
- " 20.0 | \n",
- " ZEV | \n",
- " P | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 32 | \n",
- " MAC1 | \n",
- " 45.0 | \n",
- " GEV | \n",
- " P | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 33 | \n",
- " MAC1 | \n",
- " 15.0 | \n",
- " GEV | \n",
- " P | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 34 | \n",
- " MAC1 | \n",
- " 30.0 | \n",
- " GEV | \n",
- " P | \n",
- " 2 | \n",
- "
\n",
" \n",
"\n",
""
],
"text/plain": [
" regulator_symbol time mechanism restriction n\n",
- "0 SWI1 15.0 ZEV P 3\n",
+ "0 SWI1 5.0 ZEV P 3\n",
"1 SWI1 30.0 ZEV P 3\n",
- "2 SWI1 45.0 ZEV P 3\n",
- "3 SWI1 5.0 ZEV P 3\n",
- "4 SWI1 0.0 ZEV P 3\n",
- "5 SWI1 90.0 ZEV P 3\n",
- "6 SWI1 10.0 ZEV P 3\n",
- "7 SWI1 20.0 ZEV P 3\n",
- "8 GCN4 0.0 ZEV P 2\n",
- "9 GCN4 30.0 ZEV P 2\n",
- "10 MAC1 0.0 GEV P 2\n",
- "11 RDS2 5.0 ZEV P 2\n",
- "12 RDS2 45.0 ZEV P 2\n",
- "13 Z3EV 30.0 ZEV P 2\n",
- "14 GCN4 90.0 ZEV P 2\n",
- "15 Z3EV 15.0 ZEV P 2\n",
- "16 GCN4 45.0 ZEV P 2\n",
- "17 MAC1 5.0 GEV P 2\n",
- "18 MAC1 90.0 GEV P 2\n",
- "19 Z3EV 45.0 ZEV P 2\n",
- "20 RDS2 10.0 ZEV P 2\n",
- "21 GCN4 15.0 ZEV P 2\n",
- "22 RDS2 90.0 ZEV P 2\n",
- "23 RDS2 0.0 ZEV P 2\n",
- "24 Z3EV 5.0 ZEV P 2\n",
- "25 Z3EV 90.0 ZEV P 2\n",
- "26 Z3EV 20.0 ZEV P 2\n",
- "27 RDS2 30.0 ZEV P 2\n",
- "28 Z3EV 0.0 ZEV P 2\n",
- "29 RDS2 15.0 ZEV P 2\n",
- "30 Z3EV 10.0 ZEV P 2\n",
- "31 RDS2 20.0 ZEV P 2\n",
- "32 MAC1 45.0 GEV P 2\n",
- "33 MAC1 15.0 GEV P 2\n",
- "34 MAC1 30.0 GEV P 2"
+ "2 SWI1 20.0 ZEV P 3\n",
+ "3 SWI1 10.0 ZEV P 3\n",
+ "4 SWI1 90.0 ZEV P 3\n",
+ "5 SWI1 0.0 ZEV P 3\n",
+ "6 SWI1 15.0 ZEV P 3\n",
+ "7 SWI1 45.0 ZEV P 3\n",
+ "8 RDS2 10.0 ZEV P 2\n",
+ "9 MAC1 90.0 GEV P 2\n",
+ "10 MAC1 15.0 GEV P 2\n",
+ "11 RDS2 20.0 ZEV P 2\n",
+ "12 MAC1 45.0 GEV P 2\n",
+ "13 RDS2 30.0 ZEV P 2\n",
+ "14 GCN4 15.0 ZEV P 2\n",
+ "15 MAC1 30.0 GEV P 2\n",
+ "16 MAC1 5.0 GEV P 2\n",
+ "17 GCN4 45.0 ZEV P 2\n",
+ "18 GCN4 90.0 ZEV P 2\n",
+ "19 RDS2 45.0 ZEV P 2\n",
+ "20 RDS2 0.0 ZEV P 2\n",
+ "21 RDS2 90.0 ZEV P 2\n",
+ "22 GCN4 30.0 ZEV P 2\n",
+ "23 MAC1 0.0 GEV P 2\n",
+ "24 RDS2 5.0 ZEV P 2\n",
+ "25 GCN4 0.0 ZEV P 2\n",
+ "26 RDS2 15.0 ZEV P 2"
]
},
- "execution_count": 13,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -3386,7 +3330,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 15,
"id": "4d869036",
"metadata": {},
"outputs": [
@@ -3405,32 +3349,27 @@
"type": "integer"
},
{
- "name": "regulator_locus_tag",
+ "name": "date",
"rawType": "object",
"type": "string"
},
{
- "name": "regulator_symbol",
+ "name": "mechanism",
"rawType": "object",
"type": "string"
},
{
- "name": "time",
- "rawType": "float64",
- "type": "float"
- },
- {
- "name": "mechanism",
+ "name": "regulator_locus_tag",
"rawType": "object",
"type": "string"
},
{
- "name": "restriction",
+ "name": "regulator_symbol",
"rawType": "object",
"type": "string"
},
{
- "name": "date",
+ "name": "restriction",
"rawType": "object",
"type": "string"
},
@@ -3439,6 +3378,11 @@
"rawType": "object",
"type": "string"
},
+ {
+ "name": "time",
+ "rawType": "float64",
+ "type": "float"
+ },
{
"name": "carbon_source",
"rawType": "object",
@@ -3450,44 +3394,44 @@
"type": "float"
}
],
- "ref": "440ab0a2-f84a-4505-8380-e218512394f7",
+ "ref": "0f36c45d-0bab-4761-98f1-0e2a625be2df",
"rows": [
[
"0",
- "1620",
+ "1636",
+ "20161117",
+ "ZEV",
"YPL016W",
"SWI1",
- "20.0",
- "ZEV",
"P",
- "20161117",
- "SMY2266a",
+ "SMY2266c",
+ "20.0",
"glucose",
"30.0"
],
[
"1",
"1628",
+ "20161117",
+ "ZEV",
"YPL016W",
"SWI1",
- "20.0",
- "ZEV",
"P",
- "20161117",
"SMY2266b",
+ "20.0",
"glucose",
"30.0"
],
[
"2",
- "1636",
+ "1620",
+ "20161117",
+ "ZEV",
"YPL016W",
"SWI1",
- "20.0",
- "ZEV",
"P",
- "20161117",
- "SMY2266c",
+ "SMY2266a",
+ "20.0",
"glucose",
"30.0"
]
@@ -3517,13 +3461,13 @@
" \n",
" | \n",
" sample_id | \n",
+ " date | \n",
+ " mechanism | \n",
" regulator_locus_tag | \n",
" regulator_symbol | \n",
- " time | \n",
- " mechanism | \n",
" restriction | \n",
- " date | \n",
" strain | \n",
+ " time | \n",
" carbon_source | \n",
" temperature_celsius | \n",
"
\n",
@@ -3531,40 +3475,40 @@
" \n",
" \n",
" | 0 | \n",
- " 1620 | \n",
+ " 1636 | \n",
+ " 20161117 | \n",
+ " ZEV | \n",
" YPL016W | \n",
" SWI1 | \n",
- " 20.0 | \n",
- " ZEV | \n",
" P | \n",
- " 20161117 | \n",
- " SMY2266a | \n",
+ " SMY2266c | \n",
+ " 20.0 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1628 | \n",
+ " 20161117 | \n",
+ " ZEV | \n",
" YPL016W | \n",
" SWI1 | \n",
- " 20.0 | \n",
- " ZEV | \n",
" P | \n",
- " 20161117 | \n",
" SMY2266b | \n",
+ " 20.0 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 1636 | \n",
+ " 1620 | \n",
+ " 20161117 | \n",
+ " ZEV | \n",
" YPL016W | \n",
" SWI1 | \n",
- " 20.0 | \n",
- " ZEV | \n",
" P | \n",
- " 20161117 | \n",
- " SMY2266c | \n",
+ " SMY2266a | \n",
+ " 20.0 | \n",
" glucose | \n",
" 30.0 | \n",
"
\n",
@@ -3573,18 +3517,18 @@
""
],
"text/plain": [
- " sample_id regulator_locus_tag regulator_symbol time mechanism restriction \\\n",
- "0 1620 YPL016W SWI1 20.0 ZEV P \n",
- "1 1628 YPL016W SWI1 20.0 ZEV P \n",
- "2 1636 YPL016W SWI1 20.0 ZEV P \n",
+ " sample_id date mechanism regulator_locus_tag regulator_symbol \\\n",
+ "0 1636 20161117 ZEV YPL016W SWI1 \n",
+ "1 1628 20161117 ZEV YPL016W SWI1 \n",
+ "2 1620 20161117 ZEV YPL016W SWI1 \n",
"\n",
- " date strain carbon_source temperature_celsius \n",
- "0 20161117 SMY2266a glucose 30.0 \n",
- "1 20161117 SMY2266b glucose 30.0 \n",
- "2 20161117 SMY2266c glucose 30.0 "
+ " restriction strain time carbon_source temperature_celsius \n",
+ "0 P SMY2266c 20.0 glucose 30.0 \n",
+ "1 P SMY2266b 20.0 glucose 30.0 \n",
+ "2 P SMY2266a 20.0 glucose 30.0 "
]
},
- "execution_count": 14,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -3602,7 +3546,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 16,
"id": "89408d2b",
"metadata": {},
"outputs": [
@@ -3610,7 +3554,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "['GCN4', 'MAC1', 'SWI1', 'Z3EV', 'RDS2']\n"
+ "['MAC1', 'SWI1', 'GCN4', 'RDS2']\n"
]
}
],
@@ -3630,7 +3574,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 17,
"id": "5a3b802b",
"metadata": {},
"outputs": [
@@ -3638,7 +3582,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "['GCN4', 'MAC1', 'SWI1', 'Z3EV', 'RDS2', 'GEV']\n"
+ "['MAC1', 'SWI1', 'GCN4', 'RDS2', 'GEV']\n"
]
}
],
@@ -3650,7 +3594,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 18,
"id": "abed8bc2",
"metadata": {},
"outputs": [
@@ -3703,6 +3647,11 @@
"rawType": "float64",
"type": "float"
},
+ {
+ "name": "pr_ranking_column",
+ "rawType": "object",
+ "type": "string"
+ },
{
"name": "binding_repo_dataset",
"rawType": "object",
@@ -3734,862 +3683,912 @@
"type": "string"
}
],
- "ref": "b9dead21-45e7-491d-82d4-a2358af05efe",
+ "ref": "b0a3d538-3af3-4f72-8610-7722a73a7a4f",
"rows": [
[
"0",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;85",
- "2.0",
- "2.0",
- "3.0",
- "2.0",
- "0.0002250900360144",
- "0.004",
+ "BrentLab/harbison_2004;harbison_2004;105",
+ "BrentLab/hughes_2006;overexpression;10",
+ "11.0",
+ "206.0",
+ "12.0",
+ "206.0",
+ "0.041292917490562644",
+ "0.017",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "105",
"harbison",
- "85",
- "BrentLab/Hackett_2020;hackett_2020"
+ "10",
+ "BrentLab/hughes_2006;overexpression"
],
[
"1",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;83",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;108",
+ "BrentLab/hughes_2006;overexpression;11",
+ "60.0",
+ "67.0",
+ "60.0",
+ "67.0",
+ "0.05428351009647073",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "108",
"harbison",
- "83",
- "BrentLab/Hackett_2020;hackett_2020"
+ "11",
+ "BrentLab/hughes_2006;overexpression"
],
[
"2",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;84",
- "2.0",
- "1.0",
- "3.0",
- "1.0",
- "0.0",
- "0.011",
+ "BrentLab/harbison_2004;harbison_2004;109",
+ "BrentLab/hughes_2006;overexpression;11",
+ "27.0",
+ "1265.0",
+ "27.0",
+ "1265.0",
+ "0.12321364371741866",
+ "0.057",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "109",
"harbison",
- "84",
- "BrentLab/Hackett_2020;hackett_2020"
+ "11",
+ "BrentLab/hughes_2006;overexpression"
],
[
"3",
- "BrentLab/harbison_2004;harbison_2004;4",
- "BrentLab/Hackett_2020;hackett_2020;78",
- "487.0",
- "96.0",
- "479.0",
- "92.0",
- "0.4121918908550328",
- "0.576",
+ "BrentLab/harbison_2004;harbison_2004;112",
+ "BrentLab/hughes_2006;overexpression;12",
+ "532.0",
+ "1093.0",
+ "532.0",
+ "1093.0",
+ "0.4363046674390623",
+ "0.092",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "4",
+ "hughes_2006-overexpression",
+ "112",
"harbison",
- "78",
- "BrentLab/Hackett_2020;hackett_2020"
+ "12",
+ "BrentLab/hughes_2006;overexpression"
],
[
"4",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;81",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;113",
+ "BrentLab/hughes_2006;overexpression;12",
+ "10.0",
+ "556.0",
+ "10.0",
+ "556.0",
+ "0.01756663927480034",
+ "0.002",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "113",
"harbison",
- "81",
- "BrentLab/Hackett_2020;hackett_2020"
+ "12",
+ "BrentLab/hughes_2006;overexpression"
],
[
"5",
- "BrentLab/harbison_2004;harbison_2004;2",
- "BrentLab/Hackett_2020;hackett_2020;33",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;118",
+ "BrentLab/hughes_2006;overexpression;13",
+ "574.0",
+ "354.0",
+ "574.0",
+ "354.0",
+ "0.13894295437217577",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "2",
+ "hughes_2006-overexpression",
+ "118",
"harbison",
- "33",
- "BrentLab/Hackett_2020;hackett_2020"
+ "13",
+ "BrentLab/hughes_2006;overexpression"
],
[
"6",
- "BrentLab/harbison_2004;harbison_2004;4",
- "BrentLab/Hackett_2020;hackett_2020;73",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;119",
+ "BrentLab/hughes_2006;overexpression;13",
+ "251.0",
+ "492.0",
+ "251.0",
+ "492.0",
+ "0.11808548603694578",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "4",
+ "hughes_2006-overexpression",
+ "119",
"harbison",
- "73",
- "BrentLab/Hackett_2020;hackett_2020"
+ "13",
+ "BrentLab/hughes_2006;overexpression"
],
[
"7",
- "BrentLab/harbison_2004;harbison_2004;7",
- "BrentLab/Hackett_2020;hackett_2020;47",
- "407.0",
- "310.0",
- "378.0",
- "306.0",
- "0.2038622347205313",
- "0.441",
+ "BrentLab/harbison_2004;harbison_2004;120",
+ "BrentLab/hughes_2006;overexpression;13",
+ "14.0",
+ "2954.0",
+ "14.0",
+ "2954.0",
+ "0.1616346595561947",
+ "1.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "7",
+ "hughes_2006-overexpression",
+ "120",
"harbison",
- "47",
- "BrentLab/Hackett_2020;hackett_2020"
+ "13",
+ "BrentLab/hughes_2006;overexpression"
],
[
"8",
- "BrentLab/harbison_2004;harbison_2004;7",
- "BrentLab/Hackett_2020;hackett_2020;46",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;121",
+ "BrentLab/hughes_2006;overexpression;13",
+ "422.0",
+ "544.0",
+ "423.0",
+ "544.0",
+ "0.401585299611564",
+ "0.001",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "7",
+ "hughes_2006-overexpression",
+ "121",
"harbison",
- "46",
- "BrentLab/Hackett_2020;hackett_2020"
+ "13",
+ "BrentLab/hughes_2006;overexpression"
],
[
"9",
- "BrentLab/harbison_2004;harbison_2004;7",
- "BrentLab/Hackett_2020;hackett_2020;45",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;122",
+ "BrentLab/hughes_2006;overexpression;14",
+ "842.0",
+ "152.0",
+ "842.0",
+ "152.0",
+ "0.37750827352885596",
+ "0.106",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "7",
+ "hughes_2006-overexpression",
+ "122",
"harbison",
- "45",
- "BrentLab/Hackett_2020;hackett_2020"
+ "14",
+ "BrentLab/hughes_2006;overexpression"
],
[
"10",
- "BrentLab/harbison_2004;harbison_2004;8",
- "BrentLab/Hackett_2020;hackett_2020;48",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;124",
+ "BrentLab/hughes_2006;overexpression;15",
+ "402.0",
+ "1417.0",
+ "402.0",
+ "1417.0",
+ "0.279937313245534",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "8",
+ "hughes_2006-overexpression",
+ "124",
"harbison",
- "48",
- "BrentLab/Hackett_2020;hackett_2020"
+ "15",
+ "BrentLab/hughes_2006;overexpression"
],
[
"11",
- "BrentLab/harbison_2004;harbison_2004;2",
- "BrentLab/Hackett_2020;hackett_2020;34",
- "198.0",
- "26.0",
- "193.0",
- "24.0",
- "0.7367526600236447",
- "0.512",
+ "BrentLab/harbison_2004;harbison_2004;137",
+ "BrentLab/hughes_2006;overexpression;17",
+ "29.0",
+ "5.0",
+ "29.0",
+ "5.0",
+ "0.005954520941937803",
+ "0.043",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "2",
+ "hughes_2006-overexpression",
+ "137",
"harbison",
- "34",
- "BrentLab/Hackett_2020;hackett_2020"
+ "17",
+ "BrentLab/hughes_2006;overexpression"
],
[
"12",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;88",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;141",
+ "BrentLab/hughes_2006;overexpression;18",
+ "653.0",
+ "1620.0",
+ "654.0",
+ "1620.0",
+ "0.442997844156436",
+ "0.812",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "141",
"harbison",
- "88",
- "BrentLab/Hackett_2020;hackett_2020"
+ "18",
+ "BrentLab/hughes_2006;overexpression"
],
[
"13",
- "BrentLab/harbison_2004;harbison_2004;4",
- "BrentLab/Hackett_2020;hackett_2020;79",
- "278.0",
- "82.0",
- "275.0",
- "76.0",
- "0.3669436052366566",
- "0.531",
+ "BrentLab/harbison_2004;harbison_2004;142",
+ "BrentLab/hughes_2006;overexpression;18",
+ "497.0",
+ "25.0",
+ "497.0",
+ "25.0",
+ "0.3308129606327521",
+ "0.921",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "4",
+ "hughes_2006-overexpression",
+ "142",
"harbison",
- "79",
- "BrentLab/Hackett_2020;hackett_2020"
+ "18",
+ "BrentLab/hughes_2006;overexpression"
],
[
"14",
- "BrentLab/harbison_2004;harbison_2004;4",
- "BrentLab/Hackett_2020;hackett_2020;74",
- "386.0",
- "2.0",
- "381.0",
- "2.0",
- "0.0478033736153071",
- "0.596",
+ "BrentLab/harbison_2004;harbison_2004;150",
+ "BrentLab/hughes_2006;overexpression;19",
+ "91.0",
+ "1948.0",
+ "91.0",
+ "1948.0",
+ "0.2949755757517485",
+ "0.578",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "4",
+ "hughes_2006-overexpression",
+ "150",
"harbison",
- "74",
- "BrentLab/Hackett_2020;hackett_2020"
+ "19",
+ "BrentLab/hughes_2006;overexpression"
],
[
"15",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;87",
- "2.0",
- "2.0",
- "3.0",
- "2.0",
- "0.0002250900360144",
- "0.01",
+ "BrentLab/harbison_2004;harbison_2004;151",
+ "BrentLab/hughes_2006;overexpression;21",
+ "57.0",
+ "386.0",
+ "57.0",
+ "386.0",
+ "0.0656826352687399",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "151",
"harbison",
- "87",
- "BrentLab/Hackett_2020;hackett_2020"
+ "21",
+ "BrentLab/hughes_2006;overexpression"
],
[
"16",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;82",
- "2.0",
- "2.0",
- "3.0",
- "2.0",
- "0.0002250900360144",
- "0.005",
+ "BrentLab/harbison_2004;harbison_2004;152",
+ "BrentLab/hughes_2006;overexpression;21",
+ "272.0",
+ "526.0",
+ "272.0",
+ "526.0",
+ "0.2405177062735934",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "152",
"harbison",
- "82",
- "BrentLab/Hackett_2020;hackett_2020"
+ "21",
+ "BrentLab/hughes_2006;overexpression"
],
[
"17",
- "BrentLab/harbison_2004;harbison_2004;2",
- "BrentLab/Hackett_2020;hackett_2020;40",
- "233.0",
- "887.0",
- "228.0",
- "853.0",
- "0.4419109947643979",
- "0.306",
+ "BrentLab/harbison_2004;harbison_2004;153",
+ "BrentLab/hughes_2006;overexpression;21",
+ "186.0",
+ "1060.0",
+ "186.0",
+ "1060.0",
+ "0.20770457061222172",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "2",
+ "hughes_2006-overexpression",
+ "153",
"harbison",
- "40",
- "BrentLab/Hackett_2020;hackett_2020"
+ "21",
+ "BrentLab/hughes_2006;overexpression"
],
[
"18",
- "BrentLab/harbison_2004;harbison_2004;2",
- "BrentLab/Hackett_2020;hackett_2020;37",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;154",
+ "BrentLab/hughes_2006;overexpression;21",
+ "65.0",
+ "398.0",
+ "65.0",
+ "398.0",
+ "0.10461443622068167",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "2",
+ "hughes_2006-overexpression",
+ "154",
"harbison",
- "37",
- "BrentLab/Hackett_2020;hackett_2020"
+ "21",
+ "BrentLab/hughes_2006;overexpression"
],
[
"19",
- "BrentLab/harbison_2004;harbison_2004;3",
- "BrentLab/Hackett_2020;hackett_2020;86",
- "2.0",
- "2.0",
- "3.0",
- "2.0",
- "0.0002250900360144",
- "0.014",
+ "BrentLab/harbison_2004;harbison_2004;157",
+ "BrentLab/hughes_2006;overexpression;22",
+ "482.0",
+ "176.0",
+ "482.0",
+ "176.0",
+ "0.14485664209958654",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "3",
+ "hughes_2006-overexpression",
+ "157",
"harbison",
- "86",
- "BrentLab/Hackett_2020;hackett_2020"
+ "22",
+ "BrentLab/hughes_2006;overexpression"
],
[
"20",
- "BrentLab/harbison_2004;harbison_2004;4",
- "BrentLab/Hackett_2020;hackett_2020;75",
- "386.0",
- "4.0",
- "381.0",
- "4.0",
- "0.1752790365894595",
- "0.871",
+ "BrentLab/harbison_2004;harbison_2004;158",
+ "BrentLab/hughes_2006;overexpression;22",
+ "354.0",
+ "215.0",
+ "354.0",
+ "215.0",
+ "0.12060713643717419",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "4",
+ "hughes_2006-overexpression",
+ "158",
"harbison",
- "75",
- "BrentLab/Hackett_2020;hackett_2020"
+ "22",
+ "BrentLab/hughes_2006;overexpression"
],
[
"21",
- "BrentLab/harbison_2004;harbison_2004;4",
- "BrentLab/Hackett_2020;hackett_2020;77",
- "487.0",
- "15.0",
- "479.0",
- "13.0",
- "0.1591137965760322",
- "0.23",
+ "BrentLab/harbison_2004;harbison_2004;159",
+ "BrentLab/hughes_2006;overexpression;22",
+ "550.0",
+ "611.0",
+ "550.0",
+ "611.0",
+ "0.2924649934604871",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "4",
+ "hughes_2006-overexpression",
+ "159",
"harbison",
- "77",
- "BrentLab/Hackett_2020;hackett_2020"
+ "22",
+ "BrentLab/hughes_2006;overexpression"
],
[
"22",
- "BrentLab/harbison_2004;harbison_2004;2",
- "BrentLab/Hackett_2020;hackett_2020;38",
- "28.0",
- "394.0",
- "29.0",
- "375.0",
- "0.1464068569498395",
- "0.309",
+ "BrentLab/harbison_2004;harbison_2004;160",
+ "BrentLab/hughes_2006;overexpression;22",
+ "77.0",
+ "625.0",
+ "77.0",
+ "625.0",
+ "0.1062495373846105",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "2",
+ "hughes_2006-overexpression",
+ "160",
"harbison",
- "38",
- "BrentLab/Hackett_2020;hackett_2020"
+ "22",
+ "BrentLab/hughes_2006;overexpression"
],
[
"23",
- "BrentLab/harbison_2004;harbison_2004;2",
- "BrentLab/Hackett_2020;hackett_2020;36",
- "242.0",
- "239.0",
- "237.0",
- "230.0",
- "0.4474384543548884",
- "0.644",
+ "BrentLab/harbison_2004;harbison_2004;161",
+ "BrentLab/hughes_2006;overexpression;23",
+ "37.0",
+ "3236.0",
+ "37.0",
+ "3236.0",
+ "0.014875454821573575",
+ "0.456",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "2",
+ "hughes_2006-overexpression",
+ "161",
"harbison",
- "36",
- "BrentLab/Hackett_2020;hackett_2020"
+ "23",
+ "BrentLab/hughes_2006;overexpression"
],
[
"24",
- "BrentLab/harbison_2004;harbison_2004;2",
- "BrentLab/Hackett_2020;hackett_2020;35",
- "12.0",
- "136.0",
- "12.0",
- "129.0",
- "0.1014820131734504",
- "0.411",
+ "BrentLab/harbison_2004;harbison_2004;162",
+ "BrentLab/hughes_2006;overexpression;24",
+ "417.0",
+ "1082.0",
+ "417.0",
+ "1082.0",
+ "0.22690440962955793",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "2",
+ "hughes_2006-overexpression",
+ "162",
"harbison",
- "35",
- "BrentLab/Hackett_2020;hackett_2020"
+ "24",
+ "BrentLab/hughes_2006;overexpression"
],
[
"25",
- "BrentLab/harbison_2004;harbison_2004;2",
- "BrentLab/Hackett_2020;hackett_2020;39",
- "236.0",
- "462.0",
- "231.0",
- "442.0",
- "0.4406392501266677",
- "0.536",
+ "BrentLab/harbison_2004;harbison_2004;163",
+ "BrentLab/hughes_2006;overexpression;24",
+ "896.0",
+ "710.0",
+ "896.0",
+ "710.0",
+ "0.41161010647006896",
+ "0.002",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "2",
+ "hughes_2006-overexpression",
+ "163",
"harbison",
- "39",
- "BrentLab/Hackett_2020;hackett_2020"
+ "24",
+ "BrentLab/hughes_2006;overexpression"
],
[
"26",
- "BrentLab/harbison_2004;harbison_2004;5",
- "BrentLab/Hackett_2020;hackett_2020;65",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;174",
+ "BrentLab/hughes_2006;overexpression;26",
+ "55.0",
+ "2135.0",
+ "55.0",
+ "2135.0",
+ "0.08879402276624998",
+ "0.006",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "5",
+ "hughes_2006-overexpression",
+ "174",
"harbison",
- "65",
- "BrentLab/Hackett_2020;hackett_2020"
+ "26",
+ "BrentLab/hughes_2006;overexpression"
],
[
"27",
- "BrentLab/harbison_2004;harbison_2004;4",
- "BrentLab/Hackett_2020;hackett_2020;80",
- "386.0",
- "12.0",
- "381.0",
- "11.0",
- "0.1530190500167841",
- "0.26",
+ "BrentLab/harbison_2004;harbison_2004;175",
+ "BrentLab/hughes_2006;overexpression;27",
+ "79.0",
+ "354.0",
+ "79.0",
+ "354.0",
+ "0.36280804176948345",
+ "0.485",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "4",
+ "hughes_2006-overexpression",
+ "175",
"harbison",
- "80",
- "BrentLab/Hackett_2020;hackett_2020"
+ "27",
+ "BrentLab/hughes_2006;overexpression"
],
[
"28",
- "BrentLab/harbison_2004;harbison_2004;4",
- "BrentLab/Hackett_2020;hackett_2020;76",
- "386.0",
- "13.0",
- "381.0",
- "13.0",
- "0.3335221550855992",
- "0.723",
+ "BrentLab/harbison_2004;harbison_2004;176",
+ "BrentLab/hughes_2006;overexpression;27",
+ "1.0",
+ "604.0",
+ "1.0",
+ "604.0",
+ "0.0",
+ "0.981",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "4",
+ "hughes_2006-overexpression",
+ "176",
"harbison",
- "76",
- "BrentLab/Hackett_2020;hackett_2020"
+ "27",
+ "BrentLab/hughes_2006;overexpression"
],
[
"29",
- "BrentLab/harbison_2004;harbison_2004;10",
- "BrentLab/Hackett_2020;hackett_2020;48",
- "467.0",
- "60.0",
- "454.0",
- "60.0",
- "0.1983655120981107",
- "0.035",
+ "BrentLab/harbison_2004;harbison_2004;177",
+ "BrentLab/hughes_2006;overexpression;28",
+ "10.0",
+ "3654.0",
+ "10.0",
+ "3654.0",
+ "0.0",
+ "1.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "10",
+ "hughes_2006-overexpression",
+ "177",
"harbison",
- "48",
- "BrentLab/Hackett_2020;hackett_2020"
+ "28",
+ "BrentLab/hughes_2006;overexpression"
],
[
"30",
- "BrentLab/harbison_2004;harbison_2004;10",
- "BrentLab/Hackett_2020;hackett_2020;47",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;178",
+ "BrentLab/hughes_2006;overexpression;28",
+ "20.0",
+ "61.0",
+ "22.0",
+ "61.0",
+ "0.10253010965306489",
+ "0.707",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "10",
+ "hughes_2006-overexpression",
+ "178",
"harbison",
- "47",
- "BrentLab/Hackett_2020;hackett_2020"
+ "28",
+ "BrentLab/hughes_2006;overexpression"
],
[
"31",
- "BrentLab/harbison_2004;harbison_2004;10",
- "BrentLab/Hackett_2020;hackett_2020;46",
- "284.0",
- "47.0",
- "278.0",
- "46.0",
- "0.0992715955737997",
- "0.003",
+ "BrentLab/harbison_2004;harbison_2004;179",
+ "BrentLab/hughes_2006;overexpression;28",
+ "6.0",
+ "1128.0",
+ "6.0",
+ "1128.0",
+ "0.15157064533525078",
+ "0.968",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "10",
+ "hughes_2006-overexpression",
+ "179",
"harbison",
- "46",
- "BrentLab/Hackett_2020;hackett_2020"
+ "28",
+ "BrentLab/hughes_2006;overexpression"
],
[
"32",
- "BrentLab/harbison_2004;harbison_2004;11",
- "BrentLab/Hackett_2020;hackett_2020;48",
- "472.0",
- "1.0",
- "459.0",
- "1.0",
- "0.0",
- "0.915",
+ "BrentLab/harbison_2004;harbison_2004;191",
+ "BrentLab/hughes_2006;overexpression;29",
+ "342.0",
+ "174.0",
+ "342.0",
+ "174.0",
+ "0.42452813230271436",
+ "0.452",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "11",
+ "hughes_2006-overexpression",
+ "191",
"harbison",
- "48",
- "BrentLab/Hackett_2020;hackett_2020"
+ "29",
+ "BrentLab/hughes_2006;overexpression"
],
[
"33",
- "BrentLab/harbison_2004;harbison_2004;7",
- "BrentLab/Hackett_2020;hackett_2020;41",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;192",
+ "BrentLab/hughes_2006;overexpression;30",
+ "132.0",
+ "227.0",
+ "132.0",
+ "227.0",
+ "0.22362783869614716",
+ "0.002",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "7",
+ "hughes_2006-overexpression",
+ "192",
"harbison",
- "41",
- "BrentLab/Hackett_2020;hackett_2020"
+ "30",
+ "BrentLab/hughes_2006;overexpression"
],
[
"34",
- "BrentLab/harbison_2004;harbison_2004;16",
- "BrentLab/Hackett_2020;hackett_2020;89",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;193",
+ "BrentLab/hughes_2006;overexpression;30",
+ "322.0",
+ "442.0",
+ "322.0",
+ "442.0",
+ "0.40950351528951207",
+ "0.021",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "16",
+ "hughes_2006-overexpression",
+ "193",
"harbison",
- "89",
- "BrentLab/Hackett_2020;hackett_2020"
+ "30",
+ "BrentLab/hughes_2006;overexpression"
],
[
"35",
- "BrentLab/harbison_2004;harbison_2004;8",
- "BrentLab/Hackett_2020;hackett_2020;41",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;194",
+ "BrentLab/hughes_2006;overexpression;30",
+ "76.0",
+ "43.0",
+ "76.0",
+ "43.0",
+ "0.12124752831206184",
+ "0.395",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "8",
+ "hughes_2006-overexpression",
+ "194",
"harbison",
- "41",
- "BrentLab/Hackett_2020;hackett_2020"
+ "30",
+ "BrentLab/hughes_2006;overexpression"
],
[
"36",
- "BrentLab/harbison_2004;harbison_2004;7",
- "BrentLab/Hackett_2020;hackett_2020;43",
- "2.0",
- "330.0",
- "2.0",
- "318.0",
- "0.0",
- "0.195",
+ "BrentLab/harbison_2004;harbison_2004;201",
+ "BrentLab/hughes_2006;overexpression;31",
+ "136.0",
+ "1104.0",
+ "136.0",
+ "1104.0",
+ "0.2752121157648751",
+ "0.001",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "7",
+ "hughes_2006-overexpression",
+ "201",
"harbison",
- "43",
- "BrentLab/Hackett_2020;hackett_2020"
+ "31",
+ "BrentLab/hughes_2006;overexpression"
],
[
"37",
- "BrentLab/harbison_2004;harbison_2004;16",
- "BrentLab/Hackett_2020;hackett_2020;91",
- "9.0",
- "1.0",
- "9.0",
- "1.0",
+ "BrentLab/harbison_2004;harbison_2004;202",
+ "BrentLab/hughes_2006;overexpression;31",
+ "287.0",
+ "36.0",
+ "287.0",
+ "36.0",
+ "0.06401671759841812",
"0.0",
- "0.019",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "16",
+ "hughes_2006-overexpression",
+ "202",
"harbison",
- "91",
- "BrentLab/Hackett_2020;hackett_2020"
+ "31",
+ "BrentLab/hughes_2006;overexpression"
],
[
"38",
- "BrentLab/harbison_2004;harbison_2004;17",
- "BrentLab/Hackett_2020;hackett_2020;91",
- "2.0",
- "1.0",
- "2.0",
- "1.0",
- "0.0",
- "0.008",
+ "BrentLab/harbison_2004;harbison_2004;203",
+ "BrentLab/hughes_2006;overexpression;31",
+ "88.0",
+ "41.0",
+ "88.0",
+ "41.0",
+ "0.06563294471122981",
+ "0.003",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "17",
+ "hughes_2006-overexpression",
+ "203",
"harbison",
- "91",
- "BrentLab/Hackett_2020;hackett_2020"
+ "31",
+ "BrentLab/hughes_2006;overexpression"
],
[
"39",
- "BrentLab/harbison_2004;harbison_2004;8",
- "BrentLab/Hackett_2020;hackett_2020;43",
- "290.0",
- "412.0",
- "278.0",
- "386.0",
- "0.4521656634210855",
- "0.208",
+ "BrentLab/harbison_2004;harbison_2004;204",
+ "BrentLab/hughes_2006;overexpression;31",
+ "318.0",
+ "1948.0",
+ "319.0",
+ "1948.0",
+ "0.380107954958676",
+ "0.57",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "8",
+ "hughes_2006-overexpression",
+ "204",
"harbison",
- "43",
- "BrentLab/Hackett_2020;hackett_2020"
+ "31",
+ "BrentLab/hughes_2006;overexpression"
],
[
"40",
- "BrentLab/harbison_2004;harbison_2004;5",
- "BrentLab/Hackett_2020;hackett_2020;66",
- "398.0",
- "16.0",
- "390.0",
- "15.0",
- "0.2406042358803986",
- "0.431",
+ "BrentLab/harbison_2004;harbison_2004;205",
+ "BrentLab/hughes_2006;overexpression;31",
+ "467.0",
+ "646.0",
+ "467.0",
+ "646.0",
+ "0.42659723019346846",
+ "0.006",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "5",
+ "hughes_2006-overexpression",
+ "205",
"harbison",
- "66",
- "BrentLab/Hackett_2020;hackett_2020"
+ "31",
+ "BrentLab/hughes_2006;overexpression"
],
[
"41",
- "BrentLab/harbison_2004;harbison_2004;7",
- "BrentLab/Hackett_2020;hackett_2020;42",
- "122.0",
- "212.0",
- "120.0",
- "206.0",
- "0.3447911486822476",
- "0.49",
+ "BrentLab/harbison_2004;harbison_2004;207",
+ "BrentLab/hughes_2006;overexpression;32",
+ "55.0",
+ "230.0",
+ "56.0",
+ "230.0",
+ "0.3233042722751513",
+ "0.796",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "7",
+ "hughes_2006-overexpression",
+ "207",
"harbison",
- "42",
- "BrentLab/Hackett_2020;hackett_2020"
+ "32",
+ "BrentLab/hughes_2006;overexpression"
],
[
"42",
- "BrentLab/harbison_2004;harbison_2004;5",
- "BrentLab/Hackett_2020;hackett_2020;72",
- "346.0",
- "18.0",
- "338.0",
- "16.0",
- "0.22671996124031",
- "0.528",
+ "BrentLab/harbison_2004;harbison_2004;208",
+ "BrentLab/hughes_2006;overexpression;32",
+ "25.0",
+ "126.0",
+ "25.0",
+ "126.0",
+ "0.0489281862304512",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "5",
+ "hughes_2006-overexpression",
+ "208",
"harbison",
- "72",
- "BrentLab/Hackett_2020;hackett_2020"
+ "32",
+ "BrentLab/hughes_2006;overexpression"
],
[
"43",
- "BrentLab/harbison_2004;harbison_2004;5",
- "BrentLab/Hackett_2020;hackett_2020;69",
- "118.0",
- "120.0",
- "115.0",
- "113.0",
- "0.3139880952380952",
- "0.454",
+ "BrentLab/harbison_2004;harbison_2004;209",
+ "BrentLab/hughes_2006;overexpression;32",
+ "122.0",
+ "688.0",
+ "122.0",
+ "688.0",
+ "0.10777396924484826",
+ "0.0",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "5",
+ "hughes_2006-overexpression",
+ "209",
"harbison",
- "69",
- "BrentLab/Hackett_2020;hackett_2020"
+ "32",
+ "BrentLab/hughes_2006;overexpression"
],
[
"44",
- "BrentLab/harbison_2004;harbison_2004;20",
- "BrentLab/Hackett_2020;hackett_2020;99",
- "3.0",
- "1.0",
- "4.0",
- "1.0",
- "0.0",
- "0.006",
+ "BrentLab/harbison_2004;harbison_2004;210",
+ "BrentLab/hughes_2006;overexpression;33",
+ "97.0",
+ "2113.0",
+ "97.0",
+ "2113.0",
+ "0.30052307036231024",
+ "0.807",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "20",
+ "hughes_2006-overexpression",
+ "210",
"harbison",
- "99",
- "BrentLab/Hackett_2020;hackett_2020"
+ "33",
+ "BrentLab/hughes_2006;overexpression"
],
[
"45",
- "BrentLab/harbison_2004;harbison_2004;5",
- "BrentLab/Hackett_2020;hackett_2020;70",
- "260.0",
- "17.0",
- "256.0",
- "17.0",
- "0.1850671373200443",
- "0.455",
+ "BrentLab/harbison_2004;harbison_2004;219",
+ "BrentLab/hughes_2006;overexpression;34",
+ "172.0",
+ "245.0",
+ "172.0",
+ "245.0",
+ "0.41551695727724847",
+ "0.505",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "5",
+ "hughes_2006-overexpression",
+ "219",
"harbison",
- "70",
- "BrentLab/Hackett_2020;hackett_2020"
+ "34",
+ "BrentLab/hughes_2006;overexpression"
],
[
"46",
- "BrentLab/harbison_2004;harbison_2004;5",
- "BrentLab/Hackett_2020;hackett_2020;67",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;225",
+ "BrentLab/hughes_2006;overexpression;35",
+ "314.0",
+ "12.0",
+ "314.0",
+ "12.0",
+ "0.15336823656300558",
+ "0.877",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "5",
+ "hughes_2006-overexpression",
+ "225",
"harbison",
- "67",
- "BrentLab/Hackett_2020;hackett_2020"
+ "35",
+ "BrentLab/hughes_2006;overexpression"
],
[
"47",
- "BrentLab/harbison_2004;harbison_2004;20",
- "BrentLab/Hackett_2020;hackett_2020;97",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;228",
+ "BrentLab/hughes_2006;overexpression;36",
+ "358.0",
+ "2316.0",
+ "358.0",
+ "2316.0",
+ "0.33853600995025945",
+ "0.804",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "20",
+ "hughes_2006-overexpression",
+ "228",
"harbison",
- "97",
- "BrentLab/Hackett_2020;hackett_2020"
+ "36",
+ "BrentLab/hughes_2006;overexpression"
],
[
"48",
- "BrentLab/harbison_2004;harbison_2004;5",
- "BrentLab/Hackett_2020;hackett_2020;68",
- "260.0",
- "2.0",
- "256.0",
- "2.0",
- "0.0317379568106312",
- "0.647",
+ "BrentLab/harbison_2004;harbison_2004;231",
+ "BrentLab/hughes_2006;overexpression;38",
+ "77.0",
+ "362.0",
+ "77.0",
+ "362.0",
+ "0.32227814728264126",
+ "0.36",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "5",
+ "hughes_2006-overexpression",
+ "231",
"harbison",
- "68",
- "BrentLab/Hackett_2020;hackett_2020"
+ "38",
+ "BrentLab/hughes_2006;overexpression"
],
[
"49",
- "BrentLab/harbison_2004;harbison_2004;10",
- "BrentLab/Hackett_2020;hackett_2020;41",
- null,
- null,
- null,
- null,
- null,
- null,
+ "BrentLab/harbison_2004;harbison_2004;232",
+ "BrentLab/hughes_2006;overexpression;38",
+ "40.0",
+ "3302.0",
+ "41.0",
+ "3302.0",
+ "0.01832419557792558",
+ "0.593",
+ "log2fc",
"harbison_2004-harbison_2004",
- "Hackett_2020-hackett_2020",
- "10",
+ "hughes_2006-overexpression",
+ "232",
"harbison",
- "41",
- "BrentLab/Hackett_2020;hackett_2020"
+ "38",
+ "BrentLab/hughes_2006;overexpression"
]
],
"shape": {
- "columns": 14,
- "rows": 9604
+ "columns": 15,
+ "rows": 29804
}
},
"text/html": [
@@ -4619,6 +4618,7 @@
" perturbation_set_size | \n",
" dto_fdr | \n",
" dto_empirical_pvalue | \n",
+ " pr_ranking_column | \n",
" binding_repo_dataset | \n",
" perturbation_repo_dataset | \n",
" binding_id_id | \n",
@@ -4630,88 +4630,93 @@
" \n",
" \n",
" | 0 | \n",
- " BrentLab/harbison_2004;harbison_2004;3 | \n",
- " BrentLab/Hackett_2020;hackett_2020;85 | \n",
- " 2.0 | \n",
- " 2.0 | \n",
- " 3.0 | \n",
- " 2.0 | \n",
- " 0.000225 | \n",
- " 0.004 | \n",
+ " BrentLab/harbison_2004;harbison_2004;105 | \n",
+ " BrentLab/hughes_2006;overexpression;10 | \n",
+ " 11.0 | \n",
+ " 206.0 | \n",
+ " 12.0 | \n",
+ " 206.0 | \n",
+ " 0.041293 | \n",
+ " 0.017 | \n",
+ " log2fc | \n",
" harbison_2004-harbison_2004 | \n",
- " Hackett_2020-hackett_2020 | \n",
- " 3 | \n",
+ " hughes_2006-overexpression | \n",
+ " 105 | \n",
" harbison | \n",
- " 85 | \n",
- " BrentLab/Hackett_2020;hackett_2020 | \n",
+ " 10 | \n",
+ " BrentLab/hughes_2006;overexpression | \n",
"
\n",
" \n",
" | 1 | \n",
- " BrentLab/harbison_2004;harbison_2004;3 | \n",
- " BrentLab/Hackett_2020;hackett_2020;83 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " BrentLab/harbison_2004;harbison_2004;108 | \n",
+ " BrentLab/hughes_2006;overexpression;11 | \n",
+ " 60.0 | \n",
+ " 67.0 | \n",
+ " 60.0 | \n",
+ " 67.0 | \n",
+ " 0.054284 | \n",
+ " 0.000 | \n",
+ " log2fc | \n",
" harbison_2004-harbison_2004 | \n",
- " Hackett_2020-hackett_2020 | \n",
- " 3 | \n",
+ " hughes_2006-overexpression | \n",
+ " 108 | \n",
" harbison | \n",
- " 83 | \n",
- " BrentLab/Hackett_2020;hackett_2020 | \n",
+ " 11 | \n",
+ " BrentLab/hughes_2006;overexpression | \n",
"
\n",
" \n",
" | 2 | \n",
- " BrentLab/harbison_2004;harbison_2004;3 | \n",
- " BrentLab/Hackett_2020;hackett_2020;84 | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 3.0 | \n",
- " 1.0 | \n",
- " 0.000000 | \n",
- " 0.011 | \n",
+ " BrentLab/harbison_2004;harbison_2004;109 | \n",
+ " BrentLab/hughes_2006;overexpression;11 | \n",
+ " 27.0 | \n",
+ " 1265.0 | \n",
+ " 27.0 | \n",
+ " 1265.0 | \n",
+ " 0.123214 | \n",
+ " 0.057 | \n",
+ " log2fc | \n",
" harbison_2004-harbison_2004 | \n",
- " Hackett_2020-hackett_2020 | \n",
- " 3 | \n",
+ " hughes_2006-overexpression | \n",
+ " 109 | \n",
" harbison | \n",
- " 84 | \n",
- " BrentLab/Hackett_2020;hackett_2020 | \n",
+ " 11 | \n",
+ " BrentLab/hughes_2006;overexpression | \n",
"
\n",
" \n",
" | 3 | \n",
- " BrentLab/harbison_2004;harbison_2004;4 | \n",
- " BrentLab/Hackett_2020;hackett_2020;78 | \n",
- " 487.0 | \n",
- " 96.0 | \n",
- " 479.0 | \n",
- " 92.0 | \n",
- " 0.412192 | \n",
- " 0.576 | \n",
+ " BrentLab/harbison_2004;harbison_2004;112 | \n",
+ " BrentLab/hughes_2006;overexpression;12 | \n",
+ " 532.0 | \n",
+ " 1093.0 | \n",
+ " 532.0 | \n",
+ " 1093.0 | \n",
+ " 0.436305 | \n",
+ " 0.092 | \n",
+ " log2fc | \n",
" harbison_2004-harbison_2004 | \n",
- " Hackett_2020-hackett_2020 | \n",
- " 4 | \n",
+ " hughes_2006-overexpression | \n",
+ " 112 | \n",
" harbison | \n",
- " 78 | \n",
- " BrentLab/Hackett_2020;hackett_2020 | \n",
+ " 12 | \n",
+ " BrentLab/hughes_2006;overexpression | \n",
"
\n",
" \n",
" | 4 | \n",
- " BrentLab/harbison_2004;harbison_2004;3 | \n",
- " BrentLab/Hackett_2020;hackett_2020;81 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " BrentLab/harbison_2004;harbison_2004;113 | \n",
+ " BrentLab/hughes_2006;overexpression;12 | \n",
+ " 10.0 | \n",
+ " 556.0 | \n",
+ " 10.0 | \n",
+ " 556.0 | \n",
+ " 0.017567 | \n",
+ " 0.002 | \n",
+ " log2fc | \n",
" harbison_2004-harbison_2004 | \n",
- " Hackett_2020-hackett_2020 | \n",
- " 3 | \n",
+ " hughes_2006-overexpression | \n",
+ " 113 | \n",
" harbison | \n",
- " 81 | \n",
- " BrentLab/Hackett_2020;hackett_2020 | \n",
+ " 12 | \n",
+ " BrentLab/hughes_2006;overexpression | \n",
"
\n",
" \n",
" | ... | \n",
@@ -4729,193 +4734,199 @@
" ... | \n",
" ... | \n",
" ... | \n",
+ " ... | \n",
"
\n",
" \n",
- " | 9599 | \n",
- " BrentLab/callingcards;annotated_features;804 | \n",
- " BrentLab/kemmeren_2014;kemmeren_2014;901 | \n",
- " 14.0 | \n",
- " 39.0 | \n",
- " 13.0 | \n",
- " 39.0 | \n",
- " 0.000879 | \n",
+ " 29799 | \n",
+ " BrentLab/callingcards;annotated_features_combi... | \n",
+ " BrentLab/kemmeren_2014;kemmeren_2014;784 | \n",
+ " 154.0 | \n",
+ " 905.0 | \n",
+ " 154.0 | \n",
+ " 905.0 | \n",
+ " 0.090665 | \n",
" 0.000 | \n",
- " callingcards-annotated_features | \n",
+ " pvalue | \n",
+ " callingcards-annotated_features_combined | \n",
" kemmeren_2014-kemmeren_2014 | \n",
- " 804 | \n",
- " BrentLab/callingcards;annotated_features | \n",
- " 901 | \n",
+ " 724-692-688 | \n",
+ " BrentLab/callingcards;annotated_features_combined | \n",
+ " 784 | \n",
" kemmeren | \n",
"
\n",
" \n",
- " | 9600 | \n",
- " BrentLab/callingcards;annotated_features;805 | \n",
- " BrentLab/kemmeren_2014;kemmeren_2014;1053 | \n",
- " 18.0 | \n",
- " 278.0 | \n",
- " 17.0 | \n",
- " 171.0 | \n",
- " 0.001455 | \n",
- " 0.000 | \n",
- " callingcards-annotated_features | \n",
+ " 29800 | \n",
+ " BrentLab/callingcards;annotated_features_combi... | \n",
+ " BrentLab/kemmeren_2014;kemmeren_2014;666 | \n",
+ " 215.0 | \n",
+ " 108.0 | \n",
+ " 215.0 | \n",
+ " 108.0 | \n",
+ " 0.075036 | \n",
+ " 0.005 | \n",
+ " pvalue | \n",
+ " callingcards-annotated_features_combined | \n",
" kemmeren_2014-kemmeren_2014 | \n",
- " 805 | \n",
- " BrentLab/callingcards;annotated_features | \n",
- " 1053 | \n",
+ " 725-435-395 | \n",
+ " BrentLab/callingcards;annotated_features_combined | \n",
+ " 666 | \n",
" kemmeren | \n",
"
\n",
" \n",
- " | 9601 | \n",
- " BrentLab/callingcards;annotated_features;808 | \n",
- " BrentLab/kemmeren_2014;kemmeren_2014;218 | \n",
- " 20.0 | \n",
- " 57.0 | \n",
- " 19.0 | \n",
- " 27.0 | \n",
- " 0.003116 | \n",
- " 0.000 | \n",
- " callingcards-annotated_features | \n",
+ " 29801 | \n",
+ " BrentLab/callingcards;annotated_features_combi... | \n",
+ " BrentLab/kemmeren_2014;kemmeren_2014;271 | \n",
+ " 221.0 | \n",
+ " 925.0 | \n",
+ " 221.0 | \n",
+ " 925.0 | \n",
+ " 0.403484 | \n",
+ " 0.126 | \n",
+ " pvalue | \n",
+ " callingcards-annotated_features_combined | \n",
" kemmeren_2014-kemmeren_2014 | \n",
- " 808 | \n",
- " BrentLab/callingcards;annotated_features | \n",
- " 218 | \n",
+ " 726-445-424 | \n",
+ " BrentLab/callingcards;annotated_features_combined | \n",
+ " 271 | \n",
" kemmeren | \n",
"
\n",
" \n",
- " | 9602 | \n",
- " BrentLab/callingcards;annotated_features;806 | \n",
- " BrentLab/kemmeren_2014;kemmeren_2014;1023 | \n",
- " 10.0 | \n",
- " 9.0 | \n",
- " 11.0 | \n",
- " 9.0 | \n",
- " 0.000000 | \n",
- " 0.000 | \n",
- " callingcards-annotated_features | \n",
+ " 29802 | \n",
+ " BrentLab/callingcards;annotated_features_combi... | \n",
+ " BrentLab/kemmeren_2014;kemmeren_2014;1077 | \n",
+ " 281.0 | \n",
+ " 73.0 | \n",
+ " 283.0 | \n",
+ " 77.0 | \n",
+ " 0.095948 | \n",
+ " 0.174 | \n",
+ " pvalue | \n",
+ " callingcards-annotated_features_combined | \n",
" kemmeren_2014-kemmeren_2014 | \n",
- " 806 | \n",
- " BrentLab/callingcards;annotated_features | \n",
- " 1023 | \n",
+ " 79-33 | \n",
+ " BrentLab/callingcards;annotated_features_combined | \n",
+ " 1077 | \n",
" kemmeren | \n",
"
\n",
" \n",
- " | 9603 | \n",
- " BrentLab/callingcards;annotated_features;809 | \n",
- " BrentLab/kemmeren_2014;kemmeren_2014;913 | \n",
- " 150.0 | \n",
- " 221.0 | \n",
- " 140.0 | \n",
- " 206.0 | \n",
- " 0.116890 | \n",
+ " 29803 | \n",
+ " BrentLab/callingcards;annotated_features_combi... | \n",
+ " BrentLab/kemmeren_2014;kemmeren_2014;963 | \n",
+ " 526.0 | \n",
+ " 227.0 | \n",
+ " 527.0 | \n",
+ " 227.0 | \n",
+ " 0.064919 | \n",
" 0.000 | \n",
- " callingcards-annotated_features | \n",
+ " pvalue | \n",
+ " callingcards-annotated_features_combined | \n",
" kemmeren_2014-kemmeren_2014 | \n",
- " 809 | \n",
- " BrentLab/callingcards;annotated_features | \n",
- " 913 | \n",
+ " 96-49 | \n",
+ " BrentLab/callingcards;annotated_features_combined | \n",
+ " 963 | \n",
" kemmeren | \n",
"
\n",
" \n",
"\n",
- "9604 rows × 14 columns
\n",
+ "29804 rows × 15 columns
\n",
""
],
"text/plain": [
- " binding_id \\\n",
- "0 BrentLab/harbison_2004;harbison_2004;3 \n",
- "1 BrentLab/harbison_2004;harbison_2004;3 \n",
- "2 BrentLab/harbison_2004;harbison_2004;3 \n",
- "3 BrentLab/harbison_2004;harbison_2004;4 \n",
- "4 BrentLab/harbison_2004;harbison_2004;3 \n",
- "... ... \n",
- "9599 BrentLab/callingcards;annotated_features;804 \n",
- "9600 BrentLab/callingcards;annotated_features;805 \n",
- "9601 BrentLab/callingcards;annotated_features;808 \n",
- "9602 BrentLab/callingcards;annotated_features;806 \n",
- "9603 BrentLab/callingcards;annotated_features;809 \n",
+ " binding_id \\\n",
+ "0 BrentLab/harbison_2004;harbison_2004;105 \n",
+ "1 BrentLab/harbison_2004;harbison_2004;108 \n",
+ "2 BrentLab/harbison_2004;harbison_2004;109 \n",
+ "3 BrentLab/harbison_2004;harbison_2004;112 \n",
+ "4 BrentLab/harbison_2004;harbison_2004;113 \n",
+ "... ... \n",
+ "29799 BrentLab/callingcards;annotated_features_combi... \n",
+ "29800 BrentLab/callingcards;annotated_features_combi... \n",
+ "29801 BrentLab/callingcards;annotated_features_combi... \n",
+ "29802 BrentLab/callingcards;annotated_features_combi... \n",
+ "29803 BrentLab/callingcards;annotated_features_combi... \n",
"\n",
- " perturbation_id binding_rank_threshold \\\n",
- "0 BrentLab/Hackett_2020;hackett_2020;85 2.0 \n",
- "1 BrentLab/Hackett_2020;hackett_2020;83 NaN \n",
- "2 BrentLab/Hackett_2020;hackett_2020;84 2.0 \n",
- "3 BrentLab/Hackett_2020;hackett_2020;78 487.0 \n",
- "4 BrentLab/Hackett_2020;hackett_2020;81 NaN \n",
- "... ... ... \n",
- "9599 BrentLab/kemmeren_2014;kemmeren_2014;901 14.0 \n",
- "9600 BrentLab/kemmeren_2014;kemmeren_2014;1053 18.0 \n",
- "9601 BrentLab/kemmeren_2014;kemmeren_2014;218 20.0 \n",
- "9602 BrentLab/kemmeren_2014;kemmeren_2014;1023 10.0 \n",
- "9603 BrentLab/kemmeren_2014;kemmeren_2014;913 150.0 \n",
+ " perturbation_id binding_rank_threshold \\\n",
+ "0 BrentLab/hughes_2006;overexpression;10 11.0 \n",
+ "1 BrentLab/hughes_2006;overexpression;11 60.0 \n",
+ "2 BrentLab/hughes_2006;overexpression;11 27.0 \n",
+ "3 BrentLab/hughes_2006;overexpression;12 532.0 \n",
+ "4 BrentLab/hughes_2006;overexpression;12 10.0 \n",
+ "... ... ... \n",
+ "29799 BrentLab/kemmeren_2014;kemmeren_2014;784 154.0 \n",
+ "29800 BrentLab/kemmeren_2014;kemmeren_2014;666 215.0 \n",
+ "29801 BrentLab/kemmeren_2014;kemmeren_2014;271 221.0 \n",
+ "29802 BrentLab/kemmeren_2014;kemmeren_2014;1077 281.0 \n",
+ "29803 BrentLab/kemmeren_2014;kemmeren_2014;963 526.0 \n",
"\n",
- " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n",
- "0 2.0 3.0 2.0 \n",
- "1 NaN NaN NaN \n",
- "2 1.0 3.0 1.0 \n",
- "3 96.0 479.0 92.0 \n",
- "4 NaN NaN NaN \n",
- "... ... ... ... \n",
- "9599 39.0 13.0 39.0 \n",
- "9600 278.0 17.0 171.0 \n",
- "9601 57.0 19.0 27.0 \n",
- "9602 9.0 11.0 9.0 \n",
- "9603 221.0 140.0 206.0 \n",
+ " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n",
+ "0 206.0 12.0 206.0 \n",
+ "1 67.0 60.0 67.0 \n",
+ "2 1265.0 27.0 1265.0 \n",
+ "3 1093.0 532.0 1093.0 \n",
+ "4 556.0 10.0 556.0 \n",
+ "... ... ... ... \n",
+ "29799 905.0 154.0 905.0 \n",
+ "29800 108.0 215.0 108.0 \n",
+ "29801 925.0 221.0 925.0 \n",
+ "29802 73.0 283.0 77.0 \n",
+ "29803 227.0 527.0 227.0 \n",
"\n",
- " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n",
- "0 0.000225 0.004 harbison_2004-harbison_2004 \n",
- "1 NaN NaN harbison_2004-harbison_2004 \n",
- "2 0.000000 0.011 harbison_2004-harbison_2004 \n",
- "3 0.412192 0.576 harbison_2004-harbison_2004 \n",
- "4 NaN NaN harbison_2004-harbison_2004 \n",
- "... ... ... ... \n",
- "9599 0.000879 0.000 callingcards-annotated_features \n",
- "9600 0.001455 0.000 callingcards-annotated_features \n",
- "9601 0.003116 0.000 callingcards-annotated_features \n",
- "9602 0.000000 0.000 callingcards-annotated_features \n",
- "9603 0.116890 0.000 callingcards-annotated_features \n",
+ " dto_fdr dto_empirical_pvalue pr_ranking_column \\\n",
+ "0 0.041293 0.017 log2fc \n",
+ "1 0.054284 0.000 log2fc \n",
+ "2 0.123214 0.057 log2fc \n",
+ "3 0.436305 0.092 log2fc \n",
+ "4 0.017567 0.002 log2fc \n",
+ "... ... ... ... \n",
+ "29799 0.090665 0.000 pvalue \n",
+ "29800 0.075036 0.005 pvalue \n",
+ "29801 0.403484 0.126 pvalue \n",
+ "29802 0.095948 0.174 pvalue \n",
+ "29803 0.064919 0.000 pvalue \n",
"\n",
- " perturbation_repo_dataset binding_id_id \\\n",
- "0 Hackett_2020-hackett_2020 3 \n",
- "1 Hackett_2020-hackett_2020 3 \n",
- "2 Hackett_2020-hackett_2020 3 \n",
- "3 Hackett_2020-hackett_2020 4 \n",
- "4 Hackett_2020-hackett_2020 3 \n",
- "... ... ... \n",
- "9599 kemmeren_2014-kemmeren_2014 804 \n",
- "9600 kemmeren_2014-kemmeren_2014 805 \n",
- "9601 kemmeren_2014-kemmeren_2014 808 \n",
- "9602 kemmeren_2014-kemmeren_2014 806 \n",
- "9603 kemmeren_2014-kemmeren_2014 809 \n",
+ " binding_repo_dataset perturbation_repo_dataset \\\n",
+ "0 harbison_2004-harbison_2004 hughes_2006-overexpression \n",
+ "1 harbison_2004-harbison_2004 hughes_2006-overexpression \n",
+ "2 harbison_2004-harbison_2004 hughes_2006-overexpression \n",
+ "3 harbison_2004-harbison_2004 hughes_2006-overexpression \n",
+ "4 harbison_2004-harbison_2004 hughes_2006-overexpression \n",
+ "... ... ... \n",
+ "29799 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n",
+ "29800 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n",
+ "29801 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n",
+ "29802 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n",
+ "29803 callingcards-annotated_features_combined kemmeren_2014-kemmeren_2014 \n",
"\n",
- " binding_id_source perturbation_id_id \\\n",
- "0 harbison 85 \n",
- "1 harbison 83 \n",
- "2 harbison 84 \n",
- "3 harbison 78 \n",
- "4 harbison 81 \n",
- "... ... ... \n",
- "9599 BrentLab/callingcards;annotated_features 901 \n",
- "9600 BrentLab/callingcards;annotated_features 1053 \n",
- "9601 BrentLab/callingcards;annotated_features 218 \n",
- "9602 BrentLab/callingcards;annotated_features 1023 \n",
- "9603 BrentLab/callingcards;annotated_features 913 \n",
+ " binding_id_id binding_id_source \\\n",
+ "0 105 harbison \n",
+ "1 108 harbison \n",
+ "2 109 harbison \n",
+ "3 112 harbison \n",
+ "4 113 harbison \n",
+ "... ... ... \n",
+ "29799 724-692-688 BrentLab/callingcards;annotated_features_combined \n",
+ "29800 725-435-395 BrentLab/callingcards;annotated_features_combined \n",
+ "29801 726-445-424 BrentLab/callingcards;annotated_features_combined \n",
+ "29802 79-33 BrentLab/callingcards;annotated_features_combined \n",
+ "29803 96-49 BrentLab/callingcards;annotated_features_combined \n",
"\n",
- " perturbation_id_source \n",
- "0 BrentLab/Hackett_2020;hackett_2020 \n",
- "1 BrentLab/Hackett_2020;hackett_2020 \n",
- "2 BrentLab/Hackett_2020;hackett_2020 \n",
- "3 BrentLab/Hackett_2020;hackett_2020 \n",
- "4 BrentLab/Hackett_2020;hackett_2020 \n",
- "... ... \n",
- "9599 kemmeren \n",
- "9600 kemmeren \n",
- "9601 kemmeren \n",
- "9602 kemmeren \n",
- "9603 kemmeren \n",
+ " perturbation_id_id perturbation_id_source \n",
+ "0 10 BrentLab/hughes_2006;overexpression \n",
+ "1 11 BrentLab/hughes_2006;overexpression \n",
+ "2 11 BrentLab/hughes_2006;overexpression \n",
+ "3 12 BrentLab/hughes_2006;overexpression \n",
+ "4 12 BrentLab/hughes_2006;overexpression \n",
+ "... ... ... \n",
+ "29799 784 kemmeren \n",
+ "29800 666 kemmeren \n",
+ "29801 271 kemmeren \n",
+ "29802 1077 kemmeren \n",
+ "29803 963 kemmeren \n",
"\n",
- "[9604 rows x 14 columns]"
+ "[29804 rows x 15 columns]"
]
},
- "execution_count": 19,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -4926,7 +4937,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 19,
"id": "cell-25",
"metadata": {},
"outputs": [
@@ -4942,53 +4953,53 @@
"4 448 ACA1 15.0 ZEV \n",
"\n",
" binding_id \\\n",
- "0 BrentLab/callingcards;annotated_features;803 \n",
+ "0 BrentLab/callingcards;annotated_features;146 \n",
"1 BrentLab/callingcards;annotated_features;156 \n",
- "2 BrentLab/callingcards;annotated_features;126 \n",
- "3 BrentLab/callingcards;annotated_features;189 \n",
- "4 BrentLab/callingcards;annotated_features;146 \n",
+ "2 BrentLab/harbison_2004;harbison_2004;88 \n",
+ "3 BrentLab/callingcards;annotated_features;146 \n",
+ "4 BrentLab/callingcards;annotated_features;803 \n",
"\n",
" perturbation_id binding_rank_threshold \\\n",
- "0 BrentLab/Hackett_2020;hackett_2020;448 112.0 \n",
- "1 BrentLab/Hackett_2020;hackett_2020;448 31.0 \n",
- "2 BrentLab/Hackett_2020;hackett_2020;448 21.0 \n",
- "3 BrentLab/Hackett_2020;hackett_2020;448 164.0 \n",
- "4 BrentLab/Hackett_2020;hackett_2020;448 23.0 \n",
+ "0 BrentLab/hackett_2020;hackett_2020;448 452.0 \n",
+ "1 BrentLab/hackett_2020;hackett_2020;448 296.0 \n",
+ "2 BrentLab/hackett_2020;hackett_2020;448 122.0 \n",
+ "3 BrentLab/hackett_2020;hackett_2020;448 35.0 \n",
+ "4 BrentLab/hackett_2020;hackett_2020;448 544.0 \n",
"\n",
" perturbation_rank_threshold binding_set_size perturbation_set_size \\\n",
- "0 98.0 108.0 90.0 \n",
- "1 98.0 26.0 90.0 \n",
- "2 98.0 17.0 90.0 \n",
- "3 154.0 150.0 144.0 \n",
- "4 98.0 18.0 90.0 \n",
+ "0 1.0 454.0 5591.0 \n",
+ "1 346.0 297.0 346.0 \n",
+ "2 218.0 122.0 218.0 \n",
+ "3 407.0 35.0 407.0 \n",
+ "4 1.0 544.0 5591.0 \n",
"\n",
- " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n",
- "0 0.187319 0.074 callingcards-annotated_features \n",
- "1 0.072561 0.047 callingcards-annotated_features \n",
- "2 0.061941 0.071 callingcards-annotated_features \n",
- "3 0.213716 0.011 callingcards-annotated_features \n",
- "4 0.066616 0.171 callingcards-annotated_features \n",
+ " dto_fdr dto_empirical_pvalue pr_ranking_column \\\n",
+ "0 0.000000 1.000 pvalue \n",
+ "1 0.277211 0.000 log2fc \n",
+ "2 0.612736 0.917 log2fc \n",
+ "3 0.116834 0.000 log2fc \n",
+ "4 0.000000 1.000 pvalue \n",
"\n",
- " perturbation_repo_dataset binding_id_id \\\n",
- "0 Hackett_2020-hackett_2020 803 \n",
- "1 Hackett_2020-hackett_2020 156 \n",
- "2 Hackett_2020-hackett_2020 126 \n",
- "3 Hackett_2020-hackett_2020 189 \n",
- "4 Hackett_2020-hackett_2020 146 \n",
+ " binding_repo_dataset perturbation_repo_dataset binding_id_id \\\n",
+ "0 callingcards-annotated_features hackett_2020-hackett_2020 146 \n",
+ "1 callingcards-annotated_features hackett_2020-hackett_2020 156 \n",
+ "2 harbison_2004-harbison_2004 hackett_2020-hackett_2020 88 \n",
+ "3 callingcards-annotated_features hackett_2020-hackett_2020 146 \n",
+ "4 callingcards-annotated_features hackett_2020-hackett_2020 803 \n",
"\n",
" binding_id_source perturbation_id_id \\\n",
"0 BrentLab/callingcards;annotated_features 448 \n",
"1 BrentLab/callingcards;annotated_features 448 \n",
- "2 BrentLab/callingcards;annotated_features 448 \n",
+ "2 harbison 448 \n",
"3 BrentLab/callingcards;annotated_features 448 \n",
"4 BrentLab/callingcards;annotated_features 448 \n",
"\n",
- " perturbation_id_source \n",
- "0 BrentLab/Hackett_2020;hackett_2020 \n",
- "1 BrentLab/Hackett_2020;hackett_2020 \n",
- "2 BrentLab/Hackett_2020;hackett_2020 \n",
- "3 BrentLab/Hackett_2020;hackett_2020 \n",
- "4 BrentLab/Hackett_2020;hackett_2020 \n"
+ " perturbation_id_source \n",
+ "0 hackett \n",
+ "1 hackett \n",
+ "2 hackett \n",
+ "3 hackett \n",
+ "4 hackett \n"
]
}
],
@@ -5016,7 +5027,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 20,
"id": "cell-26",
"metadata": {},
"outputs": [],
diff --git a/docs/virtual_db.md b/docs/virtual_db.md
index 3ec4b45..9062618 100644
--- a/docs/virtual_db.md
+++ b/docs/virtual_db.md
@@ -23,6 +23,66 @@ For comparative analysis datasets, VirtualDB creates:
See the [configuration guide](virtual_db_configuration.md) for setup details
and the [tutorial](tutorials/virtual_db_tutorial.ipynb) for usage examples.
+## Advanced Usage
+
+After any public method is called (e.g. `vdb.tables()`), the underlying DuckDB
+connection is available as `vdb._db`. You can use `_db` to execute any SQL
+on the database, e.g. creating additional views or creating a table in memory.
+
+Custom **views** created this way appear in `tables()`, `describe()`, and
+`get_fields()` automatically because those methods query DuckDB's
+`information_schema`. Custom **tables** do not appear in `tables()` (which
+only lists views), but are fully queryable via `vdb.query()`.
+
+Call at least one public method first to ensure the connection is initialized
+before accessing `_db` directly.
+
+Example -- create a materialized analysis table:
+
+ # Trigger view registration
+ vdb.tables()
+
+ # Create an in-memory table (connection lifetime only) from a complex query.
+ # This example selects one "best" Hackett-2020 sample per regulator
+ # using a priority system: ZEV+P > GEV+P > GEV+M.
+ vdb._db.execute("""
+ CREATE OR REPLACE TABLE hackett_analysis_set AS
+ WITH regulator_tiers AS (
+ SELECT
+ regulator_locus_tag,
+ CASE
+ WHEN BOOL_OR(mechanism = 'ZEV' AND restriction = 'P') THEN 1
+ WHEN BOOL_OR(mechanism = 'GEV' AND restriction = 'P') THEN 2
+ ELSE 3
+ END AS tier
+ FROM hackett_meta
+ WHERE regulator_locus_tag NOT IN ('Z3EV', 'GEV')
+ GROUP BY regulator_locus_tag
+ ),
+ tier_filter AS (
+ SELECT
+ h.sample_id, h.regulator_locus_tag, h.regulator_symbol,
+ h.mechanism, h.restriction, h.date, h.strain, t.tier
+ FROM hackett_meta h
+ JOIN regulator_tiers t USING (regulator_locus_tag)
+ WHERE
+ (t.tier = 1 AND h.mechanism = 'ZEV' AND h.restriction = 'P')
+ OR (t.tier = 2 AND h.mechanism = 'GEV' AND h.restriction = 'P')
+ OR (t.tier = 3 AND h.mechanism = 'GEV' AND h.restriction = 'M')
+ )
+ SELECT DISTINCT
+ sample_id, regulator_locus_tag, regulator_symbol,
+ mechanism, restriction, date, strain
+ FROM tier_filter
+ WHERE regulator_symbol NOT IN ('GCN4', 'RDS2', 'SWI1', 'MAC1')
+ ORDER BY regulator_locus_tag, sample_id
+ """)
+
+ df = vdb.query("SELECT * FROM hackett_analysis_set")
+
+Tables and views created this way are in-memory only and do not persist across
+VirtualDB instances. They exist for the lifetime of the DuckDB connection.
+
## API Reference
::: tfbpapi.virtual_db.VirtualDB
diff --git a/docs/virtual_db_configuration.md b/docs/virtual_db_configuration.md
index 45320d4..42316d5 100644
--- a/docs/virtual_db_configuration.md
+++ b/docs/virtual_db_configuration.md
@@ -10,8 +10,10 @@ levels.
repositories:
# Each repository defines a "table" in the virtual database
BrentLab/harbison_2004:
- # REQUIRED: Specify which field is the sample identifier. At this level, it means
- # that all datasets have a field `sample_id` that uniquely identifies samples.
+ # REQUIRED: Specify which column is the sample identifier. The `field`
+ # value is the actual column name in the parquet data. At the repo level,
+ # it applies to all datasets in this repository. If not specified at
+ # either level, the default column name "sample_id" is assumed.
sample_id:
field: sample_id
# Repository-wide properties (apply to all datasets in this repository)
@@ -47,8 +49,9 @@ repositories:
kemmeren_2014:
# optional -- see the note for `db_name` in harbison above
db_name: kemmeren
- # REQUIRED: If `sample_id` isn't defined at the repo level, then it must be
- # defined at the dataset level for each dataset in the repo
+ # REQUIRED: If `sample_id` isn't defined at the repo level, it must be
+ # defined at the dataset level. The `field` value is the actual column
+ # name in the parquet data (does not need to be literally "sample_id").
sample_id:
field: sample_id
# Same logical fields, different physical paths
@@ -144,6 +147,62 @@ during metadata extraction and query filtering.
2. **Type consistency**: When source data might be extracted with incorrect type
3. **Performance**: Helps with query optimization and prevents type mismatches
+## Tags
+
+Tags are arbitrary string key/value pairs for annotating datasets. They follow
+the same hierarchy as property mappings: repo-level tags apply to all datasets
+in the repository, dataset-level tags apply only to that dataset, and
+dataset-level tags override repo-level tags with the same key.
+
+```yaml
+repositories:
+ BrentLab/harbison_2004:
+ # Repo-level tags apply to all datasets in this repository
+ tags:
+ assay: binding
+ organism: yeast
+ dataset:
+ harbison_2004:
+ sample_id:
+ field: sample_id
+ # Dataset-level tags override repo-level tags with the same key
+ tags:
+ assay: chip-chip
+
+ BrentLab/kemmeren_2014:
+ tags:
+ assay: perturbation
+ organism: yeast
+ dataset:
+ kemmeren_2014:
+ sample_id:
+ field: sample_id
+```
+
+Access merged tags via `vdb.get_tags(db_name)`, identifying datasets by
+their name as it appears in `vdb.tables()`:
+
+```python
+from tfbpapi.virtual_db import VirtualDB
+
+vdb = VirtualDB("datasets.yaml")
+
+# Returns {"assay": "chip-chip", "organism": "yeast"}
+# (dataset-level assay overrides repo-level)
+vdb.get_tags("harbison")
+
+# Returns {"assay": "perturbation", "organism": "yeast"}
+vdb.get_tags("kemmeren")
+```
+
+The underlying `MetadataConfig` (available as `vdb.config`) exposes the same
+data via `(repo_id, config_name)` pairs for programmatic or developer use:
+
+```python
+# Equivalent to vdb.get_tags("harbison") above
+vdb.config.get_tags("BrentLab/harbison_2004", "harbison_2004")
+```
+
## Comparative Datasets
Comparative datasets differ from other dataset types in that they represent
@@ -152,9 +211,10 @@ Each row relates 2+ samples from other datasets.
### Structure
-Comparative datasets use `source_sample` fields instead of a single `sample_id`:
+Comparative datasets use `source_sample` fields instead of a single sample
+identifier column:
- Multiple fields with `role: source_sample`
-- Each contains composite identifier: `"repo_id;config_name;sample_id"`
+- Each contains composite identifier: `"repo_id;config_name;sample_id_value"`
- Example: `binding_id = "BrentLab/harbison_2004;harbison_2004;42"`
### Fields
@@ -206,10 +266,11 @@ build on each other. Using `harbison` as an example primary dataset and
**1. Metadata view**
-One row per unique `sample_id`. Derived columns from the configuration
-(e.g., `carbon_source`, `temperature_celsius`) are resolved here using
-datacard definitions, factor aliases, and missing value labels. This is
-the primary view for querying sample-level metadata.
+One row per unique sample identifier (the column configured via
+`sample_id: {field: <column_name>}`). Derived columns from the
+configuration (e.g., `carbon_source`, `temperature_celsius`) are resolved
+here using datacard definitions, factor aliases, and missing value labels.
+This is the primary view for querying sample-level metadata.
**2. Raw data view**
@@ -239,7 +300,7 @@ or filter by source dataset without parsing composite IDs in SQL.
```
__harbison_parquet (raw parquet, not directly exposed)
|
- +-> harbison_meta (deduplicated, one row per sample_id,
+ +-> harbison_meta (deduplicated, one row per sample identifier,
| with derived columns from config)
|
+-> harbison (full parquet joined to harbison_meta)
diff --git a/tfbpapi/datacard.py b/tfbpapi/datacard.py
index abf94dd..734a5f3 100644
--- a/tfbpapi/datacard.py
+++ b/tfbpapi/datacard.py
@@ -17,6 +17,7 @@
"""
import logging
+from dataclasses import dataclass
from typing import Any
from pydantic import ValidationError
@@ -36,6 +37,34 @@
)
+@dataclass
+class DatasetSchema:
+ """
+ Complete schema summary for a data configuration.
+
+ Derived entirely from the DataCard YAML -- no DuckDB introspection needed. Used by
+ VirtualDB to determine column partitioning between data and metadata parquets.
+
+ :ivar data_columns: Column names present in the data parquet.
+ :ivar metadata_columns: Column names that are metadata.
+ :ivar join_columns: Columns common to both data and metadata parquets (used as JOIN
+ keys for external metadata). Empty for embedded metadata (same parquet, no JOIN
+ needed).
+ :ivar metadata_source: One of ``"embedded"``, ``"external"``, or ``"none"``.
+ :ivar external_metadata_config: Config name of the external metadata config, or
+ ``None`` if metadata is embedded or absent.
+ :ivar is_partitioned: Whether the data parquet is partitioned.
+
+ """
+
+ data_columns: set[str]
+ metadata_columns: set[str]
+ join_columns: set[str]
+ metadata_source: str
+ external_metadata_config: str | None
+ is_partitioned: bool
+
+
class DataCard:
"""
Parser and explorer for HuggingFace dataset metadata.
@@ -91,6 +120,7 @@ def __init__(self, repo_id: str, token: str | None = None):
# Cache for parsed card
self._dataset_card: DatasetCard | None = None
self._metadata_cache: dict[str, list[ExtractedMetadata]] = {}
+ self._metadata_fields_map: dict[str, list[str]] = {}
@property
def dataset_card(self) -> DatasetCard:
@@ -115,6 +145,7 @@ def _load_and_validate_card(self) -> None:
# Validate using Pydantic model
self._dataset_card = DatasetCard(**card_data)
+ self._build_metadata_fields_map()
self.logger.debug(f"Successfully validated dataset card for {self.repo_id}")
except ValidationError as e:
@@ -241,6 +272,186 @@ def get_metadata_relationships(
return relationships
+ def _build_metadata_fields_map(self) -> None:
+ """
+ Build a mapping from data config names to their metadata fields.
+
+ Called during card loading. For each data config, resolves metadata
+ fields from two sources:
+
+ 1. Embedded: the data config has ``metadata_fields`` listing which
+ of its own columns are metadata.
+ 2. External: a separate metadata-type config has ``applies_to``
+ including this config name. The metadata fields are the feature
+ names from that metadata config.
+
+ Embedded takes priority. For external, the first matching metadata
+ config wins.
+
+ """
+ assert self._dataset_card is not None
+ self._metadata_fields_map = {}
+ meta_configs = self._dataset_card.get_metadata_configs()
+
+ for data_cfg in self._dataset_card.get_data_configs():
+ name = data_cfg.config_name
+ # Embedded case
+ if data_cfg.metadata_fields:
+ self._metadata_fields_map[name] = list(data_cfg.metadata_fields)
+ continue
+ # External case: find metadata config with applies_to
+ for meta_cfg in meta_configs:
+ if meta_cfg.applies_to and name in meta_cfg.applies_to:
+ self._metadata_fields_map[name] = [
+ f.name for f in meta_cfg.dataset_info.features
+ ]
+ break
+ else:
+ self.logger.warning(
+ "No metadata fields found for data config '%s' "
+ "in repo '%s' -- no embedded metadata_fields and "
+ "no metadata config with applies_to",
+ name,
+ self.repo_id,
+ )
+
+ def get_metadata_fields(self, config_name: str) -> list[str] | None:
+ """
+ Get metadata field names for a data configuration.
+
+ Returns pre-computed metadata fields resolved during card loading.
+ Handles both embedded metadata (``metadata_fields`` on the data
+ config) and external metadata (separate metadata config with
+ ``applies_to``).
+
+ :param config_name: Name of the data configuration
+ :return: List of metadata field names, or None if no metadata
+
+ """
+ # Ensure card is loaded (triggers _build_metadata_fields_map)
+ _ = self.dataset_card
+ return self._metadata_fields_map.get(config_name)
+
+ def get_data_col_names(self, config_name: str) -> set[str]:
+ """
+ Return the column names from the data config's feature list.
+
+ These are the columns present in the data parquet file, derived directly from
+ the DataCard feature definitions without any DuckDB introspection.
+
+ :param config_name: Name of the data configuration
+ :return: Set of column names, empty if config not found
+
+ """
+ _ = self.dataset_card # ensure loaded
+ config = self.get_config(config_name)
+ if not config:
+ return set()
+ return {f.name for f in config.dataset_info.features}
+
+ def get_metadata_config_name(self, config_name: str) -> str | None:
+ """
+ Return the config_name of the external metadata config, if any.
+
+ If the data config has embedded ``metadata_fields``, or if no
+ metadata config with ``applies_to`` references this config,
+ returns None.
+
+ :param config_name: Name of the data configuration
+ :return: The metadata config name, or None
+
+ """
+ _ = self.dataset_card # ensure loaded
+ data_cfg = self.get_config(config_name)
+ if not data_cfg:
+ return None
+ # Embedded metadata -- no external config needed
+ if data_cfg.metadata_fields:
+ return None
+ # Find external metadata config with applies_to
+ for meta_cfg in self.dataset_card.get_metadata_configs():
+ if meta_cfg.applies_to and config_name in meta_cfg.applies_to:
+ return meta_cfg.config_name
+ return None
+
+ def get_dataset_schema(self, config_name: str) -> DatasetSchema | None:
+ """
+ Return schema summary for a data configuration.
+
+ Determines whether metadata is embedded or external, which
+ columns belong to data vs metadata parquets, and which columns
+ are shared between them (join keys for external metadata).
+ All information is derived from the DataCard YAML -- no DuckDB
+ introspection is needed.
+
+ :param config_name: Name of the data configuration
+ :return: DatasetSchema instance, or None if config not found
+
+ Example -- embedded metadata::
+
+ schema = card.get_dataset_schema("harbison_2004")
+ # schema.metadata_source == "embedded"
+ # schema.join_columns == set() (same parquet, no JOIN)
+
+ Example -- external metadata::
+
+ schema = card.get_dataset_schema("annotated_features")
+ # schema.metadata_source == "external"
+ # schema.external_metadata_config == "annotated_features_meta"
+ # schema.join_columns == {"id"} (common to both parquets)
+
+ """
+ _ = self.dataset_card # ensure loaded
+ config = self.get_config(config_name)
+ if not config:
+ return None
+
+ is_partitioned = bool(
+ config.dataset_info.partitioning
+ and config.dataset_info.partitioning.enabled
+ )
+
+ # Embedded: metadata_fields lists which of the config's own
+ # columns are metadata; all live in the same parquet
+ if config.metadata_fields:
+ all_cols = {f.name for f in config.dataset_info.features}
+ meta_cols = set(config.metadata_fields)
+ data_cols = all_cols - meta_cols
+ return DatasetSchema(
+ data_columns=data_cols,
+ metadata_columns=meta_cols,
+ join_columns=set(),
+ metadata_source="embedded",
+ external_metadata_config=None,
+ is_partitioned=is_partitioned,
+ )
+
+ # External: find metadata config with applies_to
+ for meta_cfg in self.dataset_card.get_metadata_configs():
+ if meta_cfg.applies_to and config_name in meta_cfg.applies_to:
+ data_cols = {f.name for f in config.dataset_info.features}
+ meta_cols = {f.name for f in meta_cfg.dataset_info.features}
+ join_cols = data_cols & meta_cols
+ return DatasetSchema(
+ data_columns=data_cols,
+ metadata_columns=meta_cols,
+ join_columns=join_cols,
+ metadata_source="external",
+ external_metadata_config=meta_cfg.config_name,
+ is_partitioned=is_partitioned,
+ )
+
+ # No metadata relationship -- treat all columns as data
+ all_cols = {f.name for f in config.dataset_info.features}
+ return DatasetSchema(
+ data_columns=all_cols,
+ metadata_columns=set(),
+ join_columns=set(),
+ metadata_source="none",
+ external_metadata_config=None,
+ is_partitioned=is_partitioned,
+ )
+
def get_repository_info(self) -> dict[str, Any]:
"""Get general repository information."""
card = self.dataset_card
@@ -315,12 +526,13 @@ def extract_metadata_schema(self, config_name: str) -> dict[str, Any]:
raise DataCardError(f"Configuration '{config_name}' not found")
schema: dict[str, Any] = {
- "regulator_fields": [], # Fields with role=regulator_identifier
- "target_fields": [], # Fields with role=target_identifier
- "condition_fields": [], # Fields with role=experimental_condition
- "condition_definitions": {}, # Field-level condition details
- "top_level_conditions": None, # Repo-level conditions
- "config_level_conditions": None, # Config-level conditions
+ "regulator_fields": [],
+ "target_fields": [],
+ "condition_fields": [],
+ "condition_definitions": {},
+ "metadata_fields": None,
+ "top_level_conditions": None,
+ "config_level_conditions": None,
}
for feature in config.dataset_info.features:
@@ -333,15 +545,32 @@ def extract_metadata_schema(self, config_name: str) -> dict[str, Any]:
if feature.definitions:
schema["condition_definitions"][feature.name] = feature.definitions
+ # Include features from external metadata config
+ meta_fields = self.get_metadata_fields(config_name)
+ schema["metadata_fields"] = meta_fields
+ if meta_fields is not None and not config.metadata_fields:
+ for meta_cfg in self.dataset_card.get_metadata_configs():
+ if meta_cfg.applies_to and config_name in meta_cfg.applies_to:
+ for feature in meta_cfg.dataset_info.features:
+ if feature.role == "regulator_identifier":
+ schema["regulator_fields"].append(feature.name)
+ elif feature.role == "target_identifier":
+ schema["target_fields"].append(feature.name)
+ elif feature.role == "experimental_condition":
+ schema["condition_fields"].append(feature.name)
+ if feature.definitions:
+ schema["condition_definitions"][
+ feature.name
+ ] = feature.definitions
+ break
+
# Add top-level conditions (applies to all configs/samples)
- # Stored in model_extra as dict
if self.dataset_card.model_extra:
top_level = self.dataset_card.model_extra.get("experimental_conditions")
if top_level:
schema["top_level_conditions"] = top_level
# Add config-level conditions (applies to this config's samples)
- # Stored in model_extra as dict
if config.model_extra:
config_level = config.model_extra.get("experimental_conditions")
if config_level:
diff --git a/tfbpapi/models.py b/tfbpapi/models.py
index a8660a1..4d77f02 100644
--- a/tfbpapi/models.py
+++ b/tfbpapi/models.py
@@ -458,6 +458,10 @@ class DatasetVirtualDBConfig(BaseModel):
description="For comparative datasets: map link_field -> "
"[repo_id, config_name] pairs",
)
+ tags: dict[str, str] = Field(
+ default_factory=dict,
+ description="Arbitrary key/value annotations for this dataset",
+ )
model_config = ConfigDict(extra="allow")
@@ -526,7 +530,7 @@ def parse_property_mappings(cls, data: Any) -> dict[str, Any]:
result = {}
for key, value in data.items():
# Known typed fields - let Pydantic handle them
- if key in ("sample_id", "links", "db_name"):
+ if key in ("sample_id", "links", "db_name", "tags"):
result[key] = value
# Dict values should be PropertyMappings
elif isinstance(value, dict):
@@ -591,6 +595,10 @@ class RepositoryConfig(BaseModel):
dataset: dict[str, DatasetVirtualDBConfig] | None = Field(
None, description="Dataset-specific configurations"
)
+ tags: dict[str, str] = Field(
+ default_factory=dict,
+ description="Arbitrary key/value annotations for all datasets in this repo",
+ )
@model_validator(mode="before")
@classmethod
@@ -628,10 +636,10 @@ def parse_structure(cls, data: Any) -> dict[str, Any]:
f"Invalid configuration for dataset '{dataset_name}': {e}"
) from e
- # Parse repo-wide properties (all keys except 'dataset')
+ # Parse repo-wide properties (all keys except 'dataset' and 'tags')
parsed_properties = {}
for key, value in data.items():
- if key == "dataset":
+ if key in ("dataset", "tags"):
continue
try:
@@ -639,7 +647,11 @@ def parse_structure(cls, data: Any) -> dict[str, Any]:
except Exception as e:
raise ValueError(f"Invalid repo-wide property '{key}': {e}") from e
- return {"properties": parsed_properties, "dataset": parsed_datasets}
+ return {
+ "properties": parsed_properties,
+ "dataset": parsed_datasets,
+ "tags": data.get("tags") or {},
+ }
class MetadataConfig(BaseModel):
@@ -876,3 +888,55 @@ def get_property_mappings(
mappings.update(dataset_config.property_mappings)
return mappings
+
+ def get_tags(self, repo_id: str, config_name: str) -> dict[str, str]:
+ """
+ Get merged tags for a repo/dataset combination.
+
+ Merges repo-level and dataset-level tags, with dataset-level tags taking
+ precedence for the same key.
+
+ :param repo_id: Repository ID
+ :param config_name: Dataset/config name
+ :return: Dict of merged tags
+
+ """
+ repo_config = self.get_repository_config(repo_id)
+ if not repo_config:
+ return {}
+
+ merged: dict[str, str] = dict(repo_config.tags)
+
+ if repo_config.dataset and config_name in repo_config.dataset:
+ merged.update(repo_config.dataset[config_name].tags)
+
+ return merged
+
+ def get_sample_id_field(self, repo_id: str, config_name: str) -> str:
+ """
+ Resolve the actual column name for the sample identifier.
+
+ Checks dataset-level ``sample_id`` first, then repo-level,
+ falling back to ``"sample_id"`` if neither is configured.
+
+ :param repo_id: Repository ID
+ :param config_name: Dataset/config name
+ :return: Column name for the sample identifier
+
+ """
+ repo_cfg = self.get_repository_config(repo_id)
+ if not repo_cfg:
+ return "sample_id"
+
+ # Dataset-level takes precedence
+ if repo_cfg.dataset and config_name in repo_cfg.dataset:
+ ds_cfg = repo_cfg.dataset[config_name]
+ if ds_cfg.sample_id is not None and ds_cfg.sample_id.field:
+ return ds_cfg.sample_id.field
+
+ # Repo-level fallback
+ repo_sample_id = repo_cfg.properties.get("sample_id")
+ if repo_sample_id is not None and repo_sample_id.field is not None:
+ return repo_sample_id.field
+
+ return "sample_id"
diff --git a/tfbpapi/tests/test_datacard.py b/tfbpapi/tests/test_datacard.py
index 5f098de..b9228d1 100644
--- a/tfbpapi/tests/test_datacard.py
+++ b/tfbpapi/tests/test_datacard.py
@@ -5,10 +5,80 @@
import pytest
from tfbpapi import DataCard
+from tfbpapi.datacard import DatasetSchema
from tfbpapi.errors import DataCardError, DataCardValidationError, HfDataFetchError
from tfbpapi.models import DatasetType
+def _external_metadata_card_data():
+ """Card data with external metadata (no embedded metadata_fields)."""
+ return {
+ "configs": [
+ {
+ "config_name": "coverage_data",
+ "description": "Coverage measurements",
+ "dataset_type": "genome_map",
+ "default": True,
+ "data_files": [{"split": "train", "path": "coverage.parquet"}],
+ "dataset_info": {
+ "features": [
+ {
+ "name": "sample_id",
+ "dtype": "integer",
+ "description": "Sample ID",
+ },
+ {
+ "name": "chr",
+ "dtype": "string",
+ "description": "Chromosome",
+ "role": "genomic_coordinate",
+ },
+ {
+ "name": "coverage",
+ "dtype": "float32",
+ "description": "Coverage value",
+ "role": "quantitative_measure",
+ },
+ ]
+ },
+ },
+ {
+ "config_name": "sample_metadata",
+ "description": "Sample metadata",
+ "dataset_type": "metadata",
+ "applies_to": ["coverage_data"],
+ "data_files": [{"split": "train", "path": "metadata.parquet"}],
+ "dataset_info": {
+ "features": [
+ {
+ "name": "sample_id",
+ "dtype": "integer",
+ "description": "Sample ID",
+ },
+ {
+ "name": "batch",
+ "dtype": "string",
+ "description": "Batch ID",
+ },
+ {
+ "name": "regulator_locus_tag",
+ "dtype": "string",
+ "description": "TF locus tag",
+ "role": "regulator_identifier",
+ },
+ {
+ "name": "regulator_symbol",
+ "dtype": "string",
+ "description": "TF symbol",
+ "role": "regulator_identifier",
+ },
+ ]
+ },
+ },
+ ],
+ }
+
+
class TestDataCard:
"""Test suite for DataCard class."""
@@ -30,6 +100,7 @@ def test_init(
assert datacard.token == test_token
assert datacard._dataset_card is None
assert datacard._metadata_cache == {}
+ assert datacard._metadata_fields_map == {}
# Check that fetchers were initialized
mock_card_fetcher.assert_called_once_with(token=test_token)
@@ -447,3 +518,393 @@ def test_extract_partition_values_fetch_error(
# Should return empty set on error
assert values == set()
+
+
+class TestGetMetadataFields:
+ """Tests for DataCard.get_metadata_fields()."""
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_embedded_metadata_fields(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Embedded metadata_fields on the data config are returned."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_metadata_fields("binding_data")
+
+ assert result == ["regulator_symbol", "experimental_condition"]
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_external_metadata_fields(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ ):
+ """External metadata via applies_to returns feature names."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = _external_metadata_card_data()
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_metadata_fields("coverage_data")
+
+ assert result == [
+ "sample_id",
+ "batch",
+ "regulator_locus_tag",
+ "regulator_symbol",
+ ]
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_no_metadata_returns_none(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Config with no metadata returns None."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_metadata_fields("genomic_features")
+
+ assert result is None
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_unknown_config_returns_none(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Unknown config name returns None."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_metadata_fields("nonexistent")
+
+ assert result is None
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_extract_schema_includes_external_features(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ ):
+ """extract_metadata_schema includes roles from external metadata."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = _external_metadata_card_data()
+
+ datacard = DataCard(test_repo_id)
+ schema = datacard.extract_metadata_schema("coverage_data")
+
+ # External metadata features with role=regulator_identifier
+ assert "regulator_locus_tag" in schema["regulator_fields"]
+ assert "regulator_symbol" in schema["regulator_fields"]
+ # metadata_fields key populated
+ assert schema["metadata_fields"] is not None
+ assert "sample_id" in schema["metadata_fields"]
+
+
+class TestGetMetadataConfigName:
+ """Tests for DataCard.get_metadata_config_name()."""
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_external_metadata_returns_config_name(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ ):
+ """Returns metadata config name when applies_to matches."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = _external_metadata_card_data()
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_metadata_config_name("coverage_data")
+
+ assert result == "sample_metadata"
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_embedded_metadata_returns_none(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Returns None when metadata is embedded."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_metadata_config_name("binding_data")
+
+ assert result is None
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_unknown_config_returns_none(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Returns None for unknown config name."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_metadata_config_name("nonexistent")
+
+ assert result is None
+
+
+class TestGetDataColNames:
+ """Tests for DataCard.get_data_col_names()."""
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_returns_feature_names(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Returns column names from the data config's features."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_data_col_names("binding_data")
+
+ # binding_data features: regulator_symbol, target_gene,
+ # experimental_condition, binding_score
+ assert isinstance(result, set)
+ assert result == {
+ "regulator_symbol",
+ "target_gene",
+ "experimental_condition",
+ "binding_score",
+ }
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_external_metadata_config_returns_data_features(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ ):
+ """For external metadata, returns data config features only."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = _external_metadata_card_data()
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_data_col_names("coverage_data")
+
+ # coverage_data features: sample_id, chr, coverage
+ assert result == {"sample_id", "chr", "coverage"}
+ # Must NOT include metadata-only columns
+ assert "batch" not in result
+ assert "regulator_locus_tag" not in result
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_unknown_config_returns_empty_set(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Returns empty set for unknown config name."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_data_col_names("nonexistent")
+
+ assert result == set()
+
+
+class TestGetDatasetSchema:
+ """Tests for DataCard.get_dataset_schema()."""
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_embedded_metadata_returns_correct_schema(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Embedded metadata produces correct data/metadata column split."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ # binding_data has metadata_fields: [regulator_symbol,
+ # experimental_condition] and features: regulator_symbol,
+ # target_gene, experimental_condition, binding_score
+ result = datacard.get_dataset_schema("binding_data")
+
+ assert result is not None
+ assert isinstance(result, DatasetSchema)
+ assert result.metadata_source == "embedded"
+ assert result.external_metadata_config is None
+ assert result.join_columns == set()
+ assert result.metadata_columns == {
+ "regulator_symbol",
+ "experimental_condition",
+ }
+ # data_columns = all features minus metadata_columns
+ assert result.data_columns == {
+ "target_gene",
+ "binding_score",
+ }
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_external_metadata_returns_correct_schema(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ ):
+ """External metadata produces correct split and join columns."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = _external_metadata_card_data()
+
+ datacard = DataCard(test_repo_id)
+ # coverage_data features: sample_id, chr, coverage
+ # sample_metadata features: sample_id, batch, regulator_locus_tag,
+ # regulator_symbol
+ # join_columns = intersection = {sample_id}
+ result = datacard.get_dataset_schema("coverage_data")
+
+ assert result is not None
+ assert result.metadata_source == "external"
+ assert result.external_metadata_config == "sample_metadata"
+ assert result.data_columns == {"sample_id", "chr", "coverage"}
+ assert result.metadata_columns == {
+ "sample_id",
+ "batch",
+ "regulator_locus_tag",
+ "regulator_symbol",
+ }
+ assert result.join_columns == {"sample_id"}
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_no_metadata_returns_all_cols_as_data(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Config with no metadata relationship has all cols as data."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ # genomic_features has no metadata_fields and no applies_to
+ result = datacard.get_dataset_schema("genomic_features")
+
+ assert result is not None
+ assert result.metadata_source == "none"
+ assert result.external_metadata_config is None
+ assert result.metadata_columns == set()
+ assert result.join_columns == set()
+ assert result.data_columns == {
+ "gene_id",
+ "gene_symbol",
+ "chromosome",
+ "start",
+ "end",
+ }
+
+ @patch("tfbpapi.datacard.HfDataCardFetcher")
+ @patch("tfbpapi.datacard.HfRepoStructureFetcher")
+ @patch("tfbpapi.datacard.HfSizeInfoFetcher")
+ def test_unknown_config_returns_none(
+ self,
+ mock_size_fetcher,
+ mock_structure_fetcher,
+ mock_card_fetcher,
+ test_repo_id,
+ sample_dataset_card_data,
+ ):
+ """Returns None for an unknown config name."""
+ mock_fetcher_instance = Mock()
+ mock_card_fetcher.return_value = mock_fetcher_instance
+ mock_fetcher_instance.fetch.return_value = sample_dataset_card_data
+
+ datacard = DataCard(test_repo_id)
+ result = datacard.get_dataset_schema("nonexistent")
+
+ assert result is None
diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py
index e62b840..cb64592 100644
--- a/tfbpapi/tests/test_virtual_db.py
+++ b/tfbpapi/tests/test_virtual_db.py
@@ -14,6 +14,8 @@
import pytest
import yaml # type: ignore
+from tfbpapi.datacard import DatasetSchema
+from tfbpapi.models import MetadataConfig
from tfbpapi.virtual_db import VirtualDB
# ------------------------------------------------------------------
@@ -313,27 +315,87 @@ def _make_mock_datacard(repo_id):
card.get_config.return_value = config_mock
card.get_field_definitions.return_value = HARBISON_CONDITION_DEFS
card.get_experimental_conditions.return_value = {}
+ card.get_metadata_fields.return_value = METADATA_FIELDS["harbison_2004"]
+ card.get_metadata_config_name.return_value = None
+ # Harbison: embedded metadata, condition is data col used for
+ # derived properties; metadata_cols are the three metadata fields
+ harbison_meta_cols = set(METADATA_FIELDS["harbison_2004"])
+ harbison_data_cols = {
+ "sample_id",
+ "condition",
+ "target_locus_tag",
+ "effect",
+ "pvalue",
+ } - harbison_meta_cols
+ card.get_data_col_names.return_value = {
+ "sample_id",
+ "regulator_locus_tag",
+ "regulator_symbol",
+ "condition",
+ "target_locus_tag",
+ "effect",
+ "pvalue",
+ }
+ card.get_dataset_schema.return_value = DatasetSchema(
+ data_columns=harbison_data_cols
+ | {
+ "sample_id",
+ "condition",
+ "target_locus_tag",
+ "effect",
+ "pvalue",
+ },
+ metadata_columns=harbison_meta_cols,
+ join_columns=set(),
+ metadata_source="embedded",
+ external_metadata_config=None,
+ is_partitioned=False,
+ )
elif repo_id == "BrentLab/kemmeren":
config_mock = MagicMock()
config_mock.metadata_fields = METADATA_FIELDS["kemmeren_2014"]
- # model_extra at config level (no experimental_conditions
- # at this level for kemmeren)
config_mock.model_extra = {}
card.get_config.return_value = config_mock
card.get_field_definitions.return_value = {}
- # model_extra at top level with experimental_conditions
- # wrapper -- matches real DataCard structure
dataset_card_mock = MagicMock()
dataset_card_mock.model_extra = {
"experimental_conditions": KEMMEREN_EXP_CONDITIONS,
}
card.dataset_card = dataset_card_mock
+ card.get_metadata_fields.return_value = METADATA_FIELDS["kemmeren_2014"]
+ card.get_metadata_config_name.return_value = None
+ kemmeren_meta_cols = set(METADATA_FIELDS["kemmeren_2014"])
+ card.get_data_col_names.return_value = {
+ "sample_id",
+ "regulator_locus_tag",
+ "regulator_symbol",
+ "target_locus_tag",
+ "effect",
+ "pvalue",
+ }
+ card.get_dataset_schema.return_value = DatasetSchema(
+ data_columns={
+ "sample_id",
+ "target_locus_tag",
+ "effect",
+ "pvalue",
+ },
+ metadata_columns=kemmeren_meta_cols,
+ join_columns=set(),
+ metadata_source="embedded",
+ external_metadata_config=None,
+ is_partitioned=False,
+ )
else:
config_mock = MagicMock()
config_mock.metadata_fields = None
card.get_config.return_value = config_mock
card.get_field_definitions.return_value = {}
card.get_experimental_conditions.return_value = {}
+ card.get_metadata_fields.return_value = None
+ card.get_metadata_config_name.return_value = None
+ card.get_data_col_names.return_value = set()
+ card.get_dataset_schema.return_value = None
return card
@@ -408,6 +470,201 @@ def test_db_name_map(self, config_path):
)
+# ------------------------------------------------------------------
+# Tests: Tags
+# ------------------------------------------------------------------
+
+
+class TestTags:
+ """Tests for get_tags() hierarchical merging."""
+
+ def _make_config(self, yaml_str: str) -> MetadataConfig:
+ import yaml as _yaml
+
+ return MetadataConfig.model_validate(_yaml.safe_load(yaml_str))
+
+ def test_repo_level_tags_only(self):
+ """Repo-level tags propagate when dataset has none."""
+ config = self._make_config(
+ """
+ repositories:
+ BrentLab/harbison:
+ tags:
+ assay: binding
+ organism: yeast
+ dataset:
+ harbison_2004:
+ sample_id:
+ field: sample_id
+ """
+ )
+ tags = config.get_tags("BrentLab/harbison", "harbison_2004")
+ assert tags == {"assay": "binding", "organism": "yeast"}
+
+ def test_dataset_level_tags_only(self):
+ """Dataset-level tags are returned when repo has none."""
+ config = self._make_config(
+ """
+ repositories:
+ BrentLab/harbison:
+ dataset:
+ harbison_2004:
+ sample_id:
+ field: sample_id
+ tags:
+ assay: chip-chip
+ """
+ )
+ tags = config.get_tags("BrentLab/harbison", "harbison_2004")
+ assert tags == {"assay": "chip-chip"}
+
+ def test_dataset_overrides_repo_tags(self):
+ """Dataset-level tags override repo-level for the same key."""
+ config = self._make_config(
+ """
+ repositories:
+ BrentLab/harbison:
+ tags:
+ assay: binding
+ organism: yeast
+ dataset:
+ harbison_2004:
+ sample_id:
+ field: sample_id
+ tags:
+ assay: chip-chip
+ """
+ )
+ tags = config.get_tags("BrentLab/harbison", "harbison_2004")
+ assert tags["assay"] == "chip-chip"
+ assert tags["organism"] == "yeast"
+
+ def test_no_tags(self):
+ """Returns empty dict when neither level has tags."""
+ config = self._make_config(
+ """
+ repositories:
+ BrentLab/harbison:
+ dataset:
+ harbison_2004:
+ sample_id:
+ field: sample_id
+ """
+ )
+ tags = config.get_tags("BrentLab/harbison", "harbison_2004")
+ assert tags == {}
+
+ def test_unknown_repo_returns_empty(self):
+ """Unknown repo_id returns empty dict."""
+ config = self._make_config(
+ """
+ repositories:
+ BrentLab/harbison:
+ dataset:
+ harbison_2004:
+ sample_id:
+ field: sample_id
+ """
+ )
+ assert config.get_tags("BrentLab/nonexistent", "harbison_2004") == {}
+
+ def test_yaml_round_trip(self):
+ """Tags parsed from YAML produce correct merged result."""
+ config = self._make_config(
+ """
+ repositories:
+ BrentLab/repo_a:
+ tags:
+ type: primary
+ organism: yeast
+ dataset:
+ dataset_a:
+ sample_id:
+ field: sample_id
+ tags:
+ type: binding
+ version: "2024"
+ BrentLab/repo_b:
+ tags:
+ type: perturbation
+ dataset:
+ dataset_b:
+ sample_id:
+ field: sample_id
+ """
+ )
+ tags_a = config.get_tags("BrentLab/repo_a", "dataset_a")
+ assert tags_a == {"type": "binding", "organism": "yeast", "version": "2024"}
+
+ tags_b = config.get_tags("BrentLab/repo_b", "dataset_b")
+ assert tags_b == {"type": "perturbation"}
+
+ def _make_vdb(self, yaml_str: str, tmp_path) -> VirtualDB:
+
+ p = tmp_path / "config.yaml"
+ p.write_text(yaml_str)
+ return VirtualDB(str(p))
+
+ def test_vdb_get_tags_returns_merged(self, tmp_path):
+ """VirtualDB.get_tags() returns merged repo+dataset tags by db_name."""
+ vdb = self._make_vdb(
+ """
+ repositories:
+ BrentLab/harbison:
+ tags:
+ assay: binding
+ organism: yeast
+ dataset:
+ harbison_2004:
+ db_name: harbison
+ sample_id:
+ field: sample_id
+ tags:
+ assay: chip-chip
+ """,
+ tmp_path,
+ )
+ tags = vdb.get_tags("harbison")
+ assert tags == {"assay": "chip-chip", "organism": "yeast"}
+
+ def test_vdb_get_tags_unknown_name_returns_empty(self, tmp_path):
+ """VirtualDB.get_tags() returns empty dict for unknown db_name."""
+ vdb = self._make_vdb(
+ """
+ repositories:
+ BrentLab/harbison:
+ dataset:
+ harbison_2004:
+ db_name: harbison
+ sample_id:
+ field: sample_id
+ """,
+ tmp_path,
+ )
+ assert vdb.get_tags("nonexistent") == {}
+
+ def test_vdb_get_tags_no_views_needed(self, tmp_path):
+ """VirtualDB.get_tags() works before any views are registered."""
+ vdb = self._make_vdb(
+ """
+ repositories:
+ BrentLab/harbison:
+ tags:
+ assay: binding
+ dataset:
+ harbison_2004:
+ db_name: harbison
+ sample_id:
+ field: sample_id
+ """,
+ tmp_path,
+ )
+ assert not vdb._views_registered
+ tags = vdb.get_tags("harbison")
+ assert tags == {"assay": "binding"}
+ assert not vdb._views_registered
+
+
# ------------------------------------------------------------------
# Tests: View registration
# ------------------------------------------------------------------
@@ -780,3 +1037,283 @@ def test_lazy_init(self, config_path):
v = VirtualDB(config_path)
assert v._conn is None
assert not v._views_registered
+
+
+# ------------------------------------------------------------------
+# Tests: dynamic sample_id column
+# ------------------------------------------------------------------
+
+
+class TestDynamicSampleId:
+ """Tests that the sample identifier column is resolved from config."""
+
+ def test_non_default_sample_id(self, tmp_path, monkeypatch):
+ """Views work when sample_id maps to a non-default column."""
+ import tfbpapi.virtual_db as vdb_module
+
+ # Config uses experiment_id as the sample identifier
+ config = {
+ "repositories": {
+ "TestOrg/custom_id": {
+ "dataset": {
+ "custom_data": {
+ "db_name": "custom",
+ "sample_id": {
+ "field": "experiment_id",
+ },
+ "regulator": {
+ "field": "regulator",
+ },
+ }
+ }
+ }
+ }
+ }
+ config_path = tmp_path / "config.yaml"
+ with open(config_path, "w") as f:
+ yaml.dump(config, f)
+
+ # Parquet uses experiment_id (not sample_id)
+ df = pd.DataFrame(
+ {
+ "experiment_id": [100, 100, 200, 200],
+ "regulator": ["TF1", "TF1", "TF2", "TF2"],
+ "target": ["G1", "G2", "G1", "G2"],
+ "score": [1.5, 0.8, 2.1, 0.3],
+ }
+ )
+ parquet_path = tmp_path / "custom.parquet"
+ files = {
+ ("TestOrg/custom_id", "custom_data"): [_write_parquet(parquet_path, df)],
+ }
+
+ # Mock datacard
+ mock_card = MagicMock()
+ mock_card.get_metadata_fields.return_value = [
+ "regulator",
+ ]
+ mock_card.get_field_definitions.return_value = {}
+ mock_card.get_experimental_conditions.return_value = {}
+ mock_card.get_dataset_schema.return_value = DatasetSchema(
+ data_columns={"experiment_id", "target", "score"},
+ metadata_columns={"regulator"},
+ join_columns=set(),
+ metadata_source="embedded",
+ external_metadata_config=None,
+ is_partitioned=False,
+ )
+
+ v = VirtualDB(config_path)
+
+ monkeypatch.setattr(
+ VirtualDB,
+ "_resolve_parquet_files",
+ lambda self, repo_id, cn: files.get((repo_id, cn), []),
+ )
+ monkeypatch.setattr(
+ vdb_module,
+ "_cached_datacard",
+ lambda repo_id, token=None: mock_card,
+ )
+
+ # Meta view should have experiment_id + regulator
+ meta_df = v.query("SELECT * FROM custom_meta")
+ assert "experiment_id" in meta_df.columns
+ assert len(meta_df) == 2 # 2 distinct samples
+
+ # Enriched raw view should JOIN on experiment_id
+ raw_df = v.query("SELECT * FROM custom")
+ assert "experiment_id" in raw_df.columns
+ assert len(raw_df) == 4 # all rows
+
+ def test_get_sample_id_field_dataset_level(self):
+ """Dataset-level sample_id takes precedence."""
+ config = MetadataConfig.model_validate(
+ {
+ "repositories": {
+ "Org/repo": {
+ "dataset": {
+ "ds": {
+ "sample_id": {
+ "field": "my_id",
+ },
+ }
+ }
+ }
+ }
+ }
+ )
+ assert config.get_sample_id_field("Org/repo", "ds") == "my_id"
+
+ def test_get_sample_id_field_repo_level(self):
+ """Repo-level sample_id used when dataset has none."""
+ config = MetadataConfig.model_validate(
+ {
+ "repositories": {
+ "Org/repo": {
+ "sample_id": {"field": "repo_sid"},
+ "dataset": {"ds": {}},
+ }
+ }
+ }
+ )
+ assert config.get_sample_id_field("Org/repo", "ds") == "repo_sid"
+
+ def test_get_sample_id_field_default(self):
+ """Falls back to 'sample_id' when not configured."""
+ config = MetadataConfig.model_validate(
+ {"repositories": {"Org/repo": {"dataset": {"ds": {}}}}}
+ )
+ assert config.get_sample_id_field("Org/repo", "ds") == "sample_id"
+
+ def test_get_sample_id_field_dataset_overrides_repo(self):
+ """Dataset-level overrides repo-level."""
+ config = MetadataConfig.model_validate(
+ {
+ "repositories": {
+ "Org/repo": {
+ "sample_id": {"field": "repo_id_col"},
+ "dataset": {
+ "ds": {
+ "sample_id": {
+ "field": "ds_id_col",
+ },
+ }
+ },
+ }
+ }
+ }
+ )
+ assert config.get_sample_id_field("Org/repo", "ds") == "ds_id_col"
+
+
+class TestExternalMetadata:
+ """Tests for datasets with external metadata parquet files."""
+
+ def test_external_metadata_join(self, tmp_path, monkeypatch):
+ """Meta view JOINs data and metadata parquet when metadata is in a separate
+ config."""
+ import tfbpapi.virtual_db as vdb_module
+
+ # Data parquet: measurements with sample_id but no
+ # metadata columns like db_id or background_hops
+ data_df = pd.DataFrame(
+ {
+ "sample_id": [1, 1, 2, 2],
+ "target_locus_tag": [
+ "YAL001C",
+ "YAL002W",
+ "YAL001C",
+ "YAL002W",
+ ],
+ "effect": [1.5, 0.8, 2.1, 0.3],
+ }
+ )
+ # Metadata parquet: sample-level metadata
+ meta_df = pd.DataFrame(
+ {
+ "sample_id": [1, 2],
+ "db_id": [101, 102],
+ "regulator_locus_tag": ["YBR049C", "YDR463W"],
+ "background_hops": [500, 600],
+ }
+ )
+
+ data_path = _write_parquet(tmp_path / "data.parquet", data_df)
+ meta_path = _write_parquet(tmp_path / "meta.parquet", meta_df)
+
+ parquet_files = {
+ ("TestOrg/repo", "chip_data"): [data_path],
+ ("TestOrg/repo", "sample_metadata"): [meta_path],
+ }
+
+ config = {
+ "repositories": {
+ "TestOrg/repo": {
+ "sample_id": {"field": "sample_id"},
+ "dataset": {
+ "chip_data": {
+ "db_name": "chip",
+ "regulator_locus_tag": {
+ "field": "regulator_locus_tag",
+ },
+ }
+ },
+ }
+ }
+ }
+ config_file = tmp_path / "config.yaml"
+ with open(config_file, "w") as f:
+ yaml.dump(config, f)
+
+ # Mock DataCard: external metadata via applies_to
+ card = MagicMock()
+ config_mock = MagicMock()
+ config_mock.metadata_fields = None # no embedded
+ card.get_config.return_value = config_mock
+ card.get_metadata_fields.return_value = [
+ "sample_id",
+ "db_id",
+ "regulator_locus_tag",
+ "background_hops",
+ ]
+ card.get_metadata_config_name.return_value = "sample_metadata"
+ # Data parquet columns (from chip_data features)
+ card.get_data_col_names.return_value = {
+ "sample_id",
+ "target_locus_tag",
+ "effect",
+ }
+ card.get_field_definitions.return_value = {}
+ card.get_experimental_conditions.return_value = {}
+ # External metadata schema: data cols in data parquet,
+ # metadata cols in metadata parquet, joined on sample_id
+ card.get_dataset_schema.return_value = DatasetSchema(
+ data_columns={"sample_id", "target_locus_tag", "effect"},
+ metadata_columns={
+ "sample_id",
+ "db_id",
+ "regulator_locus_tag",
+ "background_hops",
+ },
+ join_columns={"sample_id"},
+ metadata_source="external",
+ external_metadata_config="sample_metadata",
+ is_partitioned=False,
+ )
+
+ v = VirtualDB(config_file)
+ monkeypatch.setattr(
+ VirtualDB,
+ "_resolve_parquet_files",
+ lambda self, repo_id, cfg: parquet_files.get((repo_id, cfg), []),
+ )
+ monkeypatch.setattr(
+ vdb_module,
+ "_cached_datacard",
+ lambda repo_id, token=None: card,
+ )
+
+ # Trigger view registration
+ tables = v.tables()
+ assert "chip" in tables
+ assert "chip_meta" in tables
+
+ # Meta view should have columns from both parquets
+ meta_result = v.query("SELECT * FROM chip_meta ORDER BY sample_id")
+ meta_cols = set(meta_result.columns)
+ assert "sample_id" in meta_cols
+ assert "db_id" in meta_cols
+ assert "regulator_locus_tag" in meta_cols
+ assert "background_hops" in meta_cols
+
+ # Verify data is correct (joined properly)
+ assert len(meta_result) == 2
+ row1 = meta_result[meta_result["sample_id"] == 1].iloc[0]
+ assert row1["db_id"] == 101
+ assert row1["regulator_locus_tag"] == "YBR049C"
+
+ # Enriched raw view should also work
+ raw_result = v.query("SELECT * FROM chip ORDER BY sample_id")
+ assert "db_id" in raw_result.columns
+ assert len(raw_result) == 4 # 4 data rows
diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py
index 96097a3..1ac968c 100644
--- a/tfbpapi/virtual_db.py
+++ b/tfbpapi/virtual_db.py
@@ -11,8 +11,8 @@
views (one row per sample with derived columns) and full data views (measurement-level
data joined to metadata). For comparative analysis datasets, VirtualDB creates expanded
views that parse composite ID fields into ``_source`` (aliased to the configured
-db_name) and ``_id`` (sample_id) columns. The expectation is that a developer will
-use this interface to write SQL queries against the views to provide an API to
+db_name) and ``_id`` (sample identifier) columns. The expectation is that a developer
+will use this interface to write SQL queries against the views to provide an API to
downstream users and applications.
Example Usage::
@@ -49,8 +49,9 @@
import duckdb
import pandas as pd
+from duckdb import BinderException
-from tfbpapi.datacard import DataCard
+from tfbpapi.datacard import DataCard, DatasetSchema
from tfbpapi.models import MetadataConfig
logger = logging.getLogger(__name__)
@@ -389,6 +390,27 @@ def get_common_fields(self) -> list[str]:
common = set.intersection(*sets)
return sorted(common)
+ def get_tags(self, db_name: str) -> dict[str, str]:
+ """
+ Return the merged tags for a dataset.
+
+ Tags are defined in the configuration at the repository and/or
+ dataset level. Dataset-level tags override repository-level tags
+ with the same key. See the ``tags`` section of the configuration
+ guide for details.
+
+ :param db_name: Dataset name as it appears in :meth:`tables` (the
+ resolved ``db_name`` from the configuration, or the
+ ``config_name`` if ``db_name`` was not explicitly set).
+ :return: Dict of merged tags, or empty dict if the dataset has no
+ tags or the name is not found.
+
+ """
+ if db_name not in self._db_name_map:
+ return {}
+ repo_id, config_name = self._db_name_map[db_name]
+ return self.config.get_tags(repo_id, config_name)
+
# ------------------------------------------------------------------
# Lazy initialisation
# ------------------------------------------------------------------
@@ -414,6 +436,62 @@ def _register_all_views(self) -> None:
parquet_only=comparative,
)
+ # 1b. Resolve external metadata parquet views.
+ # When a data config's metadata lives in a separate HF config
+ # (applies_to), register its parquet as __<db_name>_metadata_parquet.
+ # All information is derived from DataCard YAML -- no DuckDB
+ # introspection needed.
+ self._dataset_schemas: dict[str, DatasetSchema] = {}
+ self._external_meta_views: dict[str, str] = {}
+ for db_name, (repo_id, config_name) in self._db_name_map.items():
+ if self._is_comparative(repo_id, config_name):
+ continue
+ try:
+ card = _cached_datacard(repo_id, token=self.token)
+ schema = card.get_dataset_schema(config_name)
+ except Exception as exc:
+ logger.warning(
+ "Could not get dataset schema for %s/%s: %s",
+ repo_id,
+ config_name,
+ exc,
+ )
+ continue
+ if schema is not None:
+ self._dataset_schemas[db_name] = schema
+ if (
+ schema is None
+ or schema.metadata_source != "external"
+ or not schema.external_metadata_config
+ ):
+ continue
+ meta_view = f"__{db_name}_metadata_parquet"
+ files = self._resolve_parquet_files(
+ repo_id, schema.external_metadata_config
+ )
+ if not files:
+ logger.warning(
+ "No parquet files for external metadata config "
+ "'%s' in repo '%s'",
+ schema.external_metadata_config,
+ repo_id,
+ )
+ continue
+ files_sql = ", ".join(f"'{f}'" for f in files)
+ try:
+ self._db.execute(
+ f"CREATE OR REPLACE VIEW {meta_view} AS "
+ f"SELECT * FROM read_parquet([{files_sql}])"
+ )
+ except Exception as exc:
+ logger.warning(
+ "Failed to create external metadata view '%s': %s",
+ meta_view,
+ exc,
+ )
+ continue
+ self._external_meta_views[db_name] = meta_view
+
# 2. Metadata views for primary datasets (_meta)
# This is based on the metadata defined in the datacard,
# and includes any additional derived columns based on the
@@ -567,62 +645,150 @@ def _register_raw_view(
def _register_meta_view(self, db_name: str, repo_id: str, config_name: str) -> None:
"""
- Register a ``_meta`` view with one row per sample_id.
+ Register a ``_meta`` view with one row per sample.
+
+ Includes metadata columns from the DataCard plus any derived columns
+ from config property mappings (resolved against DataCard definitions
+ with factor aliases applied).
- Includes raw metadata columns from the DataCard plus any derived columns from
- config property mappings (resolved against DataCard definitions with factor
- aliases applied).
+ For datasets with external metadata (a separate HF config with
+ ``applies_to``), JOINs the data parquet to the metadata parquet
+ on the configured sample_id column. The actual columns in the metadata
+ parquet are determined by DuckDB introspection (``DESCRIBE``) rather
+ than the DataCard feature list, because DataCard feature lists are
+ conceptual schemas that may include columns not physically present
+ in the parquet files.
:param db_name: Base view name for the primary dataset
:param repo_id: Repository ID
:param config_name: Configuration name
+ :raises ValueError: If no metadata fields are found.
+ :raises BinderException: If view creation fails, with SQL details.
+
"""
parquet_view = f"__{db_name}_parquet"
if not self._view_exists(parquet_view):
return
- meta_cols = self._resolve_metadata_fields(repo_id, config_name)
- prop_result = self._resolve_property_columns(repo_id, config_name)
+ sample_col = self._get_sample_id_col(db_name)
- if prop_result is not None:
- derived_exprs, prop_raw_cols = prop_result
- # Raw cols = metadata_fields + any source fields needed
- # by property mappings
- if meta_cols is not None:
- raw = list(dict.fromkeys(["sample_id"] + meta_cols + prop_raw_cols))
- else:
- raw = list(dict.fromkeys(["sample_id"] + prop_raw_cols))
+ # Pull ext_meta_view early -- needed for both meta_cols and
+ # FROM clause construction.
+ schema: DatasetSchema | None = getattr(self, "_dataset_schemas", {}).get(
+ db_name
+ )
+ ext_meta_view: str | None = getattr(self, "_external_meta_views", {}).get(
+ db_name
+ )
- raw_sql = ", ".join(raw)
+ is_external = (
+ ext_meta_view is not None
+ and schema is not None
+ and schema.metadata_source == "external"
+ )
- # Outer SELECT: raw cols + derived expressions
- outer_parts = list(raw) + derived_exprs
- outer_sql = ", ".join(outer_parts)
+ if is_external:
+ # DataCard feature lists are conceptual -- columns listed there
+ # may not be physically present in the parquet file. Use DuckDB
+ # introspection to get the actual columns in the metadata parquet.
+ assert ext_meta_view is not None
+ actual_meta_cols: set[str] = set(self._get_view_columns(ext_meta_view))
+ meta_cols: list[str] = sorted(actual_meta_cols)
+ elif schema is not None:
+ actual_meta_cols = schema.metadata_columns
+ meta_cols = sorted(actual_meta_cols)
+ else:
+ meta_cols = self._resolve_metadata_fields(repo_id, config_name) or []
+ actual_meta_cols = set(meta_cols)
- self._db.execute(
- f"CREATE OR REPLACE VIEW {db_name}_meta AS "
- f"SELECT DISTINCT {outer_sql} "
- f"FROM ("
- f"SELECT DISTINCT {raw_sql} "
- f"FROM {parquet_view}"
- f") AS __raw"
+ if not meta_cols:
+ raise ValueError(
+ f"No metadata fields found for {repo_id}/{config_name}. "
+ f"Cannot create meta view '{db_name}_meta'."
)
- elif meta_cols is not None:
- # Fallback: metadata_fields only, no property mappings
- cols = list(dict.fromkeys(["sample_id"] + meta_cols))
- cols_sql = ", ".join(cols)
- self._db.execute(
- f"CREATE OR REPLACE VIEW {db_name}_meta AS "
- f"SELECT DISTINCT {cols_sql} "
- f"FROM {parquet_view}"
+
+ # FROM clause: JOIN data + metadata parquets when external,
+ # plain parquet view otherwise.
+ if is_external:
+ assert ext_meta_view is not None
+ # Use the configured sample_id column as the join key.
+ # The DataCard feature intersection (schema.join_columns)
+ # is unreliable because a data config's feature list may
+ # document columns that are physically only in the metadata
+ # parquet (present conceptually after a join, not in the
+ # physical data parquet file).
+ from_clause = (
+ f"{parquet_view} d " f"JOIN {ext_meta_view} m " f"USING ({sample_col})"
)
+ is_join = True
else:
- # No metadata_fields at all -- all columns are metadata
- self._db.execute(
- f"CREATE OR REPLACE VIEW {db_name}_meta AS "
- f"SELECT DISTINCT * FROM {parquet_view}"
- )
+ from_clause = parquet_view
+ is_join = False
+
+ def qualify(col: str) -> str:
+ """Return qualified column name for JOIN context."""
+ if not is_join:
+ return col
+ if col == sample_col:
+ return col # USING makes join key unqualified
+ # Use the actual metadata parquet columns (from DuckDB
+ # introspection) to decide qualification, not the DataCard
+ # feature list which may be inaccurate.
+ if col in actual_meta_cols:
+ return f"m.{col}"
+ return f"d.{col}"
+
+ # Build SELECT: sample_id + metadata cols (deduplicated)
+ seen: set[str] = set()
+ select_parts: list[str] = []
+
+ def add_col(col: str) -> None:
+ if col not in seen:
+ seen.add(col)
+ select_parts.append(qualify(col))
+
+ add_col(sample_col)
+ for col in meta_cols:
+ add_col(col)
+
+ # Add derived property expressions from the VirtualDB config
+ prop_result = self._resolve_property_columns(repo_id, config_name)
+ if prop_result is not None:
+ derived_exprs, prop_raw_cols = prop_result
+ # Ensure source columns needed by expressions are selected
+ for col in prop_raw_cols:
+ add_col(col)
+ # Qualify source column references inside CASE WHEN expressions
+ if is_join:
+ qualified_exprs = []
+ for expr in derived_exprs:
+ for raw_col in prop_raw_cols:
+ q = qualify(raw_col)
+ if q != raw_col:
+ # Replace bare column name in CASE WHEN patterns
+ expr = expr.replace(
+ f"CASE {raw_col} ", f"CASE {q} "
+ ).replace(f" {raw_col} = ", f" {q} = ")
+ qualified_exprs.append(expr)
+ derived_exprs = qualified_exprs
+ select_parts.extend(derived_exprs)
+
+ cols_sql = ", ".join(select_parts)
+ sql = (
+ f"CREATE OR REPLACE VIEW {db_name}_meta AS "
+ f"SELECT DISTINCT {cols_sql} FROM {from_clause}"
+ )
+ try:
+ self._db.execute(sql)
+ except BinderException as exc:
+ raise BinderException(
+ f"Failed to create meta view '{db_name}_meta'.\n"
+ f" schema: {schema}\n"
+ f" from_clause: {from_clause}\n"
+ f" SQL: {sql}\n"
+ f" error: {exc}"
+ ) from exc
def _enrich_raw_view(self, db_name: str) -> None:
"""
@@ -648,40 +814,58 @@ def _enrich_raw_view(self, db_name: str) -> None:
if not extra_cols:
return
+ sample_col = self._get_sample_id_col(db_name)
extra_select = ", ".join(f"m.{c}" for c in sorted(extra_cols))
self._db.execute(
f"CREATE OR REPLACE VIEW {db_name} AS "
f"SELECT r.*, {extra_select} "
f"FROM {parquet_name} r "
- f"JOIN {meta_name} m USING (sample_id)"
+ f"JOIN {meta_name} m USING ({sample_col})"
)
def _get_view_columns(self, view: str) -> list[str]:
- """Return column names for a view."""
- df = self._db.execute(
- f"SELECT column_name FROM information_schema.columns "
- f"WHERE table_name = '{view}'"
- ).fetchdf()
+ """
+ Return column names for a view.
+
+ Uses ``DESCRIBE`` rather than ``information_schema`` to force
+ eager schema resolution for ``read_parquet``-backed views,
+ which DuckDB may evaluate lazily.
+
+ """
+ df = self._db.execute(f"DESCRIBE {view}").fetchdf()
return df["column_name"].tolist()
+ def _get_sample_id_col(self, db_name: str) -> str:
+ """
+ Resolve the sample identifier column name for a dataset.
+
+ :param db_name: Resolved database view name
+ :return: Actual column name for the sample identifier
+
+ """
+ repo_id, config_name = self._db_name_map[db_name]
+ return self.config.get_sample_id_field(repo_id, config_name)
+
def _resolve_metadata_fields(
self, repo_id: str, config_name: str
) -> list[str] | None:
"""
- Get the metadata_fields list from the DataCard config.
+ Get metadata field names from the DataCard.
+
+ Delegates to ``DataCard.get_metadata_fields()`` which handles
+ both embedded metadata_fields and external metadata configs
+ (via applies_to).
:param repo_id: Repository ID
:param config_name: Configuration name
- :return: List of metadata field names, or None if not specified
+ :return: List of metadata field names, or None if not found
"""
try:
card = _cached_datacard(repo_id, token=self.token)
- config = card.get_config(config_name)
- if config and config.metadata_fields:
- return list(config.metadata_fields)
+ return card.get_metadata_fields(config_name)
except Exception:
- logger.debug(
+ logger.error(
"Could not resolve metadata_fields for %s/%s",
repo_id,
config_name,
@@ -975,7 +1159,7 @@ def _register_comparative_expanded_view(
- ``_source`` -- the ``repo_id;config_name`` prefix,
aliased to the configured ``db_name`` when available.
- - ``_id`` -- the sample_id component.
+ - ``_id`` -- the sample identifier component.
:param db_name: Base view name for the comparative dataset
:param ds_cfg: DatasetVirtualDBConfig with ``links``