From bf46a19b7ab175a2b82b249970167c301d6b9451 Mon Sep 17 00:00:00 2001 From: fei1cell Date: Fri, 13 Feb 2026 16:40:30 -0600 Subject: [PATCH] v2 docs: add DTO analysis and update datasets yaml config --- docs/tutorials/DTO_analysis.ipynb | 2384 +++++++++++++++++++++++++++++ tfbpapi/datasets.yaml | 139 ++ 2 files changed, 2523 insertions(+) create mode 100644 docs/tutorials/DTO_analysis.ipynb create mode 100644 tfbpapi/datasets.yaml diff --git a/docs/tutorials/DTO_analysis.ipynb b/docs/tutorials/DTO_analysis.ipynb new file mode 100644 index 0000000..c564c38 --- /dev/null +++ b/docs/tutorials/DTO_analysis.ipynb @@ -0,0 +1,2384 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "48ac26da", + "metadata": {}, + "source": [ + "## DTO Active-Set Analysis\n", + "\n", + "This analysis identifies transcription factor (TF) binding samples that show\n", + "significant dual-threshold overlap (DTO) with both major perturbation datasets,\n", + "providing a high-confidence set of regulatory interactions.\n", + "\n", + "### Workflow\n", + "\n", + "1. **Filter by Hackett-2020-ZEV**: Select binding samples from all datasets\n", + " (Harbison, Rossi, Mahendrawada) where DTO vs. Hackett-2020-ZEV P ≤ 0.01.\n", + "\n", + "2. **Filter by Kemmeren-2014-TFKO**: Apply the same selection against the\n", + " Kemmeren-2014 knockout perturbation dataset (P ≤ 0.01).\n", + "\n", + "3. **Intersect**: Retain only binding samples that are significant in **both**\n", + " comparisons. These form the \"active set\" — samples whose regulatory\n", + " relationships are independently supported by two orthogonal perturbation\n", + " experiments (ZEV-inducible overexpression and gene knockout).\n", + "\n", + "4. 
**Summarize by regulator**:\n", + " - **Table**: One row per regulator with the count of active samples.\n", + " - **Distribution**: Histogram showing how many regulators have 1, 2, 3, …\n", + " active samples, revealing whether active evidence is concentrated in a\n", + " few TFs or spread broadly.\n", + "\n", + "### Rationale\n", + "\n", + "A binding sample significant in only one perturbation comparison could reflect\n", + "experiment-specific artifacts. Requiring significance in both Hackett (ZEV) and\n", + "Kemmeren (TFKO) leverages two fundamentally different perturbation strategies,\n", + "increasing confidence that the observed TF–target relationships are\n", + "biologically meaningful." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "c59de9d2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "VirtualDB(7 repos, 8 datasets, views not yet registered)\n" + ] + } + ], + "source": [ + "from tfbpapi.virtual_db import VirtualDB\n", + "\n", + "# Read configuration from datasets.yaml\n", + "vdb = VirtualDB(\"/home/luegg/code/tfbpapi/tfbpapi/datasets.yaml\")\n", + "print(repr(vdb))" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "3ccc8874", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered views:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 4963.67it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 9799.78it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 12595.51it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 3644.05it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10280.16it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 11125.47it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10058.28it/s]\n", + "Fetching 30 files: 100%|██████████| 
30/30 [00:00<00:00, 146996.64it/s]\n", + "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", + "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", + "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", + "Key 'temperature_celsius' not found at path 'temperature_celsius' (current keys: ['description', 'initial_temperature_celsius', 'temperature_shift_celsius', 'temperature_shift_duration_minutes', 'growth_phase_at_harvest', 'media'])\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " chec_mahendrawada_m2025_af_combined_meta\n", + " chec_mahendrawada_m2025_af_combined_meta_meta\n", + " dto_expanded\n", + " hackett\n", + " hackett_meta\n", + " harbison\n", + " harbison_meta\n", + " kemmeren\n", + " kemmeren_meta\n", + " knockout\n", + " knockout_meta\n", + " overexpression\n", + " overexpression_meta\n", + " rossi_2021_af_combined\n", + " rossi_2021_af_combined_meta\n" + ] + } + ], + "source": [ + "# List all registered views\n", + "print(\"Registered views:\")\n", + "for name in vdb.tables():\n", + " print(f\" {name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "968e3f01", + "metadata": {}, + "source": [ + "## Step1-Obtain the intersection of the binding dataset and Hackett" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "80e34ef0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of records that meet the criteria: 910\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_symbolregulator_locus_tagconditioncarbon_sourcetemperature_celsiusdto_empirical_pvaluedto_fdrhackett_sample_idperturbation_id_source
016TEC1YBR083WAlphaglucose30.00.00.14176891hackett
1174ZAP1YJL056CYPDglucose30.00.00.005249818hackett
238PHO2YDL106CH2O2Higlucose30.00.00.263801190hackett
3248SOK2YMR016CBUT14glucose30.00.00.0655951098hackett
450NRG1YDR043CH2O2Loglucose30.00.00.167487249hackett
550NRG1YDR043CH2O2Loglucose30.00.00.241330248hackett
6323DIG1YPL049CBUT90glucose30.00.00.0316331598hackett
7281GCR2YNL199CSMunspecified30.00.00.0017011251hackett
886GLN3YER040WSMunspecified30.00.00.099947434hackett
9282GCR2YNL199CYPDglucose30.00.00.0474431251hackett
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_symbol regulator_locus_tag condition carbon_source \\\n", + "0 16 TEC1 YBR083W Alpha glucose \n", + "1 174 ZAP1 YJL056C YPD glucose \n", + "2 38 PHO2 YDL106C H2O2Hi glucose \n", + "3 248 SOK2 YMR016C BUT14 glucose \n", + "4 50 NRG1 YDR043C H2O2Lo glucose \n", + "5 50 NRG1 YDR043C H2O2Lo glucose \n", + "6 323 DIG1 YPL049C BUT90 glucose \n", + "7 281 GCR2 YNL199C SM unspecified \n", + "8 86 GLN3 YER040W SM unspecified \n", + "9 282 GCR2 YNL199C YPD glucose \n", + "\n", + " temperature_celsius dto_empirical_pvalue dto_fdr hackett_sample_id \\\n", + "0 30.0 0.0 0.141768 91 \n", + "1 30.0 0.0 0.005249 818 \n", + "2 30.0 0.0 0.263801 190 \n", + "3 30.0 0.0 0.065595 1098 \n", + "4 30.0 0.0 0.167487 249 \n", + "5 30.0 0.0 0.241330 248 \n", + "6 30.0 0.0 0.031633 1598 \n", + "7 30.0 0.0 0.001701 1251 \n", + "8 30.0 0.0 0.099947 434 \n", + "9 30.0 0.0 0.047443 1251 \n", + "\n", + " perturbation_id_source \n", + "0 hackett \n", + "1 hackett \n", + "2 hackett \n", + "3 hackett \n", + "4 hackett \n", + "5 hackett \n", + "6 hackett \n", + "7 hackett \n", + "8 hackett \n", + "9 hackett " + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Using binding dataset as the main table, JOIN dto_expanded to retrieve DTO statistics\n", + "# Filtering for vs. 
Hackett perturbation data, pvalue <= 0.01\n", + "harbison_hackett = vdb.query(\"\"\"\n", + " SELECT\n", + " h.sample_id,\n", + " h.regulator_symbol,\n", + " h.regulator_locus_tag,\n", + " h.condition,\n", + " h.carbon_source,\n", + " h.temperature_celsius,\n", + " d.dto_empirical_pvalue,\n", + " d.dto_fdr,\n", + " d.perturbation_id_id AS hackett_sample_id,\n", + " d.perturbation_id_source\n", + " FROM harbison_meta h\n", + " JOIN dto_expanded d\n", + " ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source = 'harbison'\n", + " WHERE d.perturbation_id_source = 'hackett'\n", + " AND d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + "\"\"\")\n", + "\n", + "print(f\"Number of records that meet the criteria: {len(harbison_hackett)}\")\n", + "harbison_hackett.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "60ae9b90", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of records that meet the criteria: 684\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagcarbon_sourcetemperature_celsiusdto_empirical_pvaluedto_fdrhackett_sample_idperturbation_id_source
0671YOR083Wglucose250.00.0023851393hackett
1550YMR039Cglucose250.00.0239651132hackett
2154YDR216Wglucose250.00.082601304hackett
3392YJL110Cglucose250.00.124665799hackett
4700YOR290Cglucose250.00.0791601454hackett
5372YIL131Cglucose250.00.059436732hackett
6700YOR290Cglucose250.00.0700621453hackett
7372YIL131Cglucose250.00.076569731hackett
8700YOR290Cglucose250.00.0938791452hackett
9372YIL131Cglucose250.00.108097730hackett
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_locus_tag carbon_source temperature_celsius \\\n", + "0 671 YOR083W glucose 25 \n", + "1 550 YMR039C glucose 25 \n", + "2 154 YDR216W glucose 25 \n", + "3 392 YJL110C glucose 25 \n", + "4 700 YOR290C glucose 25 \n", + "5 372 YIL131C glucose 25 \n", + "6 700 YOR290C glucose 25 \n", + "7 372 YIL131C glucose 25 \n", + "8 700 YOR290C glucose 25 \n", + "9 372 YIL131C glucose 25 \n", + "\n", + " dto_empirical_pvalue dto_fdr hackett_sample_id perturbation_id_source \n", + "0 0.0 0.002385 1393 hackett \n", + "1 0.0 0.023965 1132 hackett \n", + "2 0.0 0.082601 304 hackett \n", + "3 0.0 0.124665 799 hackett \n", + "4 0.0 0.079160 1454 hackett \n", + "5 0.0 0.059436 732 hackett \n", + "6 0.0 0.070062 1453 hackett \n", + "7 0.0 0.076569 731 hackett \n", + "8 0.0 0.093879 1452 hackett \n", + "9 0.0 0.108097 730 hackett " + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rossi_hackett = vdb.query(\"\"\"\n", + " SELECT\n", + " r.sample_id,\n", + " r.regulator_locus_tag,\n", + " r.carbon_source,\n", + " r.temperature_celsius,\n", + " d.dto_empirical_pvalue,\n", + " d.dto_fdr,\n", + " d.perturbation_id_id AS hackett_sample_id,\n", + " d.perturbation_id_source\n", + " FROM (\n", + " SELECT DISTINCT\n", + " sample_id,\n", + " regulator_locus_tag,\n", + " carbon_source,\n", + " temperature_celsius\n", + " FROM rossi_2021_af_combined_meta\n", + " ) r\n", + " JOIN dto_expanded d\n", + " ON CAST(r.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source = 'rossi_2021_af_combined'\n", + " WHERE d.perturbation_id_source = 'hackett'\n", + " AND d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + "\"\"\")\n", + "\n", + "print(f\"Number of records that meet the criteria: {len(rossi_hackett)}\")\n", + "rossi_hackett.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "9e8e4616", + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Number of records that meet the criteria: 946\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagregulator_symbolconditiondto_empirical_pvaluedto_fdrhackett_sample_idperturbation_id_source
01YBL005WPDR3standard0.00.00422782hackett
11YBL005WPDR3standard0.00.02529783hackett
21YBL005WPDR3standard0.00.04095684hackett
31YBL005WPDR3standard0.00.03178685hackett
41YBL005WPDR3standard0.00.01850986hackett
51YBL005WPDR3standard0.00.01701487hackett
61YBL005WPDR3standard0.00.01631988hackett
7100YJL056CZAP1standard0.00.000000817hackett
8100YJL056CZAP1standard0.00.000915818hackett
9100YJL056CZAP1standard0.00.000760819hackett
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_locus_tag regulator_symbol condition \\\n", + "0 1 YBL005W PDR3 standard \n", + "1 1 YBL005W PDR3 standard \n", + "2 1 YBL005W PDR3 standard \n", + "3 1 YBL005W PDR3 standard \n", + "4 1 YBL005W PDR3 standard \n", + "5 1 YBL005W PDR3 standard \n", + "6 1 YBL005W PDR3 standard \n", + "7 100 YJL056C ZAP1 standard \n", + "8 100 YJL056C ZAP1 standard \n", + "9 100 YJL056C ZAP1 standard \n", + "\n", + " dto_empirical_pvalue dto_fdr hackett_sample_id perturbation_id_source \n", + "0 0.0 0.004227 82 hackett \n", + "1 0.0 0.025297 83 hackett \n", + "2 0.0 0.040956 84 hackett \n", + "3 0.0 0.031786 85 hackett \n", + "4 0.0 0.018509 86 hackett \n", + "5 0.0 0.017014 87 hackett \n", + "6 0.0 0.016319 88 hackett \n", + "7 0.0 0.000000 817 hackett \n", + "8 0.0 0.000915 818 hackett \n", + "9 0.0 0.000760 819 hackett " + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mahendrawada_hackett = vdb.query(\"\"\"\n", + " SELECT\n", + " m.sample_id,\n", + " m.regulator_locus_tag,\n", + " m.regulator_symbol,\n", + " m.condition,\n", + " d.dto_empirical_pvalue,\n", + " d.dto_fdr,\n", + " d.perturbation_id_id AS hackett_sample_id,\n", + " d.perturbation_id_source\n", + " FROM (\n", + " SELECT DISTINCT\n", + " sample_id,\n", + " regulator_locus_tag,\n", + " regulator_symbol,\n", + " condition\n", + " FROM chec_mahendrawada_m2025_af_combined_meta\n", + " ) m\n", + " JOIN dto_expanded d\n", + " ON CAST(m.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source LIKE '%mahendrawada%af_combined%'\n", + " WHERE d.perturbation_id_source = 'hackett'\n", + " AND d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + "\"\"\")\n", + "\n", + "print(f\"Number of records that meet the criteria: {len(mahendrawada_hackett)}\")\n", + "mahendrawada_hackett.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "a4f7ee52", + "metadata": 
{}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of records after merging: 2540\n", + "\n", + "Distribution of each dataset:\n", + "source_dataset\n", + "mahendrawada 946\n", + "harbison 910\n", + "rossi 684\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagdto_empirical_pvaluedto_fdrhackett_sample_idperturbation_id_sourcesource_dataset
016YBR083W0.00.14176891hackettharbison
1174YJL056C0.00.005249818hackettharbison
238YDL106C0.00.263801190hackettharbison
3248YMR016C0.00.0655951098hackettharbison
450YDR043C0.00.167487249hackettharbison
550YDR043C0.00.241330248hackettharbison
6323YPL049C0.00.0316331598hackettharbison
7281YNL199C0.00.0017011251hackettharbison
886YER040W0.00.099947434hackettharbison
9282YNL199C0.00.0474431251hackettharbison
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_locus_tag dto_empirical_pvalue dto_fdr \\\n", + "0 16 YBR083W 0.0 0.141768 \n", + "1 174 YJL056C 0.0 0.005249 \n", + "2 38 YDL106C 0.0 0.263801 \n", + "3 248 YMR016C 0.0 0.065595 \n", + "4 50 YDR043C 0.0 0.167487 \n", + "5 50 YDR043C 0.0 0.241330 \n", + "6 323 YPL049C 0.0 0.031633 \n", + "7 281 YNL199C 0.0 0.001701 \n", + "8 86 YER040W 0.0 0.099947 \n", + "9 282 YNL199C 0.0 0.047443 \n", + "\n", + " hackett_sample_id perturbation_id_source source_dataset \n", + "0 91 hackett harbison \n", + "1 818 hackett harbison \n", + "2 190 hackett harbison \n", + "3 1098 hackett harbison \n", + "4 249 hackett harbison \n", + "5 248 hackett harbison \n", + "6 1598 hackett harbison \n", + "7 1251 hackett harbison \n", + "8 434 hackett harbison \n", + "9 1251 hackett harbison " + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "common_cols = [\n", + " \"sample_id\", \"regulator_locus_tag\",\n", + " \"dto_empirical_pvalue\", \"dto_fdr\",\n", + " \"hackett_sample_id\", \"perturbation_id_source\"\n", + "]\n", + "\n", + "harbison_hackett[\"source_dataset\"] = \"harbison\"\n", + "rossi_hackett[\"source_dataset\"] = \"rossi\"\n", + "mahendrawada_hackett[\"source_dataset\"] = \"mahendrawada\"\n", + "\n", + "hackett_all = pd.concat([\n", + " harbison_hackett[common_cols + [\"source_dataset\"]],\n", + " rossi_hackett[common_cols + [\"source_dataset\"]],\n", + " mahendrawada_hackett[common_cols + [\"source_dataset\"]],\n", + "], ignore_index=True)\n", + "\n", + "print(f\"Total number of records after merging: {len(hackett_all)}\")\n", + "print(f\"\\nDistribution of each dataset:\")\n", + "print(hackett_all[\"source_dataset\"].value_counts())\n", + "hackett_all.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "ae37e5c8", + "metadata": {}, + "source": [ + "## Step2-Obtain the intersection of the binding dataset and Kemmeren" + ] + }, + { +
"cell_type": "code", + "execution_count": 59, + "id": "61741f38", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of records that meet the criteria: 224\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_symbolregulator_locus_tagconditioncarbon_sourcetemperature_celsiusdto_empirical_pvaluedto_fdrkemmeren_sample_idperturbation_id_source
070SUM1YDR310CYPDglucose30.00.00.013631299kemmeren
1151STE12YHR084WAlphaglucose30.00.00.128768622kemmeren
2319MET31YPL038WSMunspecified30.00.00.0212991367kemmeren
370SUM1YDR310CYPDglucose30.00.00.006238299kemmeren
4151STE12YHR084WAlphaglucose30.00.00.117883622kemmeren
5242GAL80YML051WYPDglucose30.00.00.0000001006kemmeren
6319MET31YPL038WSMunspecified30.00.00.0263171367kemmeren
7323DIG1YPL049CBUT90glucose30.00.00.2365021372kemmeren
849NRG1YDR043CH2O2Higlucose30.00.00.230039221kemmeren
9198RGT1YKL038WYPDglucose30.00.00.003463786kemmeren
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_symbol regulator_locus_tag condition carbon_source \\\n", + "0 70 SUM1 YDR310C YPD glucose \n", + "1 151 STE12 YHR084W Alpha glucose \n", + "2 319 MET31 YPL038W SM unspecified \n", + "3 70 SUM1 YDR310C YPD glucose \n", + "4 151 STE12 YHR084W Alpha glucose \n", + "5 242 GAL80 YML051W YPD glucose \n", + "6 319 MET31 YPL038W SM unspecified \n", + "7 323 DIG1 YPL049C BUT90 glucose \n", + "8 49 NRG1 YDR043C H2O2Hi glucose \n", + "9 198 RGT1 YKL038W YPD glucose \n", + "\n", + " temperature_celsius dto_empirical_pvalue dto_fdr kemmeren_sample_id \\\n", + "0 30.0 0.0 0.013631 299 \n", + "1 30.0 0.0 0.128768 622 \n", + "2 30.0 0.0 0.021299 1367 \n", + "3 30.0 0.0 0.006238 299 \n", + "4 30.0 0.0 0.117883 622 \n", + "5 30.0 0.0 0.000000 1006 \n", + "6 30.0 0.0 0.026317 1367 \n", + "7 30.0 0.0 0.236502 1372 \n", + "8 30.0 0.0 0.230039 221 \n", + "9 30.0 0.0 0.003463 786 \n", + "\n", + " perturbation_id_source \n", + "0 kemmeren \n", + "1 kemmeren \n", + "2 kemmeren \n", + "3 kemmeren \n", + "4 kemmeren \n", + "5 kemmeren \n", + "6 kemmeren \n", + "7 kemmeren \n", + "8 kemmeren \n", + "9 kemmeren " + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "harbison_kemmeren = vdb.query(\"\"\"\n", + " SELECT\n", + " h.sample_id,\n", + " h.regulator_symbol,\n", + " h.regulator_locus_tag,\n", + " h.condition,\n", + " h.carbon_source,\n", + " h.temperature_celsius,\n", + " d.dto_empirical_pvalue,\n", + " d.dto_fdr,\n", + " d.perturbation_id_id AS kemmeren_sample_id,\n", + " d.perturbation_id_source\n", + " FROM harbison_meta h\n", + " JOIN dto_expanded d\n", + " ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source = 'harbison'\n", + " WHERE d.perturbation_id_source = 'kemmeren'\n", + " AND d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + "\"\"\")\n", + "\n", + "print(f\"Number of records that meet the criteria: 
{len(harbison_kemmeren)}\")\n", + "harbison_kemmeren.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "cd3e4a4c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of records that meet the criteria: 389\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagcarbon_sourcetemperature_celsiusdto_empirical_pvaluedto_fdrkemmeren_sample_idperturbation_id_source
0478YLR182Wglucose250.00.084524913kemmeren
1291YGL237Cglucose250.00.010760521kemmeren
2208YEL018Wglucose250.00.218327361kemmeren
3487YLR256Wglucose250.00.000000932kemmeren
4269YGL071Wglucose250.00.174709471kemmeren
532YBR010Wglucose250.00.24030960kemmeren
6470YLR098Cglucose250.00.000020894kemmeren
7467YLR085Cglucose250.00.252903889kemmeren
8639YOL004Wglucose250.00.2153191233kemmeren
921YBL021Cglucose250.00.05686439kemmeren
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_locus_tag carbon_source temperature_celsius \\\n", + "0 478 YLR182W glucose 25 \n", + "1 291 YGL237C glucose 25 \n", + "2 208 YEL018W glucose 25 \n", + "3 487 YLR256W glucose 25 \n", + "4 269 YGL071W glucose 25 \n", + "5 32 YBR010W glucose 25 \n", + "6 470 YLR098C glucose 25 \n", + "7 467 YLR085C glucose 25 \n", + "8 639 YOL004W glucose 25 \n", + "9 21 YBL021C glucose 25 \n", + "\n", + " dto_empirical_pvalue dto_fdr kemmeren_sample_id perturbation_id_source \n", + "0 0.0 0.084524 913 kemmeren \n", + "1 0.0 0.010760 521 kemmeren \n", + "2 0.0 0.218327 361 kemmeren \n", + "3 0.0 0.000000 932 kemmeren \n", + "4 0.0 0.174709 471 kemmeren \n", + "5 0.0 0.240309 60 kemmeren \n", + "6 0.0 0.000020 894 kemmeren \n", + "7 0.0 0.252903 889 kemmeren \n", + "8 0.0 0.215319 1233 kemmeren \n", + "9 0.0 0.056864 39 kemmeren " + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rossi_kemmeren = vdb.query(\"\"\"\n", + " SELECT\n", + " r.sample_id,\n", + " r.regulator_locus_tag,\n", + " r.carbon_source,\n", + " r.temperature_celsius,\n", + " d.dto_empirical_pvalue,\n", + " d.dto_fdr,\n", + " d.perturbation_id_id AS kemmeren_sample_id,\n", + " d.perturbation_id_source\n", + " FROM (\n", + " SELECT DISTINCT\n", + " sample_id,\n", + " regulator_locus_tag,\n", + " carbon_source,\n", + " temperature_celsius\n", + " FROM rossi_2021_af_combined_meta\n", + " ) r\n", + " JOIN dto_expanded d\n", + " ON CAST(r.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source = 'rossi_2021_af_combined'\n", + " WHERE d.perturbation_id_source = 'kemmeren'\n", + " AND d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + "\"\"\")\n", + "\n", + "print(f\"Number of records that meet the criteria: {len(rossi_kemmeren)}\")\n", + "rossi_kemmeren.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "6071cf05", + "metadata": {}, + "outputs": [ 
+ { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of records that meet the criteria: 262\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagregulator_symbolconditiondto_empirical_pvaluedto_fdrkemmeren_sample_idperturbation_id_source
01YBL005WPDR3standard0.00.07766731kemmeren
15YBR083WTEC1standard0.00.06585974kemmeren
2100YJL056CZAP1standard0.00.302421709kemmeren
3101YJL110CGZF3standard0.00.117351723kemmeren
4102YJL176CSWI3standard0.00.127255736kemmeren
59YBR239CERT1standard0.00.147725106kemmeren
6104YJR060WCBF1standard0.00.166077754kemmeren
7106YJR140CHIR3standard0.00.358890772kemmeren
890YHR178WSTB5standard0.00.315486642kemmeren
911YBR289WSNF5standard0.00.133379120kemmeren
\n", + "
" + ], + "text/plain": [ + " sample_id regulator_locus_tag regulator_symbol condition \\\n", + "0 1 YBL005W PDR3 standard \n", + "1 5 YBR083W TEC1 standard \n", + "2 100 YJL056C ZAP1 standard \n", + "3 101 YJL110C GZF3 standard \n", + "4 102 YJL176C SWI3 standard \n", + "5 9 YBR239C ERT1 standard \n", + "6 104 YJR060W CBF1 standard \n", + "7 106 YJR140C HIR3 standard \n", + "8 90 YHR178W STB5 standard \n", + "9 11 YBR289W SNF5 standard \n", + "\n", + " dto_empirical_pvalue dto_fdr kemmeren_sample_id perturbation_id_source \n", + "0 0.0 0.077667 31 kemmeren \n", + "1 0.0 0.065859 74 kemmeren \n", + "2 0.0 0.302421 709 kemmeren \n", + "3 0.0 0.117351 723 kemmeren \n", + "4 0.0 0.127255 736 kemmeren \n", + "5 0.0 0.147725 106 kemmeren \n", + "6 0.0 0.166077 754 kemmeren \n", + "7 0.0 0.358890 772 kemmeren \n", + "8 0.0 0.315486 642 kemmeren \n", + "9 0.0 0.133379 120 kemmeren " + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mahendrawada_kemmeren = vdb.query(\"\"\"\n", + " SELECT\n", + " m.sample_id,\n", + " m.regulator_locus_tag,\n", + " m.regulator_symbol,\n", + " m.condition,\n", + " d.dto_empirical_pvalue,\n", + " d.dto_fdr,\n", + " d.perturbation_id_id AS kemmeren_sample_id,\n", + " d.perturbation_id_source\n", + " FROM (\n", + " SELECT DISTINCT\n", + " sample_id,\n", + " regulator_locus_tag,\n", + " regulator_symbol,\n", + " condition\n", + " FROM chec_mahendrawada_m2025_af_combined_meta\n", + " ) m\n", + " JOIN dto_expanded d\n", + " ON CAST(m.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source LIKE '%mahendrawada%af_combined%'\n", + " WHERE d.perturbation_id_source = 'kemmeren'\n", + " AND d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + "\"\"\")\n", + "\n", + "print(f\"Number of records that meet the criteria: {len(mahendrawada_kemmeren)}\")\n", + "mahendrawada_kemmeren.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": 
"274eab77", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of records after merging: 875\n", + "\n", + "Distribution among datasets:\n", + "source_dataset\n", + "rossi 389\n", + "mahendrawada 262\n", + "harbison 224\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagdto_empirical_pvaluedto_fdrkemmeren_sample_idperturbation_id_sourcesource_dataset
070YDR310C0.00.013631299kemmerenharbison
1151YHR084W0.00.128768622kemmerenharbison
2319YPL038W0.00.0212991367kemmerenharbison
370YDR310C0.00.006238299kemmerenharbison
4151YHR084W0.00.117883622kemmerenharbison
5242YML051W0.00.0000001006kemmerenharbison
6319YPL038W0.00.0263171367kemmerenharbison
7323YPL049C0.00.2365021372kemmerenharbison
849YDR043C0.00.230039221kemmerenharbison
9198YKL038W0.00.003463786kemmerenharbison
\n", + "
import pandas as pd

# Columns present in all three per-dataset Kemmeren result frames; only these
# (plus the dataset label added below) are kept in the combined table.
common_cols = [
    "sample_id", "regulator_locus_tag",
    "dto_empirical_pvalue", "dto_fdr",
    "kemmeren_sample_id", "perturbation_id_source",
]

# Label -> query result. Insertion order fixes the row order of the combined frame.
kemmeren_by_dataset = {
    "harbison": harbison_kemmeren,
    "rossi": rossi_kemmeren,
    "mahendrawada": mahendrawada_kemmeren,
}

# Stack the three results into one frame, tagging every row with the binding
# dataset it came from. The label is attached with .assign() on the
# column-selected copy, so the query-result frames shown in earlier cells are
# left untouched rather than gaining a column in place.
kemmeren_all = pd.concat(
    [
        frame[common_cols].assign(source_dataset=label)
        for label, frame in kemmeren_by_dataset.items()
    ],
    ignore_index=True,
)

print(f"Total number of records after merging: {len(kemmeren_all)}")
print("\nDistribution among datasets:")
print(kemmeren_all["source_dataset"].value_counts())
kemmeren_all.head(10)
}, + { + "cell_type": "code", + "execution_count": 63, + "id": "437193f8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of significant samples in Hackett: 404\n", + "Number of significant samples in Kemmeren: 472\n", + "Intersection (active set): 278\n", + "\n", + "Distribution among datasets:\n", + "source_dataset\n", + "mahendrawada 104\n", + "harbison 96\n", + "rossi 78\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
source_datasetsample_idregulator_locus_tag
0harbison16YBR083W
1harbison248YMR016C
2harbison50YDR043C
3harbison323YPL049C
4harbison281YNL199C
5harbison86YER040W
6harbison282YNL199C
7harbison18YBR083W
8harbison7YBL103C
9harbison225YLR176C
\n", + "
# A binding sample is identified by its dataset together with its sample id;
# the regulator locus tag is carried along so it is available for the summary.
sample_key = ["source_dataset", "sample_id", "regulator_locus_tag"]

# One row per distinct key on each side.
hackett_keys = hackett_all[sample_key].drop_duplicates()
kemmeren_keys = kemmeren_all[sample_key].drop_duplicates()

# Keys present on both sides: the inner join is the set intersection.
active_set = hackett_keys.merge(kemmeren_keys, how="inner", on=sample_key)

n_hackett = len(hackett_keys)
n_kemmeren = len(kemmeren_keys)
n_active = len(active_set)

print(f"Number of significant samples in Hackett: {n_hackett}")
print(f"Number of significant samples in Kemmeren: {n_kemmeren}")
print(f"Intersection (active set): {n_active}")
print(f"\nDistribution among datasets:")
print(active_set["source_dataset"].value_counts())
active_set.head(10)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
regulator_locus_tagn_active_samplesdatasets
0YEL009C18harbison, mahendrawada, rossi
1YHR206W6harbison, mahendrawada, rossi
2YBL103C6harbison, mahendrawada, rossi
3YPL049C6harbison, mahendrawada, rossi
4YDR043C5harbison, mahendrawada, rossi
5YDL056W5harbison, mahendrawada, rossi
6YHR084W5harbison, mahendrawada, rossi
7YMR037C5harbison, mahendrawada
8YNL068C5harbison, mahendrawada, rossi
9YER040W4harbison, mahendrawada, rossi
10YNL314W4harbison, mahendrawada
11YJR060W4harbison, mahendrawada, rossi
12YIR023W4harbison, mahendrawada
13YOR028C4harbison, rossi
14YPL038W4harbison, mahendrawada, rossi
15YKR099W4harbison, mahendrawada, rossi
16YLR451W4harbison, mahendrawada, rossi
17YOR358W4harbison, mahendrawada, rossi
18YBR083W4harbison, mahendrawada
19YDR310C3harbison, mahendrawada, rossi
\n", + "
# Per-regulator summary of the active set.
#
# A binding sample is identified by (source_dataset, sample_id): sample ids are
# assigned per dataset, so the same integer can occur in more than one dataset.
# Counting distinct sample_id values alone would merge such samples and
# undercount; instead the rows are first reduced to unique
# (source_dataset, sample_id, regulator_locus_tag) keys and then counted.
active_samples = active_set.drop_duplicates(
    subset=["source_dataset", "sample_id", "regulator_locus_tag"]
)

regulator_counts = (
    active_samples
    .groupby("regulator_locus_tag")
    .agg(
        # number of distinct (dataset, sample) pairs for this regulator
        n_active_samples=("sample_id", "size"),
        # alphabetically sorted, comma-separated list of contributing datasets
        datasets=("source_dataset", lambda names: ", ".join(sorted(names.unique()))),
    )
    .reset_index()
    # Most active samples first; ties are broken by locus tag so the displayed
    # table is the same on every run.
    .sort_values(
        ["n_active_samples", "regulator_locus_tag"],
        ascending=[False, True],
        ignore_index=True,
    )
)

print(f"There are {len(regulator_counts)} different regulators in the active set.")
regulator_counts.head(20)
import matplotlib.pyplot as plt

# Index: number of active samples per regulator (ascending).
# Values: how many regulators have that many active samples.
dist = regulator_counts["n_active_samples"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
dist.plot.bar(ax=ax)
ax.set(
    xlabel="Number of active samples per regulator",
    ylabel="Number of regulators (TFs)",
    title="Distribution of active sample counts across TFs\n(DTO P≤0.01 in both Hackett & Kemmeren)",
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

# Annotate each bar with its height; the running position in `dist` is used
# as the x coordinate of the label.
for position, n_regulators in enumerate(dist.values):
    ax.text(position, n_regulators + 0.5, str(n_regulators), ha="center", fontsize=9)

plt.tight_layout()
plt.show()
sample_id: + field: sample_id + regulator_locus_tag: + field: regulator_locus_tag + target_locus_tag: + field: target_locus_tag + + BrentLab/mahendrawada_2025: + temperature_celsius: + path: experimental_conditions.temperature_celsius + dataset: + # binding + chec_mahendrawada_m2025_af_combined_meta: + sample_id: + field: sample_id + regulator_locus_tag: + field: regulator_locus_tag + regulator_symbol: + field: regulator_symbol + + + BrentLab/hughes_2006: + dataset: + # perturbation + overexpression: + sample_id: + field: sample_id + carbon_source: + path: experimental_conditions.media.carbon_source.compound + temperature_celsius: + path: experimental_conditions.temperature_celsius + regulator_locus_tag: + field: regulator_locus_tag + regulator_symbol: + field: regulator_symbol + # perturbation + knockout: + sample_id: + field: sample_id + carbon_source: + path: experimental_conditions.media.carbon_source.compound + temperature_celsius: + path: experimental_conditions.temperature_celsius + regulator_locus_tag: + field: regulator_locus_tag + regulator_symbol: + field: regulator_symbol + + BrentLab/kemmeren_2014: + dataset: + # perturbation + kemmeren_2014: + db_name: kemmeren + sample_id: + field: sample_id + carbon_source: + path: experimental_conditions.media.carbon_source.compound + temperature_celsius: + path: experimental_conditions.temperature_celsius + regulator_locus_tag: + field: regulator_locus_tag + regulator_symbol: + field: regulator_symbol + + BrentLab/hackett_2020: + dataset: + # perturbation + hackett_2020: + db_name: hackett + sample_id: + field: sample_id + carbon_source: + path: experimental_conditions.media.carbon_source.compound + temperature_celsius: + path: experimental_conditions.temperature_celsius + dtype: numeric + regulator_locus_tag: + field: regulator_locus_tag + regulator_symbol: + field: regulator_symbol + + BrentLab/yeast_comparative_analysis: + dataset: + dto: + dto_pvalue: + field: dto_empirical_pvalue + dto_fdr: + field: dto_fdr + 
links: + binding_id: + - [BrentLab/harbison_2004, harbison_2004] + - [BrentLab/rossi_2021, rossi_2021_af_combined] + - [BrentLab/mahendrawada_2025, chec_mahendrawada_m2025_af_combined_meta] + perturbation_id: + - [BrentLab/kemmeren_2014, kemmeren_2014] + - [BrentLab/hackett_2020, hackett_2020] + - [BrentLab/hughes_2006, overexpression] + - [BrentLab/hughes_2006, knockout] + +factor_aliases: + carbon_source: + glucose: [D-glucose, dextrose, glu] + galactose: [D-galactose, gal] + raffinose: [D-raffinose] + +missing_value_labels: + carbon_source: unspecified + +description: + carbon_source: The carbon source provided during growth + temperature_celsius: Growth temperature in degrees Celsius