diff --git a/docs/tutorials/virtual_db_tutorial.ipynb b/docs/tutorials/virtual_db_tutorial.ipynb index c5f87f6..7305146 100644 --- a/docs/tutorials/virtual_db_tutorial.ipynb +++ b/docs/tutorials/virtual_db_tutorial.ipynb @@ -1,3087 +1,5050 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# VirtualDB Tutorial: Unified Cross-Dataset Queries\n", - "\n", - "The `VirtualDB` class provides a unified query interface across heterogeneous datasets with different experimental condition structures and terminologies. Each dataset defines conditions in its own way, with properties at different hierarchy levels and using different naming conventions. VirtualDB uses external YAML configuration to:\n", - "\n", - "- Map varying structures to a common schema\n", - "- Normalize factor level names (e.g., \"D-glucose\", \"dextrose\", \"glu\" all become \"glucose\")\n", - "- Enable cross-dataset queries with standardized field names and values\n", - "\n", - "In this tutorial, we'll explore how to use VirtualDB to query and compare data across multiple datasets." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating a VirtualDB Specification\n", - "\n", - "VirtualDB requires a YAML configuration file that defines:\n", - "- Which datasets to include\n", - "- How to map their fields to common names\n", - "- How to normalize factor levels" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Configuration saved to: /tmp/tmpnetd9hv1/vdb_config.yaml\n" - ] - } - ], - "source": [ - "# For this tutorial, we'll create a sample configuration\n", - "# In practice, you'd load this from a YAML file\n", - "config_yaml = \"\"\"\n", - "repositories:\n", - " BrentLab/harbison_2004:\n", - " dataset:\n", - " harbison_2004:\n", - " sample_id:\n", - " field: sample_id\n", - " carbon_source:\n", - " field: condition\n", - " path: media.carbon_source.compound\n", - " temperature_celsius:\n", - " field: condition\n", - " path: temperature_celsius\n", - " dtype: numeric\n", - " environmental_condition:\n", - " field: condition\n", - "\n", - " comparative_analyses:\n", - " - repo: BrentLab/yeast_comparative_analysis\n", - " dataset: dto\n", - " via_field: binding_id\n", - "\n", - " BrentLab/kemmeren_2014:\n", - " dataset:\n", - " kemmeren_2014:\n", - " sample_id:\n", - " field: sample_id\n", - " carbon_source:\n", - " path: media.carbon_source.compound\n", - " temperature_celsius:\n", - " path: temperature_celsius\n", - " dtype: numeric\n", - "\n", - " comparative_analyses:\n", - " - repo: BrentLab/yeast_comparative_analysis\n", - " dataset: dto\n", - " via_field: perturbation_id\n", - "\n", - "factor_aliases:\n", - " carbon_source:\n", - " glucose: [D-glucose, dextrose, glu]\n", - " galactose: [D-galactose, gal]\n", - " raffinose: [D-raffinose]\n", - "\n", - "missing_value_labels:\n", - " carbon_source: \"unspecified\"\n", - "\n", - "description:\n", - " carbon_source: The carbon source provided during growth\n", - " temperature_celsius: Growth temperature in degrees Celsius\n", - " environmental_condition: Named environmental condition\n", - "\"\"\"\n", - "\n", - "# Save config to temporary file\n", - "import tempfile\n", - "from pathlib import Path\n", - "\n", - "temp_config = Path(tempfile.mkdtemp()) / \"vdb_config.yaml\"\n", - "temp_config.write_text(config_yaml)\n", - "\n", - "print(f\"Configuration saved to: {temp_config}\")" - ] - }, - { - "cell_type": 
"code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/chase/code/tfbp/tfbpapi/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "VirtualDB initialized successfully!\n", - "Configured repositories: 2\n" - ] - } - ], - "source": [ - "from tfbpapi.virtual_db import VirtualDB\n", - "\n", - "# Initialize VirtualDB with the configuration\n", - "vdb = VirtualDB(str(temp_config))\n", - "\n", - "print(\"VirtualDB initialized successfully!\")\n", - "print(f\"Configured repositories: {len(vdb.config.repositories)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Schema Discovery\n", - "\n", - "The VirtualDB class provides methods to inspect the unified schema after loading the\n", - "configuration." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "All available fields:\n", - " - carbon_source\n", - " - environmental_condition\n", - " - sample_id\n", - " - temperature_celsius\n" - ] - } - ], - "source": [ - "# Get all fields defined in any dataset\n", - "all_fields = vdb.get_fields()\n", - "\n", - "print(\"All available fields:\")\n", - "for field in sorted(all_fields):\n", - " print(f\" - {field}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Common fields (present in all datasets):\n", - " - carbon_source\n", - " - temperature_celsius\n" - ] - } - ], - "source": [ - "# Get fields present in ALL datasets (common fields)\n", - "common_fields = vdb.get_common_fields()\n", - "\n", - "print(\"Common fields (present in all datasets):\")\n", - "for field in sorted(common_fields):\n", - " print(f\" - {field}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Datasets with comparative data\n", - "\n", - "\n", - "BrentLab/harbison_2004/harbison_2004:\n", - " - BrentLab/yeast_comparative_analysis/dto\n", - " via field: binding_id\n", - " fields available: 8\n", - "\n", - "BrentLab/kemmeren_2014/kemmeren_2014:\n", - " - BrentLab/yeast_comparative_analysis/dto\n", - " via field: perturbation_id\n", - " fields available: 8\n", - "Comparative data fields\n", - "\n", - "BrentLab/yeast_comparative_analysis/dto:\n", - " binding_id binding_rank_threshold binding_set_size \n", - " dto_empirical_pvalue dto_fdr perturbation_id \n", - " perturbation_rank_threshold perturbation_set_size \n" - ] - } - ], - "source": [ - "# Get fields that may be used to filter two or more datasets at a time\n", - "comp_info = vdb.get_comparative_analyses()\n", - "\n", - "print(\"Datasets with comparative data\\n\")\n", - "for primary_dataset, comparatives in sorted(comp_info[\"primary_to_comparative\"].items()):\n", - " print(f\"\\n{primary_dataset}:\")\n", - " for comp in comparatives:\n", - " comp_key = f\"{comp['comparative_repo']}/{comp['comparative_dataset']}\"\n", - " print(f\" - {comp_key}\")\n", - " print(f\" via field: {comp['via_field']}\")\n", - " num_fields = len(comp_info[\"comparative_fields\"].get(comp_key, 
[]))\n", - " print(f\" fields available: {num_fields}\")\n", - "\n", - "# Show fields available from comparative datasets\n", - "print(\"Comparative data fields\")\n", - "for comp_dataset, fields in sorted(comp_info[\"comparative_fields\"].items()):\n", - " print(f\"\\n{comp_dataset}:\")\n", - " if fields:\n", - " # Print in columns for better readability\n", - " fields_sorted = sorted(fields)\n", - " for i in range(0, len(fields_sorted), 3):\n", - " row_fields = fields_sorted[i:i + 3]\n", - " print(\" \" + \" \".join(f\"{f:<28}\" for f in row_fields))\n", - " else:\n", - " print(\" (no fields found)\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Discovering Valid Values\n", - "\n", - "VirtualDB can tell you what values exist for each field." - ] - }, + "cells": [ + { + "cell_type": "markdown", + "id": "cell-0", + "metadata": {}, + "source": [ + "# VirtualDB Tutorial: SQL-First Cross-Dataset Queries\n", + "\n", + "The `VirtualDB` class provides a SQL query interface across heterogeneous\n", + "Huggingface datasets stored in Parquet format. See the [VirtualDB configuration\n", + "documentation](../virtual_db_configuration.md) for more details on how to set up your\n", + "datasets for use with `VirtualDB`." + ] + }, + { + "cell_type": "markdown", + "id": "cell-1", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "The configuration for `VirtualDB` is defined in a YAML file that specifies the datasets to include, their locations, and any relevant metadata or mappings. Below is an example configuration:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "cell-2", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Unique carbon sources (normalized):\n", - " - galactose\n", - " - glucose\n", - " - raffinose\n", - " - unspecified\n" - ] - } - ], - "source": [ - "# Get all unique values for a field (normalized)\n", - "carbon_source_factor_levels = vdb.get_unique_values(\"carbon_source\")\n", - "\n", - "print(\"Unique carbon sources (normalized):\")\n", - "for source in sorted(carbon_source_factor_levels):\n", - " print(f\" - {source}\")" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Config saved to: /tmp/tmpf610qghb/vdb_config.yaml\n" + ] + } + ], + "source": [ + "config_yaml = \"\"\"\n", + "repositories:\n", + " BrentLab/harbison_2004:\n", + " dataset:\n", + " harbison_2004:\n", + " db_name: harbison\n", + " sample_id:\n", + " field: sample_id\n", + " carbon_source:\n", + " field: condition\n", + " path: media.carbon_source.compound\n", + " temperature_celsius:\n", + " field: condition\n", + " path: temperature_celsius\n", + " dtype: numeric\n", + " regulator_locus_tag:\n", + " field: regulator_locus_tag\n", + " regulator_symbol:\n", + " field: regulator_symbol\n", + "\n", + " BrentLab/kemmeren_2014:\n", + " dataset:\n", + " kemmeren_2014:\n", + " db_name: kemmeren\n", + " sample_id:\n", + " field: sample_id\n", + " carbon_source:\n", + " path: experimental_conditions.media.carbon_source.compound\n", + " temperature_celsius:\n", + " path: experimental_conditions.temperature_celsius\n", + " dtype: numeric\n", + " regulator_locus_tag:\n", + " field: regulator_locus_tag\n", + " regulator_symbol:\n", + " field: regulator_symbol\n", + "\n", + " BrentLab/hackett_2020:\n", + " dataset:\n", + " hackett_2020:\n", + " db_name: hackett\n", + " sample_id:\n", + " field: sample_id\n", + " 
carbon_source:\n", + " path: experimental_conditions.media.carbon_source.compound\n", + " temperature_celsius:\n", + " path: experimental_conditions.temperature_celsius\n", + " dtype: numeric\n", + " regulator_locus_tag:\n", + " field: regulator_locus_tag\n", + " regulator_symbol:\n", + " field: regulator_symbol\n", + "\n", + " BrentLab/yeast_comparative_analysis:\n", + " dataset:\n", + " dto:\n", + " dto_pvalue:\n", + " field: dto_empirical_pvalue\n", + " dto_fdr:\n", + " field: dto_fdr\n", + " links:\n", + " binding_id:\n", + " - [BrentLab/harbison_2004, harbison_2004]\n", + " perturbation_id:\n", + " - [BrentLab/kemmeren_2014, kemmeren_2014]\n", + " - [BrentLab/hackett_2020, hackett_2020]\n", + "\n", + "factor_aliases:\n", + " carbon_source:\n", + " glucose: [D-glucose, dextrose, glu]\n", + " galactose: [D-galactose, gal]\n", + " raffinose: [D-raffinose]\n", + "\n", + "missing_value_labels:\n", + " carbon_source: unspecified\n", + "\n", + "description:\n", + " carbon_source: The carbon source provided during growth\n", + " temperature_celsius: Growth temperature in degrees Celsius\n", + "\"\"\"\n", + "\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "temp_config = Path(tempfile.mkdtemp()) / \"vdb_config.yaml\"\n", + "temp_config.write_text(config_yaml)\n", + "print(f\"Config saved to: {temp_config}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-3", + "metadata": {}, + "source": [ + "## Initializing VirtualDB\n", + "\n", + "Creating a VirtualDB instance loads and validates the config but does\n", + "**not** download any data yet. Views are registered lazily on the first\n", + "`query()`, `tables()`, or `describe()` call." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cell-4", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Carbon sources by dataset:\n", - "\n", - "BrentLab/harbison_2004/harbison_2004:\n", - " - galactose\n", - " - glucose\n", - " - raffinose\n", - " - unspecified\n", - "\n", - "BrentLab/kemmeren_2014/kemmeren_2014:\n", - " - glucose\n" - ] - } - ], - "source": [ - "# Get values broken down by dataset\n", - "carbon_by_dataset = vdb.get_unique_values(\"carbon_source\", by_dataset=True)\n", - "\n", - "print(\"Carbon sources by dataset:\")\n", - "for dataset, sources in carbon_by_dataset.items():\n", - " print(f\"\\n{dataset}:\")\n", - " for source in sorted(sources):\n", - " print(f\" - {source}\")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/chase/code/tfbp/tfbpapi/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Simple Queries\n", - "\n", - "Now let's start querying data. The `query()` method is the primary interface." 
- ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "VirtualDB(4 repos, 4 datasets, views not yet registered)\n" + ] + } + ], + "source": [ + "from tfbpapi.virtual_db import VirtualDB\n", + "\n", + "# Pass an HF token if the repos are private:\n", + "# vdb = VirtualDB(str(temp_config), token=\"hf_...\")\n", + "vdb = VirtualDB(str(temp_config))\n", + "print(repr(vdb))" + ] + }, + { + "cell_type": "markdown", + "id": "cell-5", + "metadata": {}, + "source": [ + "## Schema Discovery\n", + "\n", + "Use `tables()`, `describe()`, `get_fields()`, and `get_common_fields()`\n", + "to explore the registered views before writing SQL.\n", + "\n", + "Note that primary datasets get **two** views each:\n", + "- `` -- the full measurement-level data (one row per sample-target pair)\n", + "- `_meta` -- deduplicated sample-level metadata (one row per sample),\n", + " including derived columns from config property mappings (e.g., `carbon_source` resolved from DataCard field definitions, with factor aliases applied)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cell-6", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Basic Query: All Samples with Glucose\n", - "\n", - "By default, queries return sample-level data (one row per sample) with all configured fields." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Registered views:\n" + ] }, { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 1797 samples with glucose\n", - "\n", - "Columns: ['sample_id', 'regulator_locus_tag', 'regulator_symbol', 'condition', 'carbon_source', 'temperature_celsius', 'dataset_id']\n", - "\n", - "First few rows:\n" - ] - }, - { - "data": { - "application/vnd.microsoft.datawrangler.viewer.v0+json": { - "columns": [ - { - "name": "index", - "rawType": "int64", - "type": "integer" - }, - { - "name": "sample_id", - "rawType": "int32", - "type": "integer" - }, - { - "name": "regulator_locus_tag", - "rawType": "object", - "type": "string" - }, - { - "name": "regulator_symbol", - "rawType": "object", - "type": "string" - }, - { - "name": "condition", - "rawType": "object", - "type": "string" - }, - { - "name": "carbon_source", - "rawType": "object", - "type": "string" - }, - { - "name": "temperature_celsius", - "rawType": "float64", - "type": "float" - }, - { - "name": "dataset_id", - "rawType": "object", - "type": "string" - } - ], - "ref": "b22d63b4-294f-452a-b64e-f79320f4da61", - "rows": [ - [ - "0", - "1", - "YSC0017", - "MATA1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "1", - "2", - "YAL051W", - "OAF1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "2", - "3", - "YBL005W", - "PDR3", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "3", - "4", - "YBL008W", - "HIR1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "4", - "5", - "YBL021C", - "HAP3", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ] - ], - "shape": { - "columns": 7, - "rows": 5 - } - }, - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_idregulator_locus_tagregulator_symbolconditioncarbon_sourcetemperature_celsiusdataset_id
01YSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
12YAL051WOAF1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
23YBL005WPDR3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
34YBL008WHIR1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
45YBL021CHAP3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
\n", - "
" - ], - "text/plain": [ - " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", - "0 1 YSC0017 MATA1 YPD glucose \n", - "1 2 YAL051W OAF1 YPD glucose \n", - "2 3 YBL005W PDR3 YPD glucose \n", - "3 4 YBL008W HIR1 YPD glucose \n", - "4 5 YBL021C HAP3 YPD glucose \n", - "\n", - " temperature_celsius dataset_id \n", - "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "3 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "4 30.0 BrentLab/harbison_2004/harbison_2004 " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Query all datasets for samples grown on glucose\n", - "glucose_samples = vdb.query(filters={\"carbon_source\": \"glucose\"})\n", - "\n", - "print(f\"Found {len(glucose_samples)} samples with glucose\")\n", - "print(f\"\\nColumns: {list(glucose_samples.columns)}\")\n", - "print(f\"\\nFirst few rows:\")\n", - "glucose_samples.head()" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 6374.32it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 12264.05it/s]\n", + "Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 9731.56it/s]\n", + "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 21883.33it/s]\n", + "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", + "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", + "Key 'carbon_source' not found at path 'media.carbon_source' (current keys: ['name'])\n", + "Key 'temperature_celsius' not found at path 'temperature_celsius' (current keys: ['description', 'initial_temperature_celsius', 'temperature_shift_celsius', 'temperature_shift_duration_minutes', 'growth_phase_at_harvest', 'media'])\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Query Specific Datasets\n", - "\n", - "Limit your query to specific datasets using the `datasets` parameter." 
- ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + " dto_expanded\n", + " hackett\n", + " hackett_meta\n", + " harbison\n", + " harbison_meta\n", + " kemmeren\n", + " kemmeren_meta\n" + ] + } + ], + "source": [ + "# List all registered views\n", + "print(\"Registered views:\")\n", + "for name in vdb.tables():\n", + " print(f\" {name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "pdebujnqb9q", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 310 samples from harbison_2004\n" - ] - }, - { - "data": { - "application/vnd.microsoft.datawrangler.viewer.v0+json": { - "columns": [ - { - "name": "index", - "rawType": "int64", - "type": "integer" - }, - { - "name": "sample_id", - "rawType": "int32", - "type": "integer" - }, - { - "name": "regulator_locus_tag", - "rawType": "object", - "type": "string" - }, - { - "name": "regulator_symbol", - "rawType": "object", - "type": "string" - }, - { - "name": "condition", - "rawType": "object", - "type": "string" - }, - { - "name": "carbon_source", - "rawType": "object", - "type": "string" - }, - { - "name": "temperature_celsius", - "rawType": "float64", - "type": "float" - }, - { - "name": "dataset_id", - "rawType": "object", - "type": "string" - } - ], - "ref": "0c9ebf95-0bf1-46d7-83ee-57b87c5def44", - "rows": [ - [ - "0", - "1", - "YSC0017", - "MATA1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "1", - "2", - "YAL051W", - "OAF1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "2", - "3", - "YBL005W", - "PDR3", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "3", - "4", - "YBL008W", - "HIR1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "4", - "5", - "YBL021C", - "HAP3", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ] - ], - "shape": { - "columns": 7, - "rows": 5 - } - }, - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_idregulator_locus_tagregulator_symbolconditioncarbon_sourcetemperature_celsiusdataset_id
01YSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
12YAL051WOAF1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
23YBL005WPDR3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
34YBL008WHIR1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
45YBL021CHAP3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
\n", - "
" - ], - "text/plain": [ - " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", - "0 1 YSC0017 MATA1 YPD glucose \n", - "1 2 YAL051W OAF1 YPD glucose \n", - "2 3 YBL005W PDR3 YPD glucose \n", - "3 4 YBL008W HIR1 YPD glucose \n", - "4 5 YBL021C HAP3 YPD glucose \n", - "\n", - " temperature_celsius dataset_id \n", - "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "3 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "4 30.0 BrentLab/harbison_2004/harbison_2004 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "table", + "rawType": "object", + "type": "string" + }, + { + "name": "column_name", + "rawType": "object", + "type": "string" + }, + { + "name": "column_type", + "rawType": "object", + "type": "string" + }, + { + "name": "null", + "rawType": "object", + "type": "string" + }, + { + "name": "key", + "rawType": "object", + "type": "unknown" + }, + { + "name": "default", + "rawType": "object", + "type": "unknown" + }, + { + "name": "extra", + "rawType": "object", + "type": "unknown" } + ], + "ref": "8720a362-ea0c-4293-9656-ba6725dcaa3d", + "rows": [ + [ + "0", + "harbison_meta", + "sample_id", + "INTEGER", + "YES", + null, + null, + null + ], + [ + "1", + "harbison_meta", + "regulator_locus_tag", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "2", + "harbison_meta", + "regulator_symbol", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "3", + "harbison_meta", + "condition", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "4", + "harbison_meta", + "carbon_source", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "5", + "harbison_meta", + "temperature_celsius", + "DOUBLE", + "YES", + null, + null, + null + ] + ], + "shape": { + "columns": 7, + "rows": 6 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tablecolumn_namecolumn_typenullkeydefaultextra
0harbison_metasample_idINTEGERYESNoneNoneNone
1harbison_metaregulator_locus_tagVARCHARYESNoneNoneNone
2harbison_metaregulator_symbolVARCHARYESNoneNoneNone
3harbison_metaconditionVARCHARYESNoneNoneNone
4harbison_metacarbon_sourceVARCHARYESNoneNoneNone
5harbison_metatemperature_celsiusDOUBLEYESNoneNoneNone
\n", + "
" ], - "source": [ - "# Query only harbison_2004\n", - "harbison_glucose = vdb.query(\n", - " filters={\"carbon_source\": \"glucose\"},\n", - " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")]\n", - ")\n", - "\n", - "print(f\"Found {len(harbison_glucose)} samples from harbison_2004\")\n", - "harbison_glucose.head()" + "text/plain": [ + " table column_name column_type null key default extra\n", + "0 harbison_meta sample_id INTEGER YES None None None\n", + "1 harbison_meta regulator_locus_tag VARCHAR YES None None None\n", + "2 harbison_meta regulator_symbol VARCHAR YES None None None\n", + "3 harbison_meta condition VARCHAR YES None None None\n", + "4 harbison_meta carbon_source VARCHAR YES None None None\n", + "5 harbison_meta temperature_celsius DOUBLE YES None None None" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Select Specific Fields\n", - "\n", - "Return only the fields you need with the `fields` parameter." - ] - }, + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The _meta view has sample-level metadata plus derived columns\n", + "# (carbon_source, temperature_celsius resolved from condition definitions)\n", + "vdb.describe(\"harbison_meta\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9deee334", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Columns: ['sample_id', 'carbon_source', 'temperature_celsius', 'dataset_id']\n" - ] - }, - { - "data": { - "application/vnd.microsoft.datawrangler.viewer.v0+json": { - "columns": [ - { - "name": "index", - "rawType": "int64", - "type": "integer" - }, - { - "name": "sample_id", - "rawType": "int32", - "type": "integer" - }, - { - "name": "carbon_source", - "rawType": "object", - "type": "string" - }, - { - "name": "temperature_celsius", - "rawType": "float64", - "type": "float" - }, - { - "name": "dataset_id", - "rawType": "object", - "type": "string" - } - ], - "ref": "38c6fa44-08cf-4751-9476-7bca3cb1c41c", - "rows": [ - [ - "0", - "1", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "1", - "2", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "2", - "3", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "3", - "4", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "4", - "5", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ] - ], - "shape": { - "columns": 4, - "rows": 5 - } - }, - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_idcarbon_sourcetemperature_celsiusdataset_id
01glucose30.0BrentLab/harbison_2004/harbison_2004
12glucose30.0BrentLab/harbison_2004/harbison_2004
23glucose30.0BrentLab/harbison_2004/harbison_2004
34glucose30.0BrentLab/harbison_2004/harbison_2004
45glucose30.0BrentLab/harbison_2004/harbison_2004
\n", - "
" - ], - "text/plain": [ - " sample_id carbon_source temperature_celsius \\\n", - "0 1 glucose 30.0 \n", - "1 2 glucose 30.0 \n", - "2 3 glucose 30.0 \n", - "3 4 glucose 30.0 \n", - "4 5 glucose 30.0 \n", - "\n", - " dataset_id \n", - "0 BrentLab/harbison_2004/harbison_2004 \n", - "1 BrentLab/harbison_2004/harbison_2004 \n", - "2 BrentLab/harbison_2004/harbison_2004 \n", - "3 BrentLab/harbison_2004/harbison_2004 \n", - "4 BrentLab/harbison_2004/harbison_2004 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "table", + "rawType": "object", + "type": "string" + }, + { + "name": "column_name", + "rawType": "object", + "type": "string" + }, + { + "name": "column_type", + "rawType": "object", + "type": "string" + }, + { + "name": "null", + "rawType": "object", + "type": "string" + }, + { + "name": "key", + "rawType": "object", + "type": "unknown" + }, + { + "name": "default", + "rawType": "object", + "type": "unknown" + }, + { + "name": "extra", + "rawType": "object", + "type": "unknown" } + ], + "ref": "001db2c7-a5c2-4561-9b12-35733ce1b2e6", + "rows": [ + [ + "0", + "harbison", + "sample_id", + "INTEGER", + "YES", + null, + null, + null + ], + [ + "1", + "harbison", + "db_id", + "DOUBLE", + "YES", + null, + null, + null + ], + [ + "2", + "harbison", + "regulator_locus_tag", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "3", + "harbison", + "regulator_symbol", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "4", + "harbison", + "condition", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "5", + "harbison", + "target_locus_tag", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "6", + "harbison", + "target_symbol", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "7", + "harbison", + "effect", + "DOUBLE", + "YES", + null, + null, + null + ], + [ + "8", + "harbison", + "pvalue", + "DOUBLE", + "YES", + null, + null, + null + ], + [ + "9", + "harbison", + "carbon_source", + "VARCHAR", + "YES", + null, + null, + null + ], + [ + "10", + "harbison", + "temperature_celsius", + "DOUBLE", + "YES", + null, + null, + null + ] + ], + "shape": { + "columns": 7, + "rows": 11 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tablecolumn_namecolumn_typenullkeydefaultextra
0harbisonsample_idINTEGERYESNoneNoneNone
1harbisondb_idDOUBLEYESNoneNoneNone
2harbisonregulator_locus_tagVARCHARYESNoneNoneNone
3harbisonregulator_symbolVARCHARYESNoneNoneNone
4harbisonconditionVARCHARYESNoneNoneNone
5harbisontarget_locus_tagVARCHARYESNoneNoneNone
6harbisontarget_symbolVARCHARYESNoneNoneNone
7harbisoneffectDOUBLEYESNoneNoneNone
8harbisonpvalueDOUBLEYESNoneNoneNone
9harbisoncarbon_sourceVARCHARYESNoneNoneNone
10harbisontemperature_celsiusDOUBLEYESNoneNoneNone
\n", + "
" ], - "source": [ - "# Get just sample_id, carbon_source, and temperature\n", - "minimal_data = vdb.query(\n", - " filters={\"carbon_source\": \"glucose\"},\n", - " fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\"]\n", - ")\n", - "\n", - "print(f\"Columns: {list(minimal_data.columns)}\")\n", - "minimal_data.head()" + "text/plain": [ + " table column_name column_type null key default extra\n", + "0 harbison sample_id INTEGER YES None None None\n", + "1 harbison db_id DOUBLE YES None None None\n", + "2 harbison regulator_locus_tag VARCHAR YES None None None\n", + "3 harbison regulator_symbol VARCHAR YES None None None\n", + "4 harbison condition VARCHAR YES None None None\n", + "5 harbison target_locus_tag VARCHAR YES None None None\n", + "6 harbison target_symbol VARCHAR YES None None None\n", + "7 harbison effect DOUBLE YES None None None\n", + "8 harbison pvalue DOUBLE YES None None None\n", + "9 harbison carbon_source VARCHAR YES None None None\n", + "10 harbison temperature_celsius DOUBLE YES None None None" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Advanced Queries\n", - "\n", - "VirtualDB supports more sophisticated query patterns." - ] - }, + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The full view has measurement-level data (one row per sample-target pair)\n", + "vdb.describe(\"harbison\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cell-9", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multiple Filter Conditions" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Common fields: ['carbon_source', 'regulator_locus_tag', 'regulator_symbol', 'sample_id', 'temperature_celsius']\n" + ] + } + ], + "source": [ + "# Columns common to ALL primary dataset views\n", + "print(\"Common fields:\", vdb.get_common_fields())" + ] + }, + { + "cell_type": "markdown", + "id": "cell-10", + "metadata": {}, + "source": [ + "## Querying VirtualDB\n", + "\n", + "The `.query()` method executes SQL queries against the registered views. You can write complex SQL queries that join across multiple datasets, filter based on metadata, and aggregate results as needed. \n", + "\n", + "You can also use parameterized queries to safely inject variables into your SQL statements, and prepared statements for repeated queries with different parameters. \n", + "Named prepared statements can be passed to `.prepare()` and then executed with\n", + "`.query()` with any parameterized values passed in as an arbitrary number of key/value\n", + "arguments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1a705f1c", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 1791 samples with glucose at 30C\n" - ] - }, - { - "data": { - "application/vnd.microsoft.datawrangler.viewer.v0+json": { - "columns": [ - { - "name": "index", - "rawType": "int64", - "type": "integer" - }, - { - "name": "sample_id", - "rawType": "int32", - "type": "integer" - }, - { - "name": "regulator_locus_tag", - "rawType": "object", - "type": "string" - }, - { - "name": "regulator_symbol", - "rawType": "object", - "type": "string" - }, - { - "name": "condition", - "rawType": "object", - "type": "string" - }, - { - "name": "carbon_source", - "rawType": "object", - "type": "string" - }, - { - "name": "temperature_celsius", - "rawType": "float64", - "type": "float" - }, - { - "name": "dataset_id", - "rawType": "object", - "type": "string" - } - ], - "ref": "357f69ed-ad79-4401-8458-5a1cc48f14c5", - "rows": [ - [ - "0", - "1", - "YSC0017", - "MATA1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "1", - "2", - "YAL051W", - "OAF1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "2", - "3", - "YBL005W", - "PDR3", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "3", - "4", - "YBL008W", - "HIR1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "4", - "5", - "YBL021C", - "HAP3", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ] - ], - "shape": { - "columns": 7, - "rows": 5 - } - }, - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_idregulator_locus_tagregulator_symbolconditioncarbon_sourcetemperature_celsiusdataset_id
01YSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
12YAL051WOAF1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
23YBL005WPDR3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
34YBL008WHIR1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
45YBL021CHAP3YPDglucose30.0BrentLab/harbison_2004/harbison_2004
\n", - "
" - ], - "text/plain": [ - " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", - "0 1 YSC0017 MATA1 YPD glucose \n", - "1 2 YAL051W OAF1 YPD glucose \n", - "2 3 YBL005W PDR3 YPD glucose \n", - "3 4 YBL008W HIR1 YPD glucose \n", - "4 5 YBL021C HAP3 YPD glucose \n", - "\n", - " temperature_celsius dataset_id \n", - "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "3 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "4 30.0 BrentLab/harbison_2004/harbison_2004 " - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" } + ], + "ref": "e5bb4909-b231-44d7-85b8-5219b51f4a4b", + "rows": [ + [ + "0", + "166", + "YIL131C", + "FKH1", + "YPD", + "glucose", + "30.0" + ], + [ + "1", + "3", + "YBL005W", + "PDR3", + "YPD", + "glucose", + "30.0" + ], + [ + "2", + "173", + "YIR023W", + "DAL81", + "YPD", + "glucose", + "30.0" + ], + [ + "3", + "220", + "YLR014C", + "PPR1", + "YPD", + "glucose", + "30.0" + ], + [ + "4", + "83", + "YEL009C", + "GCN4", + "YPD", + "glucose", + "30.0" + ] + ], + "shape": { + "columns": 6, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagregulator_symbolconditioncarbon_sourcetemperature_celsius
0166YIL131CFKH1YPDglucose30.0
13YBL005WPDR3YPDglucose30.0
2173YIR023WDAL81YPDglucose30.0
3220YLR014CPPR1YPDglucose30.0
483YEL009CGCN4YPDglucose30.0
\n", + "
" ], - "source": [ - "# Samples with glucose at 30C\n", - "glucose_30c = vdb.query(\n", - " filters={\n", - " \"carbon_source\": \"glucose\",\n", - " \"temperature_celsius\": 30\n", - " }\n", - ")\n", - "\n", - "print(f\"Found {len(glucose_30c)} samples with glucose at 30C\")\n", - "glucose_30c.head()" + "text/plain": [ + " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", + "0 166 YIL131C FKH1 YPD glucose \n", + "1 3 YBL005W PDR3 YPD glucose \n", + "2 173 YIR023W DAL81 YPD glucose \n", + "3 220 YLR014C PPR1 YPD glucose \n", + "4 83 YEL009C GCN4 YPD glucose \n", + "\n", + " temperature_celsius \n", + "0 30.0 \n", + "1 30.0 \n", + "2 30.0 \n", + "3 30.0 \n", + "4 30.0 " ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Numeric Range Queries" - ] - }, + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Query the _meta view for sample-level metadata (one row per sample)\n", + "# Note: carbon_source is derived from the condition column's DataCard definitions\n", + "# with factor aliases already applied (D-glucose -> glucose)\n", + "df_meta = vdb.query(\"SELECT * FROM harbison_meta LIMIT 5\")\n", + "df_meta" + ] + }, + { + "cell_type": "markdown", + "id": "cell-16", + "metadata": {}, + "source": [ + "## 5. Parameterized Queries\n", + "\n", + "Pass keyword arguments to `query()` and reference them with\n", + "DuckDB's `$name` syntax." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cell-17", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 1833 samples at >= 30C\n", - "Found 1833 samples between 28-32C\n" - ] + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "db_id", + "rawType": "float64", + "type": "float" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "target_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "target_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "effect", + "rawType": "float64", + "type": "float" + }, + { + "name": "pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" } + ], + "ref": "a6cb8a91-c1c2-4bc8-af51-12e900d7a4bf", + "rows": [ + [ + "0", + "14", + "13.0", + "YBR049C", + "REB1", + "H2O2Lo", + "YPR204W", + "YPR204W", + "0.78449615", + "0.53566521", + "glucose", + "30.0" + ], + [ + "1", + "14", + "13.0", + "YBR049C", + "REB1", + "H2O2Lo", + "YPR203W", + "YPR203W", + "1.4509147", + "0.95955603", + "glucose", + "30.0" + ], + [ + "2", + "14", + "13.0", + "YBR049C", + "REB1", + "H2O2Lo", + "YPR202W", + "YPR202W", + "1.4509147", + "0.95955603", + "glucose", + "30.0" + ], + [ + "3", + "14", + "13.0", + "YBR049C", + "REB1", + "H2O2Lo", + "YPR201W", + "ARR3", + "0.92586339", + "0.45367192", + "glucose", + "30.0" + ], + [ + "4", + "14", + "13.0", + "YBR049C", + "REB1", + "H2O2Lo", + "YPR200C", + "ARR2", + 
"0.92586339", + "0.45367192", + "glucose", + "30.0" + ] + ], + "shape": { + "columns": 11, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_iddb_idregulator_locus_tagregulator_symbolconditiontarget_locus_tagtarget_symboleffectpvaluecarbon_sourcetemperature_celsius
01413.0YBR049CREB1H2O2LoYPR204WYPR204W0.7844960.535665glucose30.0
11413.0YBR049CREB1H2O2LoYPR203WYPR203W1.4509150.959556glucose30.0
21413.0YBR049CREB1H2O2LoYPR202WYPR202W1.4509150.959556glucose30.0
31413.0YBR049CREB1H2O2LoYPR201WARR30.9258630.453672glucose30.0
41413.0YBR049CREB1H2O2LoYPR200CARR20.9258630.453672glucose30.0
\n", + "
" ], - "source": [ - "# Samples at temperature >= 30C\n", - "warm_samples = vdb.query(\n", - " filters={\"temperature_celsius\": (\">=\", 30)}\n", - ")\n", - "\n", - "print(f\"Found {len(warm_samples)} samples at >= 30C\")\n", - "\n", - "# Samples between 28C and 32C\n", - "moderate_temp = vdb.query(\n", - " filters={\"temperature_celsius\": (\"between\", 28, 32)}\n", - ")\n", - "\n", - "print(f\"Found {len(moderate_temp)} samples between 28-32C\")" + "text/plain": [ + " sample_id db_id regulator_locus_tag regulator_symbol condition \\\n", + "0 14 13.0 YBR049C REB1 H2O2Lo \n", + "1 14 13.0 YBR049C REB1 H2O2Lo \n", + "2 14 13.0 YBR049C REB1 H2O2Lo \n", + "3 14 13.0 YBR049C REB1 H2O2Lo \n", + "4 14 13.0 YBR049C REB1 H2O2Lo \n", + "\n", + " target_locus_tag target_symbol effect pvalue carbon_source \\\n", + "0 YPR204W YPR204W 0.784496 0.535665 glucose \n", + "1 YPR203W YPR203W 1.450915 0.959556 glucose \n", + "2 YPR202W YPR202W 1.450915 0.959556 glucose \n", + "3 YPR201W ARR3 0.925863 0.453672 glucose \n", + "4 YPR200C ARR2 0.925863 0.453672 glucose \n", + "\n", + " temperature_celsius \n", + "0 30.0 \n", + "1 30.0 \n", + "2 30.0 \n", + "3 30.0 \n", + "4 30.0 " ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Factor Alias Expansion\n", - "\n", - "When you query for a normalized value, VirtualDB automatically expands to all original aliases." - ] - }, + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# A parameterized query has the following form, where `$reg` is a placeholder\n", + "# that gets replaced with the value provided in the `reg` argument.\n", + "vdb.query(\n", + " \"SELECT * FROM harbison WHERE regulator_symbol = $reg LIMIT 5\",\n", + " reg=\"REB1\",\n", + ")\n", + "\n", + "# A parameterized query can be saved for future use with the `.prepare()` method" + ] + }, + { + "cell_type": "markdown", + "id": "cell-18", + "metadata": {}, + "source": [ + "## Prepared Queries\n", + "\n", + "Use `prepare()` to register a named, reusable query template.\n", + "Then call it by name via `query()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cell-19", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", - "0 68 YDR277C MTH1 GAL galactose \n", - "1 112 YGL035C MIG1 GAL galactose \n", - "2 197 YKL038W RGT1 GAL galactose \n", - "3 335 YPL248C GAL4 GAL galactose \n", - "\n", - " temperature_celsius dataset_id \n", - "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "3 30.0 BrentLab/harbison_2004/harbison_2004 \n" - ] + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "n", + "rawType": "int64", + "type": "integer" } + ], + "ref": "9234aaf4-a313-42c2-838a-a13568eed01d", + "rows": [ + [ + "0", + "MSN2", + "6" + ], + [ + "1", + "MSN4", + "5" + ], + [ + "2", + "HSF1", + "4" + ], + [ + "3", + "STE12", + "4" + ], + [ + "4", + "RTG3", + "4" + ], + [ + "5", + "YAP1", + "4" + ], + [ + "6", + "SKN7", + "4" + ], + [ + "7", + "DIG1", + "4" + ], + [ + "8", + "GAT1", + "3" + ], + [ + "9", + "RPN4", + "3" + ], + [ + "10", + "YAP7", + "3" + ], + [ + "11", + "TEC1", + "3" + ], + [ + "12", + "AFT1", + "3" + ], + [ + "13", + "MAL33", + "3" + ], + [ + "14", + "PHO2", + "3" + ], + [ + "15", + "MBP1", + "3" + ], + [ + "16", + "KSS1", + "3" + ], + [ + "17", + "SFP1", + "3" + ], + [ + "18", + "CIN5", + "3" + ], + [ + "19", + "YJL206C", + "3" + ], + [ + "20", + "GZF3", + "3" + ], + [ + "21", + "MOT3", + "3" + ], + [ + "22", + "FHL1", + "3" + ], + [ + "23", + "ROX1", + "3" + ], + [ + "24", + "FKH2", + "3" + ], + [ + "25", + "AFT2", + "3" + ], + [ + "26", + "REB1", + "3" + ], + [ + "27", + "RIM101", + "3" + ], + [ + "28", + "YAP6", + "3" + ], + [ + "29", + "RPH1", + "3" + ], + [ + "30", + "PHD1", + "3" + ], + [ + "31", + "NRG1", + "3" + ], + [ + "32", + "MGA1", + "2" + ], + [ + "33", + "UME1", + "2" + ], + [ + "34", + "YAP3", + "2" + ], + [ + "35", + "XBP1", + "2" + ], + [ + "36", + "RDS1", + "2" + ], + [ + "37", + "MSS11", + "2" + ], + [ + "38", + "HAP2", + "2" + ], + [ + "39", + "MCM1", + "2" + ], + [ + "40", + "ADR1", + "2" + ], + [ + "41", + "GCN4", + "2" + ], + [ + "42", + "MIG2", + "2" + ], + [ + "43", + "SOK2", + "2" + ], + [ + "44", + "RTG1", + "2" + ], + [ + "45", + "MOT2", + "2" + ], + [ + "46", + "UGA3", + "2" + ], + [ + "47", + "PUT3", + "2" + ], + [ + "48", + "YAP5", + "2" + ], + [ + "49", + "UME6", + "2" + ] + ], + "shape": { + "columns": 2, + "rows": 63 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
regulator_symboln
0MSN26
1MSN45
2HSF14
3STE124
4RTG34
.........
58DAL822
59DAL802
60HAP42
61PDR12
62RLM12
\n", + "

63 rows × 2 columns

\n", + "
" ], - "source": [ - "# Query for \"galactose\" matches \"D-galactose\", \"gal\", and \"galactose\"\n", - "galactose_samples = vdb.query(filters={\"carbon_source\": \"galactose\"})\n", - "\n", - "print(galactose_samples)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Complete Data Retrieval\n", - "\n", - "By default, `query()` returns sample-level metadata (one row per sample). \n", - "Set `complete=True` to get all measurements (many rows per sample)." + "text/plain": [ + " regulator_symbol n\n", + "0 MSN2 6\n", + "1 MSN4 5\n", + "2 HSF1 4\n", + "3 STE12 4\n", + "4 RTG3 4\n", + ".. ... ..\n", + "58 DAL82 2\n", + "59 DAL80 2\n", + "60 HAP4 2\n", + "61 PDR1 2\n", + "62 RLM1 2\n", + "\n", + "[63 rows x 2 columns]" ] - }, + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Register a prepared query\n", + "vdb.prepare(\"glucose_regs\", \"\"\"\n", + " SELECT regulator_symbol, COUNT(*) AS n\n", + " FROM harbison_meta\n", + " WHERE carbon_source = $cs\n", + " GROUP BY regulator_symbol\n", + " HAVING n >= $min_n\n", + " ORDER BY n DESC\n", + "\"\"\")\n", + "\n", + "# note that rather than a SQL statement, we pass in the name of the prepared query\n", + "# and provide the appropriate parameters\n", + "vdb.query(\"glucose_regs\", cs=\"glucose\", min_n=2)" + ] + }, + { + "cell_type": "markdown", + "id": "cell-20", + "metadata": {}, + "source": [ + "## 7. Comparative Dataset Views\n", + "\n", + "Comparative datasets (those with `links`) get an extra view type:\n", + "\n", + "**`_expanded`**: For each composite ID field, adds two parsed columns:\n", + "- `_source` -- the source dataset, aliased to `db_name` when\n", + " the `repo_id;config_name` pair is in the VirtualDB config.\n", + "- `_id` -- the sample_id component.\n", + "\n", + "This makes it easy to join or filter by source dataset without manually\n", + "parsing composite IDs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cell-21", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Complete data: 1930060 rows\n", - "Columns: ['sample_id', 'db_id', 'target_locus_tag', 'target_symbol', 'effect', 'pvalue', 'regulator_locus_tag', 'regulator_symbol', 'condition', 'carbon_source', 'temperature_celsius', 'dataset_id']\n", - "\n", - "First few measurements:\n" - ] - }, - { - "data": { - "application/vnd.microsoft.datawrangler.viewer.v0+json": { - "columns": [ - { - "name": "index", - "rawType": "int64", - "type": "integer" - }, - { - "name": "sample_id", - "rawType": "int32", - "type": "integer" - }, - { - "name": "db_id", - "rawType": "float64", - "type": "float" - }, - { - "name": "target_locus_tag", - "rawType": "object", - "type": "string" - }, - { - "name": "target_symbol", - "rawType": "object", - "type": "string" - }, - { - "name": "effect", - "rawType": "float64", - "type": "float" - }, - { - "name": "pvalue", - "rawType": "float64", - "type": "float" - }, - { - "name": "regulator_locus_tag", - "rawType": "object", - "type": "string" - }, - { - "name": "regulator_symbol", - "rawType": "object", - "type": "string" - }, - { - "name": "condition", - "rawType": "object", - "type": "string" - }, - { - "name": "carbon_source", - "rawType": "object", - "type": "string" - }, - { - "name": "temperature_celsius", - "rawType": "float64", - "type": "float" - }, - { - "name": "dataset_id", - "rawType": "object", - "type": "string" - } - ], - "ref": "0b10b74e-6f1a-42af-8654-7811d039bfac", - "rows": [ - [ - "0", - "1", - "0.0", - "YAL001C", - "TFC3", - "1.697754", - "0.068704735", - "YSC0017", - "MATA1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "1", - "1", - "0.0", - "YAL002W", - "VPS8", - null, - null, - "YSC0017", - "MATA1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "2", - "1", - "0.0", - "YAL003W", - "EFB1", - null, - null, - "YSC0017", - "MATA1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "3", - "1", - "0.0", - "YAL004W", - "YAL004W", - "0.74534215", - "0.83592938", - "YSC0017", - "MATA1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "4", - "1", - "0.0", - "YAL005C", - "SSA1", - null, - null, - "YSC0017", - "MATA1", - "YPD", - "glucose", - "30.0", - "BrentLab/harbison_2004/harbison_2004" - ] - ], - "shape": { - "columns": 12, - "rows": 5 - } - }, - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_iddb_idtarget_locus_tagtarget_symboleffectpvalueregulator_locus_tagregulator_symbolconditioncarbon_sourcetemperature_celsiusdataset_id
010.0YAL001CTFC31.6977540.068705YSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
110.0YAL002WVPS8NaNNaNYSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
210.0YAL003WEFB1NaNNaNYSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
310.0YAL004WYAL004W0.7453420.835929YSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
410.0YAL005CSSA1NaNNaNYSC0017MATA1YPDglucose30.0BrentLab/harbison_2004/harbison_2004
\n", - "
" - ], - "text/plain": [ - " sample_id db_id target_locus_tag target_symbol effect pvalue \\\n", - "0 1 0.0 YAL001C TFC3 1.697754 0.068705 \n", - "1 1 0.0 YAL002W VPS8 NaN NaN \n", - "2 1 0.0 YAL003W EFB1 NaN NaN \n", - "3 1 0.0 YAL004W YAL004W 0.745342 0.835929 \n", - "4 1 0.0 YAL005C SSA1 NaN NaN \n", - "\n", - " regulator_locus_tag regulator_symbol condition carbon_source \\\n", - "0 YSC0017 MATA1 YPD glucose \n", - "1 YSC0017 MATA1 YPD glucose \n", - "2 YSC0017 MATA1 YPD glucose \n", - "3 YSC0017 MATA1 YPD glucose \n", - "4 YSC0017 MATA1 YPD glucose \n", - "\n", - " temperature_celsius dataset_id \n", - "0 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "1 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "2 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "3 30.0 BrentLab/harbison_2004/harbison_2004 \n", - "4 30.0 BrentLab/harbison_2004/harbison_2004 " - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "binding_id", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_rank_threshold", + "rawType": "float64", + "type": "float" + }, + { + "name": "perturbation_rank_threshold", + "rawType": "float64", + "type": "float" + }, + { + "name": "binding_set_size", + "rawType": "float64", + "type": "float" + }, + { + "name": "perturbation_set_size", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_fdr", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_empirical_pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "binding_repo_dataset", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_repo_dataset", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_id_id", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_id_source", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id_id", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id_source", + "rawType": "object", + "type": "string" } + ], + "ref": "3464c093-78d3-4dde-9a28-850a7be5d032", + "rows": [ + [ + "0", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;85", + "2.0", + "2.0", + "3.0", + "2.0", + "0.0002250900360144", + "0.004", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "85", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "1", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;83", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "83", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "2", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;84", + "2.0", + "1.0", + "3.0", + "1.0", + "0.0", + "0.011", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "84", + "BrentLab/Hackett_2020;hackett_2020" + ] + ], + "shape": { + "columns": 14, + "rows": 3 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
binding_idperturbation_idbinding_rank_thresholdperturbation_rank_thresholdbinding_set_sizeperturbation_set_sizedto_fdrdto_empirical_pvaluebinding_repo_datasetperturbation_repo_datasetbinding_id_idbinding_id_sourceperturbation_id_idperturbation_id_source
0BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;852.02.03.02.00.0002250.004harbison_2004-harbison_2004Hackett_2020-hackett_20203harbison85BrentLab/Hackett_2020;hackett_2020
1BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;83NaNNaNNaNNaNNaNNaNharbison_2004-harbison_2004Hackett_2020-hackett_20203harbison83BrentLab/Hackett_2020;hackett_2020
2BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;842.01.03.01.00.0000000.011harbison_2004-harbison_2004Hackett_2020-hackett_20203harbison84BrentLab/Hackett_2020;hackett_2020
\n", + "
" ], - "source": [ - "# Get complete data with measurements\n", - "complete_data = vdb.query(\n", - " filters={\"carbon_source\": \"glucose\"},\n", - " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", - " complete=True\n", - ")\n", - "\n", - "print(f\"Complete data: {len(complete_data)} rows\")\n", - "print(f\"Columns: {list(complete_data.columns)}\")\n", - "print(\"\\nFirst few measurements:\")\n", - "complete_data.head()" + "text/plain": [ + " binding_id \\\n", + "0 BrentLab/harbison_2004;harbison_2004;3 \n", + "1 BrentLab/harbison_2004;harbison_2004;3 \n", + "2 BrentLab/harbison_2004;harbison_2004;3 \n", + "\n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/Hackett_2020;hackett_2020;85 2.0 \n", + "1 BrentLab/Hackett_2020;hackett_2020;83 NaN \n", + "2 BrentLab/Hackett_2020;hackett_2020;84 2.0 \n", + "\n", + " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", + "0 2.0 3.0 2.0 \n", + "1 NaN NaN NaN \n", + "2 1.0 3.0 1.0 \n", + "\n", + " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n", + "0 0.000225 0.004 harbison_2004-harbison_2004 \n", + "1 NaN NaN harbison_2004-harbison_2004 \n", + "2 0.000000 0.011 harbison_2004-harbison_2004 \n", + "\n", + " perturbation_repo_dataset binding_id_id binding_id_source \\\n", + "0 Hackett_2020-hackett_2020 3 harbison \n", + "1 Hackett_2020-hackett_2020 3 harbison \n", + "2 Hackett_2020-hackett_2020 3 harbison \n", + "\n", + " perturbation_id_id perturbation_id_source \n", + "0 85 BrentLab/Hackett_2020;hackett_2020 \n", + "1 83 BrentLab/Hackett_2020;hackett_2020 \n", + "2 84 BrentLab/Hackett_2020;hackett_2020 " ] - }, + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The expanded view has parsed _source and _id columns for each link field\n", + "vdb.query(\"SELECT * FROM dto_expanded LIMIT 3\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cell-22", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Binding data: 1930060 measurements\n" - ] - }, - { - "data": { - "application/vnd.microsoft.datawrangler.viewer.v0+json": { - "columns": [ - { - "name": "index", - "rawType": "int64", - "type": "integer" - }, - { - "name": "sample_id", - "rawType": "int32", - "type": "integer" - }, - { - "name": "regulator_symbol", - "rawType": "object", - "type": "string" - }, - { - "name": "target_symbol", - "rawType": "object", - "type": "string" - }, - { - "name": "effect", - "rawType": "float64", - "type": "float" - }, - { - "name": "pvalue", - "rawType": "float64", - "type": "float" - }, - { - "name": "dataset_id", - "rawType": "object", - "type": "string" - } - ], - "ref": "0b7cb890-7e9c-44d2-9ef5-59374bcf3a8a", - "rows": [ - [ - "0", - "2", - "OAF1", - "TFC3", - "1.5895642", - "0.088986168", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "1", - "2", - "OAF1", - "VPS8", - "1.1413208", - "0.32480496", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "2", - "2", - "OAF1", - "EFB1", - "0.72911994", - "0.87882413", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "3", - "2", - "OAF1", - "YAL004W", - "1.1679044", - "0.28225283", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "4", - "2", - "OAF1", - "SSA1", - "0.72911994", - "0.87882413", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "5", - "2", - "OAF1", - "ERP2", - "1.0508274", - "0.43070675", - 
"BrentLab/harbison_2004/harbison_2004" - ], - [ - "6", - "2", - "OAF1", - "FUN14", - "1.3478761", - "0.15551056", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "7", - "2", - "OAF1", - "SPO7", - "0.93967306", - "0.57823415", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "8", - "2", - "OAF1", - "MDM10", - "0.93967306", - "0.57823415", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "9", - "2", - "OAF1", - "SWC3", - "0.86566703", - "0.6711192", - "BrentLab/harbison_2004/harbison_2004" - ] - ], - "shape": { - "columns": 6, - "rows": 10 - } - }, - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_idregulator_symboltarget_symboleffectpvaluedataset_id
02OAF1TFC31.5895640.088986BrentLab/harbison_2004/harbison_2004
12OAF1VPS81.1413210.324805BrentLab/harbison_2004/harbison_2004
22OAF1EFB10.7291200.878824BrentLab/harbison_2004/harbison_2004
32OAF1YAL004W1.1679040.282253BrentLab/harbison_2004/harbison_2004
42OAF1SSA10.7291200.878824BrentLab/harbison_2004/harbison_2004
52OAF1ERP21.0508270.430707BrentLab/harbison_2004/harbison_2004
62OAF1FUN141.3478760.155511BrentLab/harbison_2004/harbison_2004
72OAF1SPO70.9396730.578234BrentLab/harbison_2004/harbison_2004
82OAF1MDM100.9396730.578234BrentLab/harbison_2004/harbison_2004
92OAF1SWC30.8656670.671119BrentLab/harbison_2004/harbison_2004
\n", - "
" - ], - "text/plain": [ - " sample_id regulator_symbol target_symbol effect pvalue \\\n", - "0 2 OAF1 TFC3 1.589564 0.088986 \n", - "1 2 OAF1 VPS8 1.141321 0.324805 \n", - "2 2 OAF1 EFB1 0.729120 0.878824 \n", - "3 2 OAF1 YAL004W 1.167904 0.282253 \n", - "4 2 OAF1 SSA1 0.729120 0.878824 \n", - "5 2 OAF1 ERP2 1.050827 0.430707 \n", - "6 2 OAF1 FUN14 1.347876 0.155511 \n", - "7 2 OAF1 SPO7 0.939673 0.578234 \n", - "8 2 OAF1 MDM10 0.939673 0.578234 \n", - "9 2 OAF1 SWC3 0.865667 0.671119 \n", - "\n", - " dataset_id \n", - "0 BrentLab/harbison_2004/harbison_2004 \n", - "1 BrentLab/harbison_2004/harbison_2004 \n", - "2 BrentLab/harbison_2004/harbison_2004 \n", - "3 BrentLab/harbison_2004/harbison_2004 \n", - "4 BrentLab/harbison_2004/harbison_2004 \n", - "5 BrentLab/harbison_2004/harbison_2004 \n", - "6 BrentLab/harbison_2004/harbison_2004 \n", - "7 BrentLab/harbison_2004/harbison_2004 \n", - "8 BrentLab/harbison_2004/harbison_2004 \n", - "9 BrentLab/harbison_2004/harbison_2004 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "condition", + "rawType": "object", + "type": "string" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_empirical_pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_fdr", + "rawType": "float64", + "type": "float" } + ], + "ref": "58c1f0ca-b0a7-4ce7-b29f-f4e789b74707", + "rows": [ + [ + "0", + "50", + "YDR043C", + "NRG1", + "H2O2Lo", + "glucose", + "30.0", + "0.0", + "0.081863152643831" + ], + [ + "1", + "213", + "YKL222C", + "YKL222C", + "YPD", + "glucose", + "30.0", + "0.0", + "0.0" + ], + [ + "2", + "18", + "YBR083W", + "TEC1", + "YPD", + "glucose", + "30.0", + "0.0", + "0.0620669105826265" + ], + [ + "3", + "7", + "YBL103C", + "RTG3", + "H2O2Hi", + "glucose", + "30.0", + "0.0", + "0.1577232390460343" + ], + [ + "4", + "277", + "YNL103W", + "MET4", + "YPD", + "glucose", + "30.0", + "0.0", + "0.016281512605042" + ], + [ + "5", + "281", + "YNL199C", + "GCR2", + "SM", + "unspecified", + "30.0", + "0.0", + "0.0296346442259623" + ], + [ + "6", + "86", + "YER040W", + "GLN3", + "SM", + "unspecified", + "30.0", + "0.0", + "0.2298889521004841" + ], + [ + "7", + "225", + "YLR176C", + "RFX1", + "YPD", + "glucose", + "30.0", + "0.0", + "0.0144559001906082" + ], + [ + "8", + "86", + "YER040W", + "GLN3", + "SM", + "unspecified", + "30.0", + "0.0", + "0.0961169019780866" + ], + [ + "9", + "225", + "YLR176C", + "RFX1", + "YPD", + "glucose", + "30.0", + "0.0", + "0.0335260614428719" + ] + ], + "shape": { + "columns": 8, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagregulator_symbolconditioncarbon_sourcetemperature_celsiusdto_empirical_pvaluedto_fdr
050YDR043CNRG1H2O2Loglucose30.00.00.081863
1213YKL222CYKL222CYPDglucose30.00.00.000000
218YBR083WTEC1YPDglucose30.00.00.062067
37YBL103CRTG3H2O2Higlucose30.00.00.157723
4277YNL103WMET4YPDglucose30.00.00.016282
5281YNL199CGCR2SMunspecified30.00.00.029635
686YER040WGLN3SMunspecified30.00.00.229889
7225YLR176CRFX1YPDglucose30.00.00.014456
886YER040WGLN3SMunspecified30.00.00.096117
9225YLR176CRFX1YPDglucose30.00.00.033526
\n", + "
" ], - "source": [ - "# You can combine complete=True with field selection\n", - "# Get just the binding data columns\n", - "binding_data = vdb.query(\n", - " filters={\"carbon_source\": \"glucose\"},\n", - " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", - " fields=[\"sample_id\", \"regulator_symbol\", \"target_symbol\", \"effect\", \"pvalue\"],\n", - " complete=True\n", - ")\n", - "\n", - "print(f\"Binding data: {len(binding_data)} measurements\")\n", - "binding_data.head(10)" + "text/plain": [ + " sample_id regulator_locus_tag regulator_symbol condition carbon_source \\\n", + "0 50 YDR043C NRG1 H2O2Lo glucose \n", + "1 213 YKL222C YKL222C YPD glucose \n", + "2 18 YBR083W TEC1 YPD glucose \n", + "3 7 YBL103C RTG3 H2O2Hi glucose \n", + "4 277 YNL103W MET4 YPD glucose \n", + "5 281 YNL199C GCR2 SM unspecified \n", + "6 86 YER040W GLN3 SM unspecified \n", + "7 225 YLR176C RFX1 YPD glucose \n", + "8 86 YER040W GLN3 SM unspecified \n", + "9 225 YLR176C RFX1 YPD glucose \n", + "\n", + " temperature_celsius dto_empirical_pvalue dto_fdr \n", + "0 30.0 0.0 0.081863 \n", + "1 30.0 0.0 0.000000 \n", + "2 30.0 0.0 0.062067 \n", + "3 30.0 0.0 0.157723 \n", + "4 30.0 0.0 0.016282 \n", + "5 30.0 0.0 0.029635 \n", + "6 30.0 0.0 0.229889 \n", + "7 30.0 0.0 0.014456 \n", + "8 30.0 0.0 0.096117 \n", + "9 30.0 0.0 0.033526 " ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example analysis\n", - "\n", - "The following is an example of using VirtualDB to extract and summarize data across\n", - "datasets." - ] - }, + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Join harbison metadata to dto via the expanded view's parsed columns\n", + "vdb.query(\"\"\"\n", + " SELECT h.*, d.dto_empirical_pvalue, d.dto_fdr\n", + " FROM harbison_meta h\n", + " JOIN dto_expanded d\n", + " ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source = 'harbison'\n", + " WHERE d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + " LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "cell-23", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sample counts by dataset and carbon source:\n", - " dataset_id carbon_source num_samples\n", - "BrentLab/harbison_2004/harbison_2004 galactose 4\n", - "BrentLab/harbison_2004/harbison_2004 glucose 310\n", - "BrentLab/harbison_2004/harbison_2004 raffinose 1\n", - "BrentLab/harbison_2004/harbison_2004 unspecified 37\n", - "BrentLab/kemmeren_2014/kemmeren_2014 glucose 1487\n" - ] + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "harbison_sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "dto_empirical_pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "hackett_sample_id", + "rawType": "object", + "type": "string" } + ], + "ref": "b916ca80-75d1-448d-82a5-c82086ca1ed9", + "rows": [ + [ + "0", + "289", + "DAL82", + "0.0", + "1208" + ], + [ + "1", + "251", + "MAC1", + "0.0", + "1103" + ], + [ + "2", + "321", + "DIG1", + "0.0", + "1372" + ], + [ + "3", + "238", + "YAP1", + "0.0", + "996" + ], + [ + "4", + "303", + "CIN5", + "0.0", + "1365" 
+ ], + [ + "5", + "245", + "ARG81", + "0.0", + "1023" + ], + [ + "6", + "184", + "CBF1", + "0.0", + "754" + ], + [ + "7", + "252", + "MAC1", + "0.0", + "1103" + ], + [ + "8", + "200", + "PHD1", + "0.0", + "890" + ], + [ + "9", + "251", + "MAC1", + "0.0", + "1110" + ] + ], + "shape": { + "columns": 4, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
harbison_sample_idregulator_symboldto_empirical_pvaluehackett_sample_id
0289DAL820.01208
1251MAC10.01103
2321DIG10.01372
3238YAP10.0996
4303CIN50.01365
5245ARG810.01023
6184CBF10.0754
7252MAC10.01103
8200PHD10.0890
9251MAC10.01110
\n", + "
" ], - "source": [ - "# Compare number of samples by carbon source across datasets\n", - "\n", - "# Get all samples\n", - "all_samples = vdb.query()\n", - "\n", - "# Count by dataset and carbon source\n", - "summary = all_samples.groupby(['dataset_id', 'carbon_source']).size()\n", - "summary = summary.reset_index(name='num_samples')\n", - "\n", - "print(\"Sample counts by dataset and carbon source:\")\n", - "print(summary.to_string(index=False))" + "text/plain": [ + " harbison_sample_id regulator_symbol dto_empirical_pvalue hackett_sample_id\n", + "0 289 DAL82 0.0 1208\n", + "1 251 MAC1 0.0 1103\n", + "2 321 DIG1 0.0 1372\n", + "3 238 YAP1 0.0 996\n", + "4 303 CIN5 0.0 1365\n", + "5 245 ARG81 0.0 1023\n", + "6 184 CBF1 0.0 754\n", + "7 252 MAC1 0.0 1103\n", + "8 200 PHD1 0.0 890\n", + "9 251 MAC1 0.0 1110" ] - }, + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Cross-dataset join: harbison binding with hackett perturbation data\n", + "# via the DTO comparative dataset\n", + "vdb.query(\"\"\"\n", + " SELECT\n", + " h.sample_id AS harbison_sample_id,\n", + " h.regulator_symbol,\n", + " d.dto_empirical_pvalue,\n", + " d.perturbation_id_id AS hackett_sample_id\n", + " FROM harbison_meta h\n", + " JOIN dto_expanded d\n", + " ON CAST(h.sample_id AS VARCHAR) = d.binding_id_id\n", + " AND d.binding_id_source = 'harbison'\n", + " WHERE d.dto_empirical_pvalue <= 0.01\n", + " ORDER BY d.dto_empirical_pvalue\n", + " LIMIT 10\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-24", + "metadata": {}, + "source": [ + "## A realistic example\n", + "\n", + "Hackett has multiple experimental conditions that are unique to that dataset. There are\n", + "some regulators which have replicates within those conditions. We need to find those \n", + "regulators and design a query which returns only 1 sample per condition set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f03e942a", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Glucose samples by temperature:\n", - " 30.0C: 1791 samples\n" - ] + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "time", + "rawType": "float64", + "type": "float" + }, + { + "name": "mechanism", + "rawType": "object", + "type": "string" + }, + { + "name": "restriction", + "rawType": "object", + "type": "string" + }, + { + "name": "n", + "rawType": "int64", + "type": "integer" } + ], + "ref": "1185b490-3375-41d0-b61c-0f35dae2b815", + "rows": [ + [ + "0", + "SWI1", + "15.0", + "ZEV", + "P", + "3" + ], + [ + "1", + "SWI1", + "30.0", + "ZEV", + "P", + "3" + ], + [ + "2", + "SWI1", + "45.0", + "ZEV", + "P", + "3" + ], + [ + "3", + "SWI1", + "5.0", + "ZEV", + "P", + "3" + ], + [ + "4", + "SWI1", + "0.0", + "ZEV", + "P", + "3" + ], + [ + "5", + "SWI1", + "90.0", + "ZEV", + "P", + "3" + ], + [ + "6", + "SWI1", + "10.0", + "ZEV", + "P", + "3" + ], + [ + "7", + "SWI1", + "20.0", + "ZEV", + "P", + "3" + ], + [ + "8", + "GCN4", + "0.0", + "ZEV", + "P", + "2" + ], + [ + "9", + "GCN4", + "30.0", + "ZEV", + "P", + "2" + ], + [ + "10", + "MAC1", + "0.0", + "GEV", + "P", + "2" + ], + [ + "11", + "RDS2", + "5.0", + "ZEV", + "P", + "2" + ], + [ + "12", + "RDS2", + "45.0", + "ZEV", + "P", + "2" + ], + [ + "13", + "Z3EV", + "30.0", + "ZEV", + "P", + "2" + ], + [ + "14", + "GCN4", + "90.0", + "ZEV", + "P", + "2" + ], + [ + "15", + "Z3EV", + "15.0", + "ZEV", + "P", + "2" + ], + [ + "16", + "GCN4", + "45.0", + "ZEV", + "P", + "2" + ], + [ + "17", + "MAC1", + "5.0", + "GEV", + "P", + "2" + ], + [ + "18", + "MAC1", + "90.0", + "GEV", + "P", + "2" + ], + [ + "19", + "Z3EV", + "45.0", + "ZEV", + "P", + "2" + ], + [ + "20", + "RDS2", + "10.0", + "ZEV", + "P", + "2" + ], + [ + "21", + "GCN4", + "15.0", + "ZEV", + "P", + "2" + ], + [ + "22", + "RDS2", + "90.0", + "ZEV", + "P", + "2" + ], + [ + "23", + "RDS2", + "0.0", + "ZEV", + "P", + "2" + ], + [ + "24", + "Z3EV", + "5.0", + "ZEV", + "P", + "2" + ], + [ + "25", + "Z3EV", + "90.0", + "ZEV", + "P", + "2" + ], + [ + "26", + "Z3EV", + "20.0", + "ZEV", + "P", + "2" + ], + [ + "27", + "RDS2", + "30.0", + "ZEV", + "P", + "2" + ], + [ + "28", + "Z3EV", + "0.0", + "ZEV", + "P", + "2" + ], + [ + "29", + "RDS2", + "15.0", + "ZEV", + "P", + "2" + ], + [ + "30", + "Z3EV", + "10.0", + "ZEV", + "P", + "2" + ], + [ + "31", + "RDS2", + "20.0", + "ZEV", + "P", + "2" + ], + [ + "32", + "MAC1", + "45.0", + "GEV", + "P", + "2" + ], + [ + "33", + "MAC1", + "15.0", + "GEV", + "P", + "2" + ], + [ + "34", + "MAC1", + "30.0", + "GEV", + "P", + "2" + ] + ], + "shape": { + "columns": 5, + "rows": 35 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
regulator_symboltimemechanismrestrictionn
0SWI115.0ZEVP3
1SWI130.0ZEVP3
2SWI145.0ZEVP3
3SWI15.0ZEVP3
4SWI10.0ZEVP3
5SWI190.0ZEVP3
6SWI110.0ZEVP3
7SWI120.0ZEVP3
8GCN40.0ZEVP2
9GCN430.0ZEVP2
10MAC10.0GEVP2
11RDS25.0ZEVP2
12RDS245.0ZEVP2
13Z3EV30.0ZEVP2
14GCN490.0ZEVP2
15Z3EV15.0ZEVP2
16GCN445.0ZEVP2
17MAC15.0GEVP2
18MAC190.0GEVP2
19Z3EV45.0ZEVP2
20RDS210.0ZEVP2
21GCN415.0ZEVP2
22RDS290.0ZEVP2
23RDS20.0ZEVP2
24Z3EV5.0ZEVP2
25Z3EV90.0ZEVP2
26Z3EV20.0ZEVP2
27RDS230.0ZEVP2
28Z3EV0.0ZEVP2
29RDS215.0ZEVP2
30Z3EV10.0ZEVP2
31RDS220.0ZEVP2
32MAC145.0GEVP2
33MAC115.0GEVP2
34MAC130.0GEVP2
\n", + "
" ], - "source": [ - "# Compare glucose experiments at different temperatures\n", - "\n", - "glucose_by_temp = vdb.query(\n", - " filters={\"carbon_source\": \"glucose\"},\n", - " fields=[\"sample_id\", \"temperature_celsius\", \"environmental_condition\"]\n", - ")\n", - "\n", - "# Count samples by temperature\n", - "temp_counts = glucose_by_temp['temperature_celsius'].value_counts().sort_index()\n", - "\n", - "print(\"Glucose samples by temperature:\")\n", - "for temp, count in temp_counts.items():\n", - " print(f\" {temp}C: {count} samples\")" + "text/plain": [ + " regulator_symbol time mechanism restriction n\n", + "0 SWI1 15.0 ZEV P 3\n", + "1 SWI1 30.0 ZEV P 3\n", + "2 SWI1 45.0 ZEV P 3\n", + "3 SWI1 5.0 ZEV P 3\n", + "4 SWI1 0.0 ZEV P 3\n", + "5 SWI1 90.0 ZEV P 3\n", + "6 SWI1 10.0 ZEV P 3\n", + "7 SWI1 20.0 ZEV P 3\n", + "8 GCN4 0.0 ZEV P 2\n", + "9 GCN4 30.0 ZEV P 2\n", + "10 MAC1 0.0 GEV P 2\n", + "11 RDS2 5.0 ZEV P 2\n", + "12 RDS2 45.0 ZEV P 2\n", + "13 Z3EV 30.0 ZEV P 2\n", + "14 GCN4 90.0 ZEV P 2\n", + "15 Z3EV 15.0 ZEV P 2\n", + "16 GCN4 45.0 ZEV P 2\n", + "17 MAC1 5.0 GEV P 2\n", + "18 MAC1 90.0 GEV P 2\n", + "19 Z3EV 45.0 ZEV P 2\n", + "20 RDS2 10.0 ZEV P 2\n", + "21 GCN4 15.0 ZEV P 2\n", + "22 RDS2 90.0 ZEV P 2\n", + "23 RDS2 0.0 ZEV P 2\n", + "24 Z3EV 5.0 ZEV P 2\n", + "25 Z3EV 90.0 ZEV P 2\n", + "26 Z3EV 20.0 ZEV P 2\n", + "27 RDS2 30.0 ZEV P 2\n", + "28 Z3EV 0.0 ZEV P 2\n", + "29 RDS2 15.0 ZEV P 2\n", + "30 Z3EV 10.0 ZEV P 2\n", + "31 RDS2 20.0 ZEV P 2\n", + "32 MAC1 45.0 GEV P 2\n", + "33 MAC1 15.0 GEV P 2\n", + "34 MAC1 30.0 GEV P 2" ] - }, + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Query hackett to find regulators with multiple samples in the same (time, mechanism)\n", + "# condition\n", + "vdb.query(\"\"\"\n", + " SELECT regulator_symbol, time, mechanism, restriction, COUNT(*) AS n\n", + " FROM hackett_meta\n", + " GROUP BY regulator_symbol, time, mechanism, restriction\n", + " HAVING n > 1\n", + " ORDER BY n DESC\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4d869036", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 18678 FHL1 binding measurements in glucose\n", - "Significant targets: 379\n", - "\n", - "Top 10 targets by effect size:\n", - "target_symbol effect pvalue\n", - " RPS5 24.145013 9.739702e-09\n", - " RPL11A 20.585725 1.232356e-08\n", - " PRE2 20.585725 1.232356e-08\n", - " SRF1 20.342898 1.226799e-08\n", - " SLX8 20.057080 1.513076e-08\n", - " RPL23B 20.057080 1.513076e-08\n", - " RPL40A 19.262139 1.761808e-08\n", - " MLP2 19.262139 1.761808e-08\n", - " RPS6A 18.704379 1.544172e-08\n", - " RPL22A 17.926705 1.560357e-08\n" - ] + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "sample_id", + "rawType": "int32", + "type": "integer" + }, + { + "name": "regulator_locus_tag", + "rawType": "object", + "type": "string" + }, + { + "name": "regulator_symbol", + "rawType": "object", + "type": "string" + }, + { + "name": "time", + "rawType": "float64", + "type": "float" + }, + { + "name": "mechanism", + "rawType": "object", + "type": "string" + }, + { + "name": "restriction", + "rawType": "object", + "type": "string" + }, + { + "name": "date", + "rawType": "object", + "type": "string" + }, + { + "name": 
"strain", + "rawType": "object", + "type": "string" + }, + { + "name": "carbon_source", + "rawType": "object", + "type": "string" + }, + { + "name": "temperature_celsius", + "rawType": "float64", + "type": "float" } + ], + "ref": "440ab0a2-f84a-4505-8380-e218512394f7", + "rows": [ + [ + "0", + "1620", + "YPL016W", + "SWI1", + "20.0", + "ZEV", + "P", + "20161117", + "SMY2266a", + "glucose", + "30.0" + ], + [ + "1", + "1628", + "YPL016W", + "SWI1", + "20.0", + "ZEV", + "P", + "20161117", + "SMY2266b", + "glucose", + "30.0" + ], + [ + "2", + "1636", + "YPL016W", + "SWI1", + "20.0", + "ZEV", + "P", + "20161117", + "SMY2266c", + "glucose", + "30.0" + ] + ], + "shape": { + "columns": 10, + "rows": 3 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idregulator_locus_tagregulator_symboltimemechanismrestrictiondatestraincarbon_sourcetemperature_celsius
01620YPL016WSWI120.0ZEVP20161117SMY2266aglucose30.0
11628YPL016WSWI120.0ZEVP20161117SMY2266bglucose30.0
21636YPL016WSWI120.0ZEVP20161117SMY2266cglucose30.0
\n", + "
" ], - "source": [ - "# Get binding data for a specific regulator across datasets\n", - "\n", - "# Query for FHL1 binding in glucose conditions\n", - "fhl1_binding = vdb.query(\n", - " filters={\n", - " \"carbon_source\": \"glucose\",\n", - " \"regulator_symbol\": \"FHL1\"\n", - " },\n", - " fields=[\"sample_id\", \"regulator_symbol\", \"target_symbol\", \"effect\", \"pvalue\"],\n", - " complete=True\n", - ")\n", - "\n", - "print(f\"Found {len(fhl1_binding)} FHL1 binding measurements in glucose\")\n", - "\n", - "# Find significant targets (p < 0.001)\n", - "significant = fhl1_binding[fhl1_binding['pvalue'] < 0.001]\n", - "print(f\"Significant targets: {len(significant)}\")\n", - "\n", - "# Top 10 by effect size\n", - "top_targets = significant.nlargest(10, 'effect')[['target_symbol', 'effect', 'pvalue']]\n", - "print(\"\\nTop 10 targets by effect size:\")\n", - "print(top_targets.to_string(index=False))" + "text/plain": [ + " sample_id regulator_locus_tag regulator_symbol time mechanism restriction \\\n", + "0 1620 YPL016W SWI1 20.0 ZEV P \n", + "1 1628 YPL016W SWI1 20.0 ZEV P \n", + "2 1636 YPL016W SWI1 20.0 ZEV P \n", + "\n", + " date strain carbon_source temperature_celsius \n", + "0 20161117 SMY2266a glucose 30.0 \n", + "1 20161117 SMY2266b glucose 30.0 \n", + "2 20161117 SMY2266c glucose 30.0 " ] - }, + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# SWI1 has 3 samples at time=20, mechanism=ZEV. Let's look at just those samples\n", + "vdb.query(\"\"\"\n", + " SELECT *\n", + " FROM hackett_meta\n", + " WHERE regulator_symbol = 'SWI1'\n", + " AND time = 20\n", + " AND mechanism = 'ZEV'\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "89408d2b", + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Querying Comparative Datasets\n", - "\n", - "Comparative datasets like DTO (Direct Target Overlap) contain analysis results that relate samples across multiple datasets. These datasets can be queried directly to find significant cross-dataset relationships." 
- ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "['GCN4', 'MAC1', 'SWI1', 'Z3EV', 'RDS2']\n" + ] + } + ], + "source": [ + "# In this case, there are three strains with otherwise the same experimental conditions.\n", + "# Rather than trying to choose among these right now, we might just want to get a\n", + "# unique list of the regulators with replicates in order to exclude them from an\n", + "# analysis that doesn't expect replicates.\n", + "replicated_hackett_regulators = vdb.query(\"\"\"\n", + " SELECT DISTINCT regulator_symbol\n", + " FROM hackett_meta\n", + " GROUP BY regulator_symbol, time, mechanism, restriction\n", + " HAVING COUNT(*) > 1\n", + "\"\"\").regulator_symbol.tolist()\n", + "print(replicated_hackett_regulators)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5a3b802b", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 65536.00it/s]\n", - "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 57325.34it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 32 FHL1 binding measurements\n", - "\n", - "Columns: ['sample_id', 'regulator_symbol', 'condition', 'dto_fdr', 'perturbation_id', 'dataset_id']\n", - "\n", - "Rows with DTO data: 4\n", - "\n", - "First few results:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.microsoft.datawrangler.viewer.v0+json": { - "columns": [ - { - "name": "index", - "rawType": "int64", - "type": "integer" - }, - { - "name": "sample_id", - "rawType": "int32", - "type": "integer" - }, - { - "name": "regulator_symbol", - "rawType": "object", - "type": "string" - }, - { - "name": "condition", - "rawType": "object", - "type": "string" - }, - { - "name": "dto_fdr", - "rawType": "float64", - "type": "float" - }, - { - "name": "perturbation_id", - "rawType": "object", - "type": "string" - }, - { - "name": "dataset_id", - "rawType": "object", - "type": "string" - } - ], - "ref": "a0eb6112-b457-4642-add7-4bcd5068e495", - "rows": [ - [ - "0", - "345", - "FHL1", - "H2O2Hi", - "0.4549087454017032", - "BrentLab/Hackett_2020;hackett_2020;1666", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "1", - "345", - "FHL1", - "H2O2Hi", - null, - "BrentLab/Hackett_2020;hackett_2020;1665", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "2", - "345", - "FHL1", - "H2O2Hi", - null, - "BrentLab/Hackett_2020;hackett_2020;1667", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "3", - "345", - "FHL1", - "H2O2Hi", - null, - "BrentLab/Hackett_2020;hackett_2020;1669", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "4", - "345", - "FHL1", - "H2O2Hi", - null, - "BrentLab/Hackett_2020;hackett_2020;1663", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "5", - "345", - "FHL1", - "H2O2Hi", - null, - "BrentLab/Hackett_2020;hackett_2020;1664", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "6", - "345", - "FHL1", - "H2O2Hi", - null, - "BrentLab/Hackett_2020;hackett_2020;1670", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "7", - "345", - "FHL1", - "H2O2Hi", - null, - "BrentLab/Hackett_2020;hackett_2020;1668", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "8", - "346", - "FHL1", - "RAPA", - null, - "BrentLab/Hackett_2020;hackett_2020;1667", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "9", - "346", - "FHL1", - 
"RAPA", - null, - "BrentLab/Hackett_2020;hackett_2020;1663", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "10", - "346", - "FHL1", - "RAPA", - null, - "BrentLab/Hackett_2020;hackett_2020;1670", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "11", - "346", - "FHL1", - "RAPA", - null, - "BrentLab/Hackett_2020;hackett_2020;1668", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "12", - "346", - "FHL1", - "RAPA", - "0.0", - "BrentLab/Hackett_2020;hackett_2020;1666", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "13", - "346", - "FHL1", - "RAPA", - null, - "BrentLab/Hackett_2020;hackett_2020;1669", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "14", - "346", - "FHL1", - "RAPA", - null, - "BrentLab/Hackett_2020;hackett_2020;1664", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "15", - "346", - "FHL1", - "RAPA", - null, - "BrentLab/Hackett_2020;hackett_2020;1665", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "16", - "347", - "FHL1", - "SM", - null, - "BrentLab/Hackett_2020;hackett_2020;1667", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "17", - "347", - "FHL1", - "SM", - "0.0221957781456953", - "BrentLab/Hackett_2020;hackett_2020;1666", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "18", - "347", - "FHL1", - "SM", - null, - "BrentLab/Hackett_2020;hackett_2020;1669", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "19", - "347", - "FHL1", - "SM", - null, - "BrentLab/Hackett_2020;hackett_2020;1664", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "20", - "347", - "FHL1", - "SM", - null, - "BrentLab/Hackett_2020;hackett_2020;1663", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "21", - "347", - "FHL1", - "SM", - null, - "BrentLab/Hackett_2020;hackett_2020;1670", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "22", - "347", - "FHL1", - "SM", - null, - "BrentLab/Hackett_2020;hackett_2020;1668", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "23", - "347", - "FHL1", - "SM", - null, - "BrentLab/Hackett_2020;hackett_2020;1665", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "24", - "348", - "FHL1", - "YPD", - null, - "BrentLab/Hackett_2020;hackett_2020;1664", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "25", - "348", - "FHL1", - "YPD", - "0.089578429724277", - "BrentLab/Hackett_2020;hackett_2020;1666", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "26", - "348", - "FHL1", - "YPD", - null, - "BrentLab/Hackett_2020;hackett_2020;1663", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "27", - "348", - "FHL1", - "YPD", - null, - "BrentLab/Hackett_2020;hackett_2020;1667", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "28", - "348", - "FHL1", - "YPD", - null, - "BrentLab/Hackett_2020;hackett_2020;1669", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "29", - "348", - "FHL1", - "YPD", - null, - "BrentLab/Hackett_2020;hackett_2020;1665", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "30", - "348", - "FHL1", - "YPD", - null, - "BrentLab/Hackett_2020;hackett_2020;1670", - "BrentLab/harbison_2004/harbison_2004" - ], - [ - "31", - "348", - "FHL1", - "YPD", - null, - "BrentLab/Hackett_2020;hackett_2020;1668", - "BrentLab/harbison_2004/harbison_2004" - ] - ], - "shape": { - "columns": 6, - "rows": 32 - } - }, - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_idregulator_symbolconditiondto_fdrperturbation_iddataset_id
0345FHL1H2O2Hi0.454909BrentLab/Hackett_2020;hackett_2020;1666BrentLab/harbison_2004/harbison_2004
1345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1665BrentLab/harbison_2004/harbison_2004
2345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1667BrentLab/harbison_2004/harbison_2004
3345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1669BrentLab/harbison_2004/harbison_2004
4345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1663BrentLab/harbison_2004/harbison_2004
5345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1664BrentLab/harbison_2004/harbison_2004
6345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1670BrentLab/harbison_2004/harbison_2004
7345FHL1H2O2HiNaNBrentLab/Hackett_2020;hackett_2020;1668BrentLab/harbison_2004/harbison_2004
8346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1667BrentLab/harbison_2004/harbison_2004
9346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1663BrentLab/harbison_2004/harbison_2004
10346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1670BrentLab/harbison_2004/harbison_2004
11346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1668BrentLab/harbison_2004/harbison_2004
12346FHL1RAPA0.000000BrentLab/Hackett_2020;hackett_2020;1666BrentLab/harbison_2004/harbison_2004
13346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1669BrentLab/harbison_2004/harbison_2004
14346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1664BrentLab/harbison_2004/harbison_2004
15346FHL1RAPANaNBrentLab/Hackett_2020;hackett_2020;1665BrentLab/harbison_2004/harbison_2004
16347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1667BrentLab/harbison_2004/harbison_2004
17347FHL1SM0.022196BrentLab/Hackett_2020;hackett_2020;1666BrentLab/harbison_2004/harbison_2004
18347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1669BrentLab/harbison_2004/harbison_2004
19347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1664BrentLab/harbison_2004/harbison_2004
20347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1663BrentLab/harbison_2004/harbison_2004
21347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1670BrentLab/harbison_2004/harbison_2004
22347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1668BrentLab/harbison_2004/harbison_2004
23347FHL1SMNaNBrentLab/Hackett_2020;hackett_2020;1665BrentLab/harbison_2004/harbison_2004
24348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1664BrentLab/harbison_2004/harbison_2004
25348FHL1YPD0.089578BrentLab/Hackett_2020;hackett_2020;1666BrentLab/harbison_2004/harbison_2004
26348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1663BrentLab/harbison_2004/harbison_2004
27348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1667BrentLab/harbison_2004/harbison_2004
28348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1669BrentLab/harbison_2004/harbison_2004
29348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1665BrentLab/harbison_2004/harbison_2004
30348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1670BrentLab/harbison_2004/harbison_2004
31348FHL1YPDNaNBrentLab/Hackett_2020;hackett_2020;1668BrentLab/harbison_2004/harbison_2004
\n", - "
" - ], - "text/plain": [ - " sample_id regulator_symbol condition dto_fdr \\\n", - "0 345 FHL1 H2O2Hi 0.454909 \n", - "1 345 FHL1 H2O2Hi NaN \n", - "2 345 FHL1 H2O2Hi NaN \n", - "3 345 FHL1 H2O2Hi NaN \n", - "4 345 FHL1 H2O2Hi NaN \n", - "5 345 FHL1 H2O2Hi NaN \n", - "6 345 FHL1 H2O2Hi NaN \n", - "7 345 FHL1 H2O2Hi NaN \n", - "8 346 FHL1 RAPA NaN \n", - "9 346 FHL1 RAPA NaN \n", - "10 346 FHL1 RAPA NaN \n", - "11 346 FHL1 RAPA NaN \n", - "12 346 FHL1 RAPA 0.000000 \n", - "13 346 FHL1 RAPA NaN \n", - "14 346 FHL1 RAPA NaN \n", - "15 346 FHL1 RAPA NaN \n", - "16 347 FHL1 SM NaN \n", - "17 347 FHL1 SM 0.022196 \n", - "18 347 FHL1 SM NaN \n", - "19 347 FHL1 SM NaN \n", - "20 347 FHL1 SM NaN \n", - "21 347 FHL1 SM NaN \n", - "22 347 FHL1 SM NaN \n", - "23 347 FHL1 SM NaN \n", - "24 348 FHL1 YPD NaN \n", - "25 348 FHL1 YPD 0.089578 \n", - "26 348 FHL1 YPD NaN \n", - "27 348 FHL1 YPD NaN \n", - "28 348 FHL1 YPD NaN \n", - "29 348 FHL1 YPD NaN \n", - "30 348 FHL1 YPD NaN \n", - "31 348 FHL1 YPD NaN \n", - "\n", - " perturbation_id \\\n", - "0 BrentLab/Hackett_2020;hackett_2020;1666 \n", - "1 BrentLab/Hackett_2020;hackett_2020;1665 \n", - "2 BrentLab/Hackett_2020;hackett_2020;1667 \n", - "3 BrentLab/Hackett_2020;hackett_2020;1669 \n", - "4 BrentLab/Hackett_2020;hackett_2020;1663 \n", - "5 BrentLab/Hackett_2020;hackett_2020;1664 \n", - "6 BrentLab/Hackett_2020;hackett_2020;1670 \n", - "7 BrentLab/Hackett_2020;hackett_2020;1668 \n", - "8 BrentLab/Hackett_2020;hackett_2020;1667 \n", - "9 BrentLab/Hackett_2020;hackett_2020;1663 \n", - "10 BrentLab/Hackett_2020;hackett_2020;1670 \n", - "11 BrentLab/Hackett_2020;hackett_2020;1668 \n", - "12 BrentLab/Hackett_2020;hackett_2020;1666 \n", - "13 BrentLab/Hackett_2020;hackett_2020;1669 \n", - "14 BrentLab/Hackett_2020;hackett_2020;1664 \n", - "15 BrentLab/Hackett_2020;hackett_2020;1665 \n", - "16 BrentLab/Hackett_2020;hackett_2020;1667 \n", - "17 BrentLab/Hackett_2020;hackett_2020;1666 \n", - "18 BrentLab/Hackett_2020;hackett_2020;1669 \n", - "19 BrentLab/Hackett_2020;hackett_2020;1664 \n", - "20 BrentLab/Hackett_2020;hackett_2020;1663 \n", - "21 BrentLab/Hackett_2020;hackett_2020;1670 \n", - "22 BrentLab/Hackett_2020;hackett_2020;1668 \n", - "23 BrentLab/Hackett_2020;hackett_2020;1665 \n", - "24 BrentLab/Hackett_2020;hackett_2020;1664 \n", - "25 BrentLab/Hackett_2020;hackett_2020;1666 \n", - "26 BrentLab/Hackett_2020;hackett_2020;1663 \n", - "27 BrentLab/Hackett_2020;hackett_2020;1667 \n", - "28 BrentLab/Hackett_2020;hackett_2020;1669 \n", - "29 BrentLab/Hackett_2020;hackett_2020;1665 \n", - "30 BrentLab/Hackett_2020;hackett_2020;1670 \n", - "31 BrentLab/Hackett_2020;hackett_2020;1668 \n", - "\n", - " dataset_id \n", - "0 BrentLab/harbison_2004/harbison_2004 \n", - "1 BrentLab/harbison_2004/harbison_2004 \n", - "2 BrentLab/harbison_2004/harbison_2004 \n", - "3 BrentLab/harbison_2004/harbison_2004 \n", - "4 BrentLab/harbison_2004/harbison_2004 \n", - "5 BrentLab/harbison_2004/harbison_2004 \n", - "6 BrentLab/harbison_2004/harbison_2004 \n", - "7 BrentLab/harbison_2004/harbison_2004 \n", - "8 BrentLab/harbison_2004/harbison_2004 \n", - "9 BrentLab/harbison_2004/harbison_2004 \n", - "10 BrentLab/harbison_2004/harbison_2004 \n", - "11 BrentLab/harbison_2004/harbison_2004 \n", - "12 BrentLab/harbison_2004/harbison_2004 \n", - "13 BrentLab/harbison_2004/harbison_2004 \n", - "14 BrentLab/harbison_2004/harbison_2004 \n", - "15 BrentLab/harbison_2004/harbison_2004 \n", - "16 BrentLab/harbison_2004/harbison_2004 \n", - "17 BrentLab/harbison_2004/harbison_2004 \n", - 
"18 BrentLab/harbison_2004/harbison_2004 \n", - "19 BrentLab/harbison_2004/harbison_2004 \n", - "20 BrentLab/harbison_2004/harbison_2004 \n", - "21 BrentLab/harbison_2004/harbison_2004 \n", - "22 BrentLab/harbison_2004/harbison_2004 \n", - "23 BrentLab/harbison_2004/harbison_2004 \n", - "24 BrentLab/harbison_2004/harbison_2004 \n", - "25 BrentLab/harbison_2004/harbison_2004 \n", - "26 BrentLab/harbison_2004/harbison_2004 \n", - "27 BrentLab/harbison_2004/harbison_2004 \n", - "28 BrentLab/harbison_2004/harbison_2004 \n", - "29 BrentLab/harbison_2004/harbison_2004 \n", - "30 BrentLab/harbison_2004/harbison_2004 \n", - "31 BrentLab/harbison_2004/harbison_2004 " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Query harbison_2004 binding data enriched with DTO metrics\n", - "# This demonstrates field-based joins: requesting dto_fdr field\n", - "# while querying the primary binding dataset\n", - "\n", - "binding_with_dto = vdb.query(\n", - " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", - " filters={\"regulator_symbol\": \"FHL1\"},\n", - " fields=[\"sample_id\", \"regulator_symbol\", \"condition\", \"dto_fdr\", \"binding_id\", \"perturbation_id\"],\n", - ")\n", - "\n", - "print(f\"Found {len(binding_with_dto)} FHL1 binding measurements\")\n", - "print(f\"\\nColumns: {list(binding_with_dto.columns)}\")\n", - "print(f\"\\nRows with DTO data: {binding_with_dto['dto_fdr'].notna().sum()}\")\n", - "print(f\"\\nFirst few results:\")\n", - "binding_with_dto" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "['GCN4', 'MAC1', 'SWI1', 'Z3EV', 'RDS2', 'GEV']\n" + ] + } + ], + "source": [ + "# GEV is another \"regulator\" we want to exclude\n", + "replicated_hackett_regulators.append(\"GEV\")\n", + "print(replicated_hackett_regulators)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "abed8bc2", + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 122760.12it/s]\n", - "Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 35951.18it/s]\n" - ] - }, - { - "data": { - "application/vnd.microsoft.datawrangler.viewer.v0+json": { - "columns": [ - { - "name": "index", - "rawType": "int64", - "type": "integer" - }, - { - "name": "sample_id", - "rawType": "int32", - "type": "integer" - }, - { - "name": "regulator_symbol", - "rawType": "object", - "type": "string" - }, - { - "name": "perturbation_id", - "rawType": "object", - "type": "string" - }, - { - "name": "dto_empirical_pvalue", - "rawType": "float64", - "type": "float" - }, - { - "name": "dataset_id", - "rawType": "object", - "type": "string" - } - ], - "ref": "f666fc22-ce67-46fc-80bb-c44baafdf799", - "rows": [ - [ - "0", - "347", - "FHL1", - "BrentLab/Hackett_2020;hackett_2020;1666", - "0.297", - "BrentLab/harbison_2004/harbison_2004" - ] - ], - "shape": { - "columns": 5, - "rows": 1 - } - }, - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_idregulator_symbolperturbation_iddto_empirical_pvaluedataset_id
0347FHL1BrentLab/Hackett_2020;hackett_2020;16660.297BrentLab/harbison_2004/harbison_2004
\n", - "
" - ], - "text/plain": [ - " sample_id regulator_symbol perturbation_id \\\n", - "0 347 FHL1 BrentLab/Hackett_2020;hackett_2020;1666 \n", - "\n", - " dto_empirical_pvalue dataset_id \n", - "0 0.297 BrentLab/harbison_2004/harbison_2004 " - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "binding_id", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_rank_threshold", + "rawType": "float64", + "type": "float" + }, + { + "name": "perturbation_rank_threshold", + "rawType": "float64", + "type": "float" + }, + { + "name": "binding_set_size", + "rawType": "float64", + "type": "float" + }, + { + "name": "perturbation_set_size", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_fdr", + "rawType": "float64", + "type": "float" + }, + { + "name": "dto_empirical_pvalue", + "rawType": "float64", + "type": "float" + }, + { + "name": "binding_repo_dataset", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_repo_dataset", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_id_id", + "rawType": "object", + "type": "string" + }, + { + "name": "binding_id_source", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id_id", + "rawType": "object", + "type": "string" + }, + { + "name": "perturbation_id_source", + "rawType": "object", + "type": "string" } + ], + "ref": "b9dead21-45e7-491d-82d4-a2358af05efe", + "rows": [ + [ + "0", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;85", + "2.0", + "2.0", + "3.0", + "2.0", + "0.0002250900360144", + "0.004", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "85", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "1", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;83", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "83", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "2", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;84", + "2.0", + "1.0", + "3.0", + "1.0", + "0.0", + "0.011", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "84", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "3", + "BrentLab/harbison_2004;harbison_2004;4", + "BrentLab/Hackett_2020;hackett_2020;78", + "487.0", + "96.0", + "479.0", + "92.0", + "0.4121918908550328", + "0.576", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "4", + "harbison", + "78", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "4", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;81", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "81", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "5", + "BrentLab/harbison_2004;harbison_2004;2", + "BrentLab/Hackett_2020;hackett_2020;33", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "2", + "harbison", + "33", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "6", + "BrentLab/harbison_2004;harbison_2004;4", + 
"BrentLab/Hackett_2020;hackett_2020;73", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "4", + "harbison", + "73", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "7", + "BrentLab/harbison_2004;harbison_2004;7", + "BrentLab/Hackett_2020;hackett_2020;47", + "407.0", + "310.0", + "378.0", + "306.0", + "0.2038622347205313", + "0.441", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "7", + "harbison", + "47", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "8", + "BrentLab/harbison_2004;harbison_2004;7", + "BrentLab/Hackett_2020;hackett_2020;46", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "7", + "harbison", + "46", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "9", + "BrentLab/harbison_2004;harbison_2004;7", + "BrentLab/Hackett_2020;hackett_2020;45", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "7", + "harbison", + "45", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "10", + "BrentLab/harbison_2004;harbison_2004;8", + "BrentLab/Hackett_2020;hackett_2020;48", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "8", + "harbison", + "48", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "11", + "BrentLab/harbison_2004;harbison_2004;2", + "BrentLab/Hackett_2020;hackett_2020;34", + "198.0", + "26.0", + "193.0", + "24.0", + "0.7367526600236447", + "0.512", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "2", + "harbison", + "34", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "12", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;88", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "88", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "13", + "BrentLab/harbison_2004;harbison_2004;4", + "BrentLab/Hackett_2020;hackett_2020;79", + "278.0", + "82.0", + "275.0", + "76.0", + "0.3669436052366566", + "0.531", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "4", + "harbison", + "79", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "14", + "BrentLab/harbison_2004;harbison_2004;4", + "BrentLab/Hackett_2020;hackett_2020;74", + "386.0", + "2.0", + "381.0", + "2.0", + "0.0478033736153071", + "0.596", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "4", + "harbison", + "74", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "15", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;87", + "2.0", + "2.0", + "3.0", + "2.0", + "0.0002250900360144", + "0.01", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "87", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "16", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;82", + "2.0", + "2.0", + "3.0", + "2.0", + "0.0002250900360144", + "0.005", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "82", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "17", + "BrentLab/harbison_2004;harbison_2004;2", + "BrentLab/Hackett_2020;hackett_2020;40", + "233.0", + "887.0", + "228.0", + "853.0", + "0.4419109947643979", + "0.306", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "2", + "harbison", + "40", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "18", + 
"BrentLab/harbison_2004;harbison_2004;2", + "BrentLab/Hackett_2020;hackett_2020;37", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "2", + "harbison", + "37", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "19", + "BrentLab/harbison_2004;harbison_2004;3", + "BrentLab/Hackett_2020;hackett_2020;86", + "2.0", + "2.0", + "3.0", + "2.0", + "0.0002250900360144", + "0.014", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "3", + "harbison", + "86", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "20", + "BrentLab/harbison_2004;harbison_2004;4", + "BrentLab/Hackett_2020;hackett_2020;75", + "386.0", + "4.0", + "381.0", + "4.0", + "0.1752790365894595", + "0.871", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "4", + "harbison", + "75", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "21", + "BrentLab/harbison_2004;harbison_2004;4", + "BrentLab/Hackett_2020;hackett_2020;77", + "487.0", + "15.0", + "479.0", + "13.0", + "0.1591137965760322", + "0.23", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "4", + "harbison", + "77", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "22", + "BrentLab/harbison_2004;harbison_2004;2", + "BrentLab/Hackett_2020;hackett_2020;38", + "28.0", + "394.0", + "29.0", + "375.0", + "0.1464068569498395", + "0.309", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "2", + "harbison", + "38", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "23", + "BrentLab/harbison_2004;harbison_2004;2", + "BrentLab/Hackett_2020;hackett_2020;36", + "242.0", + "239.0", + "237.0", + "230.0", + "0.4474384543548884", + "0.644", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "2", + "harbison", + "36", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "24", + "BrentLab/harbison_2004;harbison_2004;2", + "BrentLab/Hackett_2020;hackett_2020;35", + "12.0", + "136.0", + "12.0", + "129.0", + "0.1014820131734504", + "0.411", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "2", + "harbison", + "35", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "25", + "BrentLab/harbison_2004;harbison_2004;2", + "BrentLab/Hackett_2020;hackett_2020;39", + "236.0", + "462.0", + "231.0", + "442.0", + "0.4406392501266677", + "0.536", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "2", + "harbison", + "39", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "26", + "BrentLab/harbison_2004;harbison_2004;5", + "BrentLab/Hackett_2020;hackett_2020;65", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "5", + "harbison", + "65", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "27", + "BrentLab/harbison_2004;harbison_2004;4", + "BrentLab/Hackett_2020;hackett_2020;80", + "386.0", + "12.0", + "381.0", + "11.0", + "0.1530190500167841", + "0.26", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "4", + "harbison", + "80", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "28", + "BrentLab/harbison_2004;harbison_2004;4", + "BrentLab/Hackett_2020;hackett_2020;76", + "386.0", + "13.0", + "381.0", + "13.0", + "0.3335221550855992", + "0.723", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "4", + "harbison", + "76", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "29", + "BrentLab/harbison_2004;harbison_2004;10", + "BrentLab/Hackett_2020;hackett_2020;48", + "467.0", + "60.0", + "454.0", + "60.0", + "0.1983655120981107", + "0.035", + 
"harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "10", + "harbison", + "48", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "30", + "BrentLab/harbison_2004;harbison_2004;10", + "BrentLab/Hackett_2020;hackett_2020;47", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "10", + "harbison", + "47", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "31", + "BrentLab/harbison_2004;harbison_2004;10", + "BrentLab/Hackett_2020;hackett_2020;46", + "284.0", + "47.0", + "278.0", + "46.0", + "0.0992715955737997", + "0.003", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "10", + "harbison", + "46", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "32", + "BrentLab/harbison_2004;harbison_2004;11", + "BrentLab/Hackett_2020;hackett_2020;48", + "472.0", + "1.0", + "459.0", + "1.0", + "0.0", + "0.915", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "11", + "harbison", + "48", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "33", + "BrentLab/harbison_2004;harbison_2004;7", + "BrentLab/Hackett_2020;hackett_2020;41", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "7", + "harbison", + "41", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "34", + "BrentLab/harbison_2004;harbison_2004;16", + "BrentLab/Hackett_2020;hackett_2020;89", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "16", + "harbison", + "89", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "35", + "BrentLab/harbison_2004;harbison_2004;8", + "BrentLab/Hackett_2020;hackett_2020;41", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "8", + "harbison", + "41", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "36", + "BrentLab/harbison_2004;harbison_2004;7", + "BrentLab/Hackett_2020;hackett_2020;43", + "2.0", + "330.0", + "2.0", + "318.0", + "0.0", + "0.195", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "7", + "harbison", + "43", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "37", + "BrentLab/harbison_2004;harbison_2004;16", + "BrentLab/Hackett_2020;hackett_2020;91", + "9.0", + "1.0", + "9.0", + "1.0", + "0.0", + "0.019", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "16", + "harbison", + "91", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "38", + "BrentLab/harbison_2004;harbison_2004;17", + "BrentLab/Hackett_2020;hackett_2020;91", + "2.0", + "1.0", + "2.0", + "1.0", + "0.0", + "0.008", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "17", + "harbison", + "91", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "39", + "BrentLab/harbison_2004;harbison_2004;8", + "BrentLab/Hackett_2020;hackett_2020;43", + "290.0", + "412.0", + "278.0", + "386.0", + "0.4521656634210855", + "0.208", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "8", + "harbison", + "43", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "40", + "BrentLab/harbison_2004;harbison_2004;5", + "BrentLab/Hackett_2020;hackett_2020;66", + "398.0", + "16.0", + "390.0", + "15.0", + "0.2406042358803986", + "0.431", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "5", + "harbison", + "66", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "41", + "BrentLab/harbison_2004;harbison_2004;7", + "BrentLab/Hackett_2020;hackett_2020;42", + "122.0", + "212.0", + "120.0", + "206.0", + 
"0.3447911486822476", + "0.49", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "7", + "harbison", + "42", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "42", + "BrentLab/harbison_2004;harbison_2004;5", + "BrentLab/Hackett_2020;hackett_2020;72", + "346.0", + "18.0", + "338.0", + "16.0", + "0.22671996124031", + "0.528", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "5", + "harbison", + "72", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "43", + "BrentLab/harbison_2004;harbison_2004;5", + "BrentLab/Hackett_2020;hackett_2020;69", + "118.0", + "120.0", + "115.0", + "113.0", + "0.3139880952380952", + "0.454", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "5", + "harbison", + "69", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "44", + "BrentLab/harbison_2004;harbison_2004;20", + "BrentLab/Hackett_2020;hackett_2020;99", + "3.0", + "1.0", + "4.0", + "1.0", + "0.0", + "0.006", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "20", + "harbison", + "99", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "45", + "BrentLab/harbison_2004;harbison_2004;5", + "BrentLab/Hackett_2020;hackett_2020;70", + "260.0", + "17.0", + "256.0", + "17.0", + "0.1850671373200443", + "0.455", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "5", + "harbison", + "70", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "46", + "BrentLab/harbison_2004;harbison_2004;5", + "BrentLab/Hackett_2020;hackett_2020;67", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "5", + "harbison", + "67", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "47", + "BrentLab/harbison_2004;harbison_2004;20", + "BrentLab/Hackett_2020;hackett_2020;97", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "20", + "harbison", + "97", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "48", + "BrentLab/harbison_2004;harbison_2004;5", + "BrentLab/Hackett_2020;hackett_2020;68", + "260.0", + "2.0", + "256.0", + "2.0", + "0.0317379568106312", + "0.647", + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "5", + "harbison", + "68", + "BrentLab/Hackett_2020;hackett_2020" + ], + [ + "49", + "BrentLab/harbison_2004;harbison_2004;10", + "BrentLab/Hackett_2020;hackett_2020;41", + null, + null, + null, + null, + null, + null, + "harbison_2004-harbison_2004", + "Hackett_2020-hackett_2020", + "10", + "harbison", + "41", + "BrentLab/Hackett_2020;hackett_2020" + ] + ], + "shape": { + "columns": 14, + "rows": 9604 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
binding_idperturbation_idbinding_rank_thresholdperturbation_rank_thresholdbinding_set_sizeperturbation_set_sizedto_fdrdto_empirical_pvaluebinding_repo_datasetperturbation_repo_datasetbinding_id_idbinding_id_sourceperturbation_id_idperturbation_id_source
0BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;852.02.03.02.00.0002250.004harbison_2004-harbison_2004Hackett_2020-hackett_20203harbison85BrentLab/Hackett_2020;hackett_2020
1BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;83NaNNaNNaNNaNNaNNaNharbison_2004-harbison_2004Hackett_2020-hackett_20203harbison83BrentLab/Hackett_2020;hackett_2020
2BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;842.01.03.01.00.0000000.011harbison_2004-harbison_2004Hackett_2020-hackett_20203harbison84BrentLab/Hackett_2020;hackett_2020
3BrentLab/harbison_2004;harbison_2004;4BrentLab/Hackett_2020;hackett_2020;78487.096.0479.092.00.4121920.576harbison_2004-harbison_2004Hackett_2020-hackett_20204harbison78BrentLab/Hackett_2020;hackett_2020
4BrentLab/harbison_2004;harbison_2004;3BrentLab/Hackett_2020;hackett_2020;81NaNNaNNaNNaNNaNNaNharbison_2004-harbison_2004Hackett_2020-hackett_20203harbison81BrentLab/Hackett_2020;hackett_2020
.............................................
9599BrentLab/callingcards;annotated_features;804BrentLab/kemmeren_2014;kemmeren_2014;90114.039.013.039.00.0008790.000callingcards-annotated_featureskemmeren_2014-kemmeren_2014804BrentLab/callingcards;annotated_features901kemmeren
9600BrentLab/callingcards;annotated_features;805BrentLab/kemmeren_2014;kemmeren_2014;105318.0278.017.0171.00.0014550.000callingcards-annotated_featureskemmeren_2014-kemmeren_2014805BrentLab/callingcards;annotated_features1053kemmeren
9601BrentLab/callingcards;annotated_features;808BrentLab/kemmeren_2014;kemmeren_2014;21820.057.019.027.00.0031160.000callingcards-annotated_featureskemmeren_2014-kemmeren_2014808BrentLab/callingcards;annotated_features218kemmeren
9602BrentLab/callingcards;annotated_features;806BrentLab/kemmeren_2014;kemmeren_2014;102310.09.011.09.00.0000000.000callingcards-annotated_featureskemmeren_2014-kemmeren_2014806BrentLab/callingcards;annotated_features1023kemmeren
9603BrentLab/callingcards;annotated_features;809BrentLab/kemmeren_2014;kemmeren_2014;913150.0221.0140.0206.00.1168900.000callingcards-annotated_featureskemmeren_2014-kemmeren_2014809BrentLab/callingcards;annotated_features913kemmeren
\n", + "

9604 rows × 14 columns

\n", + "
" ], - "source": [ - "# You can also filter on comparative dataset fields\n", - "# This returns only binding measurements with significant DTO results\n", - "\n", - "significant_dtos = vdb.query(\n", - " datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n", - " filters={\n", - " \"regulator_symbol\": \"FHL1\",\n", - " # the threshold is high here b/c FHL1 didn't have significant results in harbison\n", - " \"dto_empirical_pvalue\": (\"<\", 0.5)\n", - " },\n", - " fields=[\"sample_id\", \"regulator_symbol\", \"target_symbol\", \"perturbation_id\", \"dto_empirical_pvalue\"],\n", - ")\n", - "\n", - "significant_dtos" + "text/plain": [ + " binding_id \\\n", + "0 BrentLab/harbison_2004;harbison_2004;3 \n", + "1 BrentLab/harbison_2004;harbison_2004;3 \n", + "2 BrentLab/harbison_2004;harbison_2004;3 \n", + "3 BrentLab/harbison_2004;harbison_2004;4 \n", + "4 BrentLab/harbison_2004;harbison_2004;3 \n", + "... ... \n", + "9599 BrentLab/callingcards;annotated_features;804 \n", + "9600 BrentLab/callingcards;annotated_features;805 \n", + "9601 BrentLab/callingcards;annotated_features;808 \n", + "9602 BrentLab/callingcards;annotated_features;806 \n", + "9603 BrentLab/callingcards;annotated_features;809 \n", + "\n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/Hackett_2020;hackett_2020;85 2.0 \n", + "1 BrentLab/Hackett_2020;hackett_2020;83 NaN \n", + "2 BrentLab/Hackett_2020;hackett_2020;84 2.0 \n", + "3 BrentLab/Hackett_2020;hackett_2020;78 487.0 \n", + "4 BrentLab/Hackett_2020;hackett_2020;81 NaN \n", + "... ... ... \n", + "9599 BrentLab/kemmeren_2014;kemmeren_2014;901 14.0 \n", + "9600 BrentLab/kemmeren_2014;kemmeren_2014;1053 18.0 \n", + "9601 BrentLab/kemmeren_2014;kemmeren_2014;218 20.0 \n", + "9602 BrentLab/kemmeren_2014;kemmeren_2014;1023 10.0 \n", + "9603 BrentLab/kemmeren_2014;kemmeren_2014;913 150.0 \n", + "\n", + " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", + "0 2.0 3.0 2.0 \n", + "1 NaN NaN NaN \n", + "2 1.0 3.0 1.0 \n", + "3 96.0 479.0 92.0 \n", + "4 NaN NaN NaN \n", + "... ... ... ... \n", + "9599 39.0 13.0 39.0 \n", + "9600 278.0 17.0 171.0 \n", + "9601 57.0 19.0 27.0 \n", + "9602 9.0 11.0 9.0 \n", + "9603 221.0 140.0 206.0 \n", + "\n", + " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n", + "0 0.000225 0.004 harbison_2004-harbison_2004 \n", + "1 NaN NaN harbison_2004-harbison_2004 \n", + "2 0.000000 0.011 harbison_2004-harbison_2004 \n", + "3 0.412192 0.576 harbison_2004-harbison_2004 \n", + "4 NaN NaN harbison_2004-harbison_2004 \n", + "... ... ... ... \n", + "9599 0.000879 0.000 callingcards-annotated_features \n", + "9600 0.001455 0.000 callingcards-annotated_features \n", + "9601 0.003116 0.000 callingcards-annotated_features \n", + "9602 0.000000 0.000 callingcards-annotated_features \n", + "9603 0.116890 0.000 callingcards-annotated_features \n", + "\n", + " perturbation_repo_dataset binding_id_id \\\n", + "0 Hackett_2020-hackett_2020 3 \n", + "1 Hackett_2020-hackett_2020 3 \n", + "2 Hackett_2020-hackett_2020 3 \n", + "3 Hackett_2020-hackett_2020 4 \n", + "4 Hackett_2020-hackett_2020 3 \n", + "... ... ... \n", + "9599 kemmeren_2014-kemmeren_2014 804 \n", + "9600 kemmeren_2014-kemmeren_2014 805 \n", + "9601 kemmeren_2014-kemmeren_2014 808 \n", + "9602 kemmeren_2014-kemmeren_2014 806 \n", + "9603 kemmeren_2014-kemmeren_2014 809 \n", + "\n", + " binding_id_source perturbation_id_id \\\n", + "0 harbison 85 \n", + "1 harbison 83 \n", + "2 harbison 84 \n", + "3 harbison 78 \n", + "4 harbison 81 \n", + "... ... ... 
\n", + "9599 BrentLab/callingcards;annotated_features 901 \n", + "9600 BrentLab/callingcards;annotated_features 1053 \n", + "9601 BrentLab/callingcards;annotated_features 218 \n", + "9602 BrentLab/callingcards;annotated_features 1023 \n", + "9603 BrentLab/callingcards;annotated_features 913 \n", + "\n", + " perturbation_id_source \n", + "0 BrentLab/Hackett_2020;hackett_2020 \n", + "1 BrentLab/Hackett_2020;hackett_2020 \n", + "2 BrentLab/Hackett_2020;hackett_2020 \n", + "3 BrentLab/Hackett_2020;hackett_2020 \n", + "4 BrentLab/Hackett_2020;hackett_2020 \n", + "... ... \n", + "9599 kemmeren \n", + "9600 kemmeren \n", + "9601 kemmeren \n", + "9602 kemmeren \n", + "9603 kemmeren \n", + "\n", + "[9604 rows x 14 columns]" ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": "tfbpapi-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" + ], + "source": [ + "vdb.query(\"SELECT * FROM dto_expanded\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cell-25", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " sample_id regulator_symbol time mechanism \\\n", + "0 448 ACA1 15.0 ZEV \n", + "1 448 ACA1 15.0 ZEV \n", + "2 448 ACA1 15.0 ZEV \n", + "3 448 ACA1 15.0 ZEV \n", + "4 448 ACA1 15.0 ZEV \n", + "\n", + " binding_id \\\n", + "0 BrentLab/callingcards;annotated_features;803 \n", + "1 BrentLab/callingcards;annotated_features;156 \n", + "2 BrentLab/callingcards;annotated_features;126 \n", + "3 BrentLab/callingcards;annotated_features;189 \n", + "4 BrentLab/callingcards;annotated_features;146 \n", + "\n", + " perturbation_id binding_rank_threshold \\\n", + "0 BrentLab/Hackett_2020;hackett_2020;448 112.0 \n", + "1 BrentLab/Hackett_2020;hackett_2020;448 31.0 \n", + "2 BrentLab/Hackett_2020;hackett_2020;448 21.0 \n", + "3 BrentLab/Hackett_2020;hackett_2020;448 164.0 \n", + "4 BrentLab/Hackett_2020;hackett_2020;448 23.0 \n", + "\n", + " perturbation_rank_threshold binding_set_size perturbation_set_size \\\n", + "0 98.0 108.0 90.0 \n", + "1 98.0 26.0 90.0 \n", + "2 98.0 17.0 90.0 \n", + "3 154.0 150.0 144.0 \n", + "4 98.0 18.0 90.0 \n", + "\n", + " dto_fdr dto_empirical_pvalue binding_repo_dataset \\\n", + "0 0.187319 0.074 callingcards-annotated_features \n", + "1 0.072561 0.047 callingcards-annotated_features \n", + "2 0.061941 0.071 callingcards-annotated_features \n", + "3 0.213716 0.011 callingcards-annotated_features \n", + "4 0.066616 0.171 callingcards-annotated_features \n", + "\n", + " perturbation_repo_dataset binding_id_id \\\n", + "0 Hackett_2020-hackett_2020 803 \n", + "1 Hackett_2020-hackett_2020 156 \n", + "2 Hackett_2020-hackett_2020 126 \n", + "3 Hackett_2020-hackett_2020 189 \n", + "4 Hackett_2020-hackett_2020 146 \n", + "\n", + " binding_id_source perturbation_id_id \\\n", + "0 BrentLab/callingcards;annotated_features 448 \n", + "1 BrentLab/callingcards;annotated_features 448 \n", + "2 BrentLab/callingcards;annotated_features 448 \n", + "3 BrentLab/callingcards;annotated_features 448 \n", + "4 BrentLab/callingcards;annotated_features 448 \n", + "\n", + " perturbation_id_source \n", + "0 BrentLab/Hackett_2020;hackett_2020 \n", + "1 BrentLab/Hackett_2020;hackett_2020 \n", + "2 
BrentLab/Hackett_2020;hackett_2020 \n", + "3 BrentLab/Hackett_2020;hackett_2020 \n", + "4 BrentLab/Hackett_2020;hackett_2020 \n" + ] } ], + "source": [ + "# We can remove those regulators from our query using a parameterized query\n", + "hackett_harbison_dto = vdb.query(\"\"\"\n", + "SELECT h.sample_id, h.regulator_symbol, h.time, h.mechanism,\n", + " dto.*\n", + "FROM hackett_meta h\n", + "LEFT JOIN (\n", + " SELECT *\n", + " FROM dto_expanded\n", + ") AS dto\n", + "ON CAST(h.sample_id AS VARCHAR) = dto.perturbation_id_id\n", + "WHERE h.regulator_symbol NOT IN $replicated_hackett_regulators\n", + " AND h.mechanism = 'ZEV'\n", + " AND h.restriction = 'P'\n", + " AND h.time = 15\n", + "ORDER BY h.regulator_symbol, h.time, h.mechanism\n", + "\"\"\",\n", + " replicated_hackett_regulators=replicated_hackett_regulators\n", + ")\n", + "print(hackett_harbison_dto.head())" ] }, { "cell_type": "code", "execution_count": 18, "id": "cell-26", "metadata": {}, "outputs": [], "source": [ + "# Clean up temp file\n", + "temp_config.unlink(missing_ok=True)" ] } ], "metadata": { "kernelspec": { "display_name": "tfbpapi-py3.11", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 } diff --git a/docs/virtual_database_concepts.md b/docs/virtual_database_concepts.md deleted file mode 100644 index 55fe6c9..0000000 --- a/docs/virtual_database_concepts.md +++ /dev/null @@ -1,525 +0,0 @@ -# Virtual Database - -VirtualDB provides a unified query interface across heterogeneous datasets with -different experimental condition structures and terminologies. Each dataset -defines experimental conditions in its own way, with properties stored at -different hierarchy levels (repository, dataset, or field) and using different -naming conventions. VirtualDB uses an external YAML configuration to map these -varying structures to a common schema, normalize factor level names (e.g., -"D-glucose", "dextrose", "glu" all become "glucose"), and enable cross-dataset -queries with standardized field names and values. - -## Configuration Structure - -This is a basic example of a VirtualDB configuration YAML file: - -```yaml -repositories: - # Each repository defines a "table" in the virtual database - BrentLab/harbison_2004: - # REQUIRED: Specify which field is the sample identifier. At this level, it means
- sample_id: - field: sample_id - # Repository-wide properties (apply to all datasets in this repository) - nitrogen_source: - path: media.nitrogen_source.name - - dataset: - # Each dataset gets its own view with standardized fields - harbison_2004: - # Dataset-specific properties (constant for all samples) - phosphate_source: - path: media.phosphate_source.compound - - # Field-level properties (vary per sample) - carbon_source: - field: condition - path: media.carbon_source.compound - dtype: string # Optional: specify data type - - # Field without path (column alias with normalization) - environmental_condition: - field: condition - - # if there is a `comparative_analysis` dataset that you want to link to - # a given dataset, you can declare it at the dataset level - # For more information on this section, see the section - # 'Comparative Datasets in VirtualDB' - comparative_analyses: - # specify the comparative analysis repo - - repo: BrentLab/yeast_comparative_analysis - # and dataset - dataset: dto - # and the field in the comparative analysis that links back tot this - # dataset. Note that this field should have role `source_sample`, and it - # should therefore be formated as `repo_id;config_name;sample_id` where the - # sample_id is derived from the field in this dataset that is specified - # for this dataset in the `sample_id` field above. - via_field: perturbation_id - - BrentLab/kemmeren_2014: - dataset: - kemmeren_2014: - # REQUIRED: If `sample_id` isn't defined at the repo level, then it must be - # defined at the dataset level for each dataset in the repo - sample_id: - field: sample_id - # Same logical fields, different physical paths - carbon_source: - path: media.carbon_source.compound - dtype: string - temperature_celsius: - path: temperature_celsius - dtype: numeric # Enables numeric filtering with comparison operators - -# ===== Normalization Rules ===== -# Map varying terminologies to standardized values -factor_aliases: - carbon_source: - glucose: [D-glucose, glu, dextrose] - galactose: [D-galactose, gal] - -# Handle missing values with defaults -missing_value_labels: - carbon_source: "unspecified" - -# ===== Documentation ===== -description: - carbon_source: The carbon source provided to the cells during growth -``` - -### Property Hierarchy - -Properties are extracted at three hierarchy levels: - -1. **Repository-wide**: Common to all datasets in a repository - - Paths relative to repository-level `experimental_conditions` - - Example: `path: media.nitrogen_source.name` - -2. **Dataset-specific**: Specific to one dataset configuration - - Paths relative to config-level `experimental_conditions` - - Example: `path: media.phosphate_source.compound` - -3. 
**Field-level**: Vary per sample, defined in field definitions - - `field` specifies which field to extract from - - `path` relative to field definitions (not `experimental_conditions`) - - Example: `field: condition, path: media.carbon_source.compound` - -**Special case**: Field without path creates a column alias -- `field: condition` (no path) to renames `condition` column, enables normalization - -### Path Resolution - -Paths use dot notation to navigate nested structures: - -**Repository/Dataset-level** (automatically prepends `experimental_conditions.`): -- `path: temperature_celsius` to `experimental_conditions.temperature_celsius` -- `path: media.carbon_source.compound` to - `experimental_conditions.media.carbon_source.compound` - -**Field-level** (paths relative to field definitions): -- `field: condition, path: media.carbon_source.compound` to looks in field -`condition`'s definitions to navigates to `media.carbon_source.compound` - -### Data Type Specifications - -Field mappings support an optional `dtype` parameter to ensure proper type handling -during metadata extraction and query filtering. - -**Supported dtypes**: -- `string` - Text data (default if not specified) -- `numeric` - Numeric values (integers or floating-point numbers) -- `bool` - Boolean values (true/false) - -**When to use dtype**: - -1. **Numeric filtering**: Required for fields used with comparison operators - (`<`, `>`, `<=`, `>=`, `between`) -2. **Type consistency**: When source data might be extracted with incorrect type -3. **Performance**: Helps with query optimization and prevents type mismatches - -**Type conversion process**: - -Type conversion happens during metadata extraction: -1. Extract value from source using path -2. Convert to specified dtype if provided -3. Store in metadata DataFrame with correct type - -**Example - The problem**: -```python -# Without dtype: temperature extracted as string "30" -# Comparison fails or produces incorrect results -df = vdb.query(filters={"temperature_celsius": (">", 25)}) -# String comparison: "30" > 25 evaluates incorrectly -``` - -**Example - The solution**: -```yaml -temperature_celsius: - path: temperature_celsius - dtype: numeric # Ensures numeric type for proper comparison -``` - -```python -# With dtype: temperature extracted as numeric 30.0 -# Comparison works correctly -df = vdb.query(filters={"temperature_celsius": (">", 25)}) -# Numeric comparison: 30.0 > 25 is True (correct!) -``` - -**Usage examples**: -```yaml -repositories: - BrentLab/example: - dataset: - example_dataset: - # String field for categorical data - strain_background: - path: strain_background - dtype: string - - # Numeric field for quantitative filtering - temperature_celsius: - path: temperature_celsius - dtype: numeric - - # Numeric field for concentration measurements - drug_concentration_um: - path: drug_treatment.concentration_um - dtype: numeric - - # Boolean field - is_heat_shock: - path: is_heat_shock - dtype: bool -``` - -## VirtualDB Structure - -VirtualDB maintains a collection of dataset-specific metadata tables, one per -configured dataset. Each table has the same structure (standardized schema) but -contains data specific to that dataset. - -Unless directed, these tables are not stored on desk and instead generated via -query against the source parquet files. Think of them as a typical database view. 
- -### Internal Structure - -```python -{ - # Primary datasets with sample_id - ("BrentLab/harbison_2004", "harbison_2004"): DataFrame( - # Columns: sample_id, carbon_source, temperature_celsius, nitrogen_source, ... - # Values: Normalized according to factor_aliases - # Example rows: - # sample_id carbon_source temperature_celsius nitrogen_source - # harbison_001 glucose 30 yeast nitrogen base - # harbison_002 galactose 30 yeast nitrogen base - ), - - ("BrentLab/kemmeren_2014", "kemmeren_2014"): DataFrame( - # Columns: sample_id, carbon_source, temperature_celsius, ... - # Note: Different physical source paths, same logical schema - # Example rows: - # sample_id carbon_source temperature_celsius - # kemmeren_001 glucose 30 - # kemmeren_002 raffinose 30 - ), - - # Comparative datasets with parsed composite identifiers - ("BrentLab/yeast_comparative_analysis", "dto"): DataFrame( - # Original composite ID columns preserved - # Columns: binding_id, perturbation_id, dto_fdr, dto_empirical_pvalue, ... - # Example rows: - # binding_id perturbation_id dto_fdr - # BrentLab/harbison_2004;harbison_2004;harbison_001 BrentLab/kemmeren_2014;kemmeren_2014;sample_42 0.001 - # BrentLab/harbison_2004;harbison_2004;harbison_002 BrentLab/kemmeren_2014;kemmeren_2014;sample_43 0.045 - # - # When materialized with foreign keys, additional parsed columns are created: - # Columns: binding_id, binding_repo_id, binding_config_name, binding_sample_id, - # perturbation_id, perturbation_repo_id, perturbation_config_name, perturbation_sample_id, - # dto_fdr, dto_empirical_pvalue, ... - # Example rows: - # binding_repo_id binding_config_name binding_sample_id dto_fdr - # BrentLab/harbison_2004 harbison_2004 harbison_001 0.001 - # BrentLab/harbison_2004 harbison_2004 harbison_002 0.045 - ) -} -``` - -### View Materialization - -Tables can be cached for faster subsequent queries via materialization: - -```python -# Cache all views for faster subsequent queries -vdb.materialize_views() - -# Cache specific datasets -vdb.materialize([("BrentLab/harbison_2004", "harbison_2004")]) - -# Invalidate cache (e.g., after data updates) -vdb.invalidate_cache() -vdb.invalidate_cache([("BrentLab/harbison_2004", "harbison_2004")]) -``` - -Materialized views are stored locally and reused for queries. - -## VirtualDB Interface - -### Schema Discovery - -**List all queryable fields**: -```python -from tfbpapi.virtual_db import VirtualDB - -vdb = VirtualDB("config.yaml") - -# All fields defined in any dataset -fields = vdb.get_fields() -# ["carbon_source", "temperature_celsius", "nitrogen_source", "phosphate_source", ...] - -# Fields present in ALL datasets (common fields) -common = vdb.get_common_fields() -# ["carbon_source", "temperature_celsius"] - -# Fields for specific dataset -dataset_fields = vdb.get_fields("BrentLab/harbison_2004", "harbison_2004") -# ["carbon_source", "temperature_celsius", "nitrogen_source", "phosphate_source"] -``` - -**Discover valid values for fields**: -```python -# Unique values across all datasets (normalized) -values = vdb.get_unique_values("carbon_source") -# ["glucose", "galactose", "raffinose", "unspecified"] - -# Values broken down by dataset -values_by_dataset = vdb.get_unique_values("carbon_source", by_dataset=True) -# { -# "BrentLab/harbison_2004": ["glucose", "galactose"], -# "BrentLab/kemmeren_2014": ["glucose", "raffinose"] -# } -``` - -### Querying Data - -The `query()` method is the primary interface for retrieving data from VirtualDB. 
- -**Basic usage** (sample-level, all fields): -```python -# Query across all configured datasets -# Returns one row per sample with all configured fields -df = vdb.query(filters={"carbon_source": "glucose"}) -# DataFrame: sample_id, carbon_source, temperature_celsius, nitrogen_source, ... -``` - -**Query specific datasets**: -```python -# Limit query to specific datasets -df = vdb.query( - filters={"carbon_source": "glucose", "temperature_celsius": 30}, - datasets=[("BrentLab/harbison_2004", "harbison_2004")] -) -``` - -**Select specific fields**: -```python -# Return only specified fields -df = vdb.query( - filters={"carbon_source": "glucose"}, - fields=["sample_id", "carbon_source", "temperature_celsius"] -) -# DataFrame: sample_id, carbon_source, temperature_celsius -``` - -**Complete data** (measurement-level): -```python -# Set complete=True to get all measurements, not just sample-level -# Returns many rows per sample (one per target/feature/coordinate) -df = vdb.query( - filters={"carbon_source": "glucose"}, - complete=True -) -# DataFrame: sample_id, target, value, carbon_source, temperature_celsius, ... -# For annotated_features: target-level data for all matching samples -# For genome_map: coordinate-level data for all matching samples - -# Can combine with field selection -df = vdb.query( - filters={"carbon_source": "glucose"}, - fields=["sample_id", "target", "effect"], - complete=True -) -# DataFrame: sample_id, target, effect -``` - -### Factor Alias Expansion - -When querying with aliased values, VirtualDB automatically expands to all -original values specified in the configuration: - -```python -# User queries for normalized value -df = vdb.query(filters={"carbon_source": "galactose"}) - -# Internally expands to all aliases -# WHERE carbon_source IN ('D-galactose', 'gal', 'galactose') -``` - -### Numeric Field Filtering - -Numeric fields support exact matching and range queries: - -```python -# Exact match -df = vdb.query(filters={"temperature_celsius": 30}) - -# Range queries -df = vdb.query(filters={"temperature_celsius": (">=", 28)}) -# inclusive of the boundaries, ie [28, 32] -df = vdb.query(filters={"temperature_celsius": ("between", 28, 32)}) - -# Missing value labels. This analogous to how factor_aliases work. In this case, it -# will return where the temprature_celsius is missing/None/Null/NaN/etc and/or the -# value matches the specified label, in this case "room". If the missing value label -# is a character value and the field is a numeric field, then only missing values will -# be matched. -df = vdb.query(filters={"temperature_celsius": "room"}) -# Matches samples where temperature is None/missing -``` - -## Comparative Datasets in VirtualDB - -Comparative datasets differ from other dataset types in that they represent -relationships between samples across datasets rather than individual samples. -Each row relates 2+ samples from other datasets. - -### Structure - -Comparative datasets use `source_sample` fields instead of a single `sample_id`: -- Multiple fields with `role: source_sample` -- Each contains composite identifier: `"repo_id;config_name;sample_id"` -- Example: `binding_id = "BrentLab/harbison_2004;harbison_2004;42"` - -### Querying Comparative Data - -Comparative datasets can be queried in two ways: **direct queries** for analysis -results, and **field-based queries** to enrich primary dataset queries with -comparative metrics. 
- -#### Direct Queries - -Query the comparative dataset directly to find analysis results: - -```python -# Find significant DTO results across all experiments -dto_results = vdb.query( - datasets=[("BrentLab/yeast_comparative_analysis", "dto")], - filters={"dto_fdr": ("<", 0.05)}, - complete=True -) -# Returns: binding_id, perturbation_id, dto_fdr, dto_empirical_pvalue, -# binding_rank_threshold, perturbation_rank_threshold, ... - -# Filter by source dataset -dto_for_harbison = vdb.query( - datasets=[("BrentLab/yeast_comparative_analysis", "dto")], - filters={"binding_id": ("contains", "harbison_2004")}, - complete=True -) - -# Combine filters on both metrics and source samples -high_quality_dto = vdb.query( - datasets=[("BrentLab/yeast_comparative_analysis", "dto")], - filters={ - "dto_fdr": ("<", 0.01), - "binding_id": ("contains", "callingcards") - }, - complete=True -) -``` - -#### Field-based Queries - -```python -# Query binding data, automatically include DTO metrics -binding_with_dto = vdb.query( - datasets=[("BrentLab/callingcards", "annotated_features")], - filters={"regulator_locus_tag": "YJR060W"}, - fields=["sample_id", "target_locus_tag", "binding_score", "dto_fdr"], - complete=True -) -# Returns binding data WITH dto_fdr joined automatically via composite ID - -# Query perturbation data, include derived significance field -perturbation_with_significance = vdb.query( - datasets=[("BrentLab/hackett_2020", "hackett_2020")], - filters={"regulator_locus_tag": "YJR060W"}, - fields=["sample_id", "target_locus_tag", "log2fc", "is_significant"], - complete=True -) -# Returns perturbation data WITH is_significant (computed from dto_fdr < 0.05) -``` - -### Configuration - -Comparative datasets work differently - -**primary datasets declare which comparative datasets reference them**: - -```yaml -repositories: - # Primary dataset (e.g., binding data) - BrentLab/callingcards: - dataset: - annotated_features: - # REQUIRED: Specify which field is the sample identifier - sample_id: - field: sample_id - - # OPTIONAL: Declare comparative analyses that include this dataset - comparative_analyses: - - repo: BrentLab/yeast_comparative_analysis - dataset: dto - via_field: binding_id - # VirtualDB knows composite format: "BrentLab/callingcards;annotated_features;" - - # Regular fields - regulator_locus_tag: - field: regulator_locus_tag - # ... other fields - - # Another primary dataset (e.g., perturbation data) - BrentLab/hu_2007_reimand_2010: - dataset: - data: - sample_id: - field: sample_id - - comparative_analyses: - - repo: BrentLab/yeast_comparative_analysis - dataset: dto - via_field: perturbation_id - - # Regular fields - # ... other fields - - # Comparative dataset - OPTIONAL field mappings for renaming/aliasing - BrentLab/yeast_comparative_analysis: - dataset: - dto: - # Optional: Rename fields for clarity or add derived columns - fdr: - field: dto_fdr # Rename dto_fdr to fdr - - empirical_pvalue: - field: dto_empirical_pvalue # Rename for consistency - - is_significant: - # Derived field: computed from dto_fdr - expression: "dto_fdr < 0.05" -``` - -## See Also -- [DataCard Documentation](huggingface_datacard.md) diff --git a/docs/virtual_db.md b/docs/virtual_db.md index 8fe590e..e3b40ac 100644 --- a/docs/virtual_db.md +++ b/docs/virtual_db.md @@ -1,11 +1,22 @@ # VirtualDB +VirtualDB provides a unified query interface across heterogeneous datasets with +different experimental condition structures and terminologies. 
Each dataset +defines experimental conditions in its own way, with properties stored at +different hierarchy levels (repository, dataset, or field) and using different +naming conventions. VirtualDB uses an external YAML configuration to map these +varying structures to a common schema, normalize factor level names (e.g., +"D-glucose", "dextrose", "glu" all become "glucose"), and enable cross-dataset +queries with standardized field names and values. + +## API Reference + ::: tfbpapi.virtual_db.VirtualDB options: show_root_heading: true show_source: true -## Helper Functions +### Helper Functions ::: tfbpapi.virtual_db.get_nested_value options: @@ -14,8 +25,3 @@ ::: tfbpapi.virtual_db.normalize_value options: show_root_heading: true - -## Usage - -For comprehensive usage documentation including comparative datasets, see -[Virtual Database Concepts](virtual_database_concepts.md). diff --git a/docs/virtual_db_configuration.md b/docs/virtual_db_configuration.md new file mode 100644 index 0000000..45320d4 --- /dev/null +++ b/docs/virtual_db_configuration.md @@ -0,0 +1,257 @@ +# VirtualDB Configuration Guide + +VirtualDB requires a YAML configuration file that defines which datasets to +include, how to map their fields to common names, and how to normalize factor +levels. + +## Basic Example + +```yaml +repositories: + # Each repository defines a "table" in the virtual database + BrentLab/harbison_2004: + # REQUIRED: Specify which field is the sample identifier. At this level, it means + # that all datasets have a field `sample_id` that uniquely identifies samples. + sample_id: + field: sample_id + # Repository-wide properties (apply to all datasets in this repository) + # Paths are explicit from the datacard root + nitrogen_source: + path: experimental_conditions.media.nitrogen_source.name + + dataset: + # Each dataset gets its own view with standardized fields + harbison_2004: + # note: this is optional. If not specified, then the config_name is used. + # This is useful if the config_name isn't suited to a table name, or if it + # were to conflict with another dataset in the configuration + db_name: harbison + # Dataset-specific properties (constant for all samples) + # Explicit path from datacard/config root + phosphate_source: + path: experimental_conditions.media.phosphate_source.compound + + # Field-level properties (vary per sample) + # Path is relative to field's definitions dict + carbon_source: + field: condition + path: media.carbon_source.compound + dtype: string # Optional: specify data type + + # Field without path (column alias with normalization) + environmental_condition: + field: condition + + BrentLab/kemmeren_2014: + dataset: + kemmeren_2014: + # optional -- see the note for `db_name` in harbison above + db_name: kemmeren + # REQUIRED: If `sample_id` isn't defined at the repo level, then it must be + # defined at the dataset level for each dataset in the repo + sample_id: + field: sample_id + # Same logical fields, different physical paths + # Explicit path from datacard/config root + carbon_source: + path: experimental_conditions.media.carbon_source.compound + dtype: string + temperature_celsius: + path: experimental_conditions.temperature_celsius + dtype: numeric # Enables numeric filtering with comparison operators + + # Comparative dataset example + BrentLab/yeast_comparative_analysis: + dataset: + dto: + # Use field mappings to change a field's displayed name. 
If not specifically
+      # listed, then the field is included as it exists in the source data
+      dto_fdr:
+        field: dto_fdr
+      dto_empirical_pvalue:
+        field: empirical_pvalue
+
+      # links specify which primary datasets are referenced by composite ID fields
+      links:
+        binding_id:
+          - [BrentLab/harbison_2004, harbison_2004]
+        perturbation_id:
+          - [BrentLab/kemmeren_2014, kemmeren_2014]
+
+# ===== Normalization Rules =====
+# Map varying terminologies to standardized values
+factor_aliases:
+  carbon_source:
+    glucose: [D-glucose, glu, dextrose]
+    galactose: [D-galactose, gal]
+
+# Handle missing values with defaults
+missing_value_labels:
+  carbon_source: "unspecified"
+
+# ===== Documentation =====
+description:
+  carbon_source: The carbon source provided to the cells during growth
+```
+
+### Property Hierarchy
+
+Properties are extracted at three hierarchy levels:
+
+1. **Repository-wide**: Common to all datasets in a repository
+   - Paths relative to datacard/config root (explicit)
+   - Example: `path: experimental_conditions.media.nitrogen_source.name`
+
+2. **Dataset-specific**: Specific to one dataset configuration
+   - Paths relative to datacard/config root (explicit)
+   - Example: `path: experimental_conditions.media.phosphate_source.compound`
+
+3. **Field-level**: Vary per sample, defined in field definitions
+   - `field` specifies which field to extract from
+   - `path` relative to that field's definitions dict
+   - Example: `field: condition, path: media.carbon_source.compound`
+
+**Special case**: A field without a path creates a column alias
+- `field: condition` (no path) renames the `condition` column and enables normalization
+
+### Path Resolution
+
+Paths use dot notation to navigate nested structures:
+
+**Repository/Dataset-level** (explicit paths from datacard root):
+- `path: experimental_conditions.temperature_celsius` - access experimental conditions
+- `path: experimental_conditions.media.carbon_source.compound` - nested condition data
+- `path: description` - access fields outside experimental_conditions
+
+**Field-level** (paths relative to field definitions):
+- `field: condition, path: media.carbon_source.compound` looks in field
+  `condition`'s definitions and navigates to `media.carbon_source.compound`
+
+### Data Type Specifications
+
+Field mappings support an optional `dtype` parameter to ensure proper type handling
+during metadata extraction and query filtering.
+
+**Supported dtypes**:
+- `string` - Text data (default if not specified)
+- `numeric` - Numeric values (integers or floating-point numbers)
+- `bool` - Boolean values (true/false)
+
+**When to use dtype**:
+
+1. **Numeric filtering**: Required for fields used with comparison operators
+   (`<`, `>`, `<=`, `>=`, `between`); see the sketch below this list
+2. **Type consistency**: When source data might be extracted with incorrect type
+3. **Performance**: Helps with query optimization and prevents type mismatches
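+
+As a minimal sketch of why this matters, assuming the "Basic Example"
+configuration above is saved as `config.yaml` (the `kemmeren_meta` table name
+follows the `<db_name>_meta` view-naming convention described under Internal
+Structure below):
+
+```python
+from tfbpapi.virtual_db import VirtualDB
+
+vdb = VirtualDB("config.yaml")
+
+# temperature_celsius is declared dtype: numeric, so the column in the
+# metadata view is numeric and the comparison below behaves as expected;
+# without the dtype declaration the value may be extracted as text, and
+# numeric comparisons become unreliable.
+# carbon_source = 'glucose' also matches samples whose raw labels were
+# D-glucose, glu, or dextrose, because the view stores normalized values.
+warm_glucose = vdb.query(
+    "SELECT sample_id, carbon_source, temperature_celsius "
+    "FROM kemmeren_meta "
+    "WHERE carbon_source = 'glucose' AND temperature_celsius > 25"
+)
+print(warm_glucose.head())
+```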
But they may be re-named +(aliased) by specifically mapping them in the configuration. + +```yaml +dto: + # this would make the displayed field name 'dto_pvalue' + instead of 'empirical_pvalue' + dto_pvalue: + field: empirical_pvalue +``` + +### Link Structure + +the `links` section specifies how the composite IDs map to primary datasets. The first +sub-element under `links` is the name of the field in the comparative dataset that +contains the composite IDs. The value is a list of `[repo_id, config_name]` +pairs indicating which primary datasets are referenced by that field. Those primary +datasets must also be defined in the overall VirtualDB configuration. + +```yaml +# Within the comparative dataset config +dto: + links: + binding_id: + - [BrentLab/harbison_2004, harbison_2004] # [repo_id, config_name] + - [BrentLab/callingcards, annotated_features] + perturbation_id: + - [BrentLab/kemmeren_2014, kemmeren_2014] +``` + +See the [huggingface datacard documentation](huggingface_datacard.md#5-comparative) +for more detailed explanation of comparative datasets and composite IDs. + +## Internal Structure + +VirtualDB uses an in-memory DuckDB database to construct a layered hierarchy +of SQL views over locally cached Parquet files. Views are created lazily on +first query and are not persisted to disk. + +### View Hierarchy + +For each configured dataset, VirtualDB registers a series of views that +build on each other. Using `harbison` as an example primary dataset and +`dto` as a comparative dataset: + +**1. Metadata view** + +One row per unique `sample_id`. Derived columns from the configuration +(e.g., `carbon_source`, `temperature_celsius`) are resolved here using +datacard definitions, factor aliases, and missing value labels. This is +the primary view for querying sample-level metadata. + +**2. Raw data view** + +The full parquet data joined to the metadata view so that every row +carries both the raw measurement columns and the derived metadata +columns. **Developer note**: There is an internal view called ___parquet that +is just the raw parquet data without any metadata joins or derived columns. +This is used as the base for joining to the metadata view, but is not exposed directly +to users. + +**3. Expanded view (comparative only)** -- `dto_expanded` + +For comparative datasets, each composite ID field (e.g. `binding_id` +with format `"repo_id;config_name;sample_id"`) is parsed into two +additional columns: + +- `_source` -- the `repo_id;config_name` prefix, aliased + to the configured `db_name` when the pair is in the VirtualDB config. + For example, `BrentLab/harbison_2004;harbison_2004` becomes `harbison`. +- `_id` -- the sample_id component. + +This makes it straightforward to join back to primary dataset views +or filter by source dataset without parsing composite IDs in SQL. + +### View Diagram + +``` +__harbison_parquet (raw parquet, not directly exposed) + | + +-> harbison_meta (deduplicated, one row per sample_id, + | with derived columns from config) + | + +-> harbison (full parquet joined to harbison_meta) + +__dto_parquet (raw parquet, not directly exposed) + | + +-> dto_expanded (parquet + parsed columns: + binding_id_source, binding_id_id, + perturbation_id_source, perturbation_id_id) +``` + +## Usage + +For usage examples and tutorials, +see the [VirtualDB Tutorial](tutorials/virtual_db_tutorial.ipynb). 
\ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 0635060..42d3bf6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -154,8 +154,6 @@ nav: - "Cache Management": tutorials/cache_manager_tutorial.ipynb - "Querying Data": - "VirtualDB: Unified Cross-Dataset Queries": tutorials/virtual_db_tutorial.ipynb - - Concepts: - - "Virtual Database Design": virtual_database_concepts.md - API Reference: - Core: - VirtualDB: virtual_db.md @@ -169,3 +167,4 @@ nav: - HuggingFace Configuration: - HuggingFace Dataset Card Format: huggingface_datacard.md - BrentLab Collection: brentlab_yeastresources_collection.md + - VirtualDB Configuration: virtual_db_configuration.md diff --git a/tfbpapi/datacard.py b/tfbpapi/datacard.py index b8798fc..abf94dd 100644 --- a/tfbpapi/datacard.py +++ b/tfbpapi/datacard.py @@ -264,7 +264,7 @@ def get_repository_info(self) -> dict[str, Any]: "dataset_types": [config.dataset_type.value for config in card.configs], "total_files": total_files, "last_modified": last_modified, - "has_default_config": self.dataset_card.get_default_config() is not None, + "has_default_config": self.dataset_card.default_config is not None, } def extract_metadata_schema(self, config_name: str) -> dict[str, Any]: diff --git a/tfbpapi/models.py b/tfbpapi/models.py index bb86f2e..a8660a1 100644 --- a/tfbpapi/models.py +++ b/tfbpapi/models.py @@ -10,11 +10,23 @@ """ from enum import Enum +from functools import cached_property from pathlib import Path -from typing import Any +from typing import Any, TypeAlias import yaml # type: ignore[import-untyped] -from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + computed_field, + field_serializer, + field_validator, + model_validator, +) + +# Type aliases for improved readability +FactorAliases: TypeAlias = dict[str, dict[str, list[str | int | float | bool]]] class DatasetType(str, Enum): @@ -105,10 +117,20 @@ class DatasetConfig(BaseModel): model_config = ConfigDict(extra="allow") - @field_validator("applies_to") + @field_validator("applies_to", mode="after") @classmethod - def applies_to_only_for_metadata(cls, v, info): - """Validate that applies_to is only used for metadata or comparative configs.""" + def applies_to_only_for_metadata( + cls, v: list[str] | None, info + ) -> list[str] | None: + """ + Validate that applies_to is only used for metadata or comparative configs. + + :param v: The applies_to field value + :param info: Validation info containing other field values + :return: The validated applies_to value + :raises ValueError: If applies_to is used with invalid dataset type + + """ if v is not None: dataset_type = info.data.get("dataset_type") if dataset_type not in (DatasetType.METADATA, DatasetType.COMPARATIVE): @@ -118,10 +140,17 @@ def applies_to_only_for_metadata(cls, v, info): ) return v - @field_validator("metadata_fields") + @field_validator("metadata_fields", mode="after") @classmethod - def metadata_fields_validation(cls, v): - """Validate metadata_fields usage.""" + def metadata_fields_not_empty(cls, v: list[str] | None) -> list[str] | None: + """ + Validate metadata_fields is not an empty list. 
+ + :param v: The metadata_fields value + :return: The validated metadata_fields value + :raises ValueError: If metadata_fields is an empty list + + """ if v is not None and len(v) == 0: raise ValueError("metadata_fields cannot be empty list, use None instead") return v @@ -140,52 +169,95 @@ class DatasetCard(BaseModel): model_config = ConfigDict(extra="allow") - @field_validator("configs") + @field_validator("configs", mode="after") @classmethod - def configs_not_empty(cls, v): - """Ensure at least one config is present.""" + def validate_configs(cls, v: list[DatasetConfig]) -> list[DatasetConfig]: + """ + Validate configs list. + + Ensures at least one config exists, all config names are unique, and at most one + config is marked as default. + + :param v: The list of DatasetConfig objects + :return: The validated list of configs + :raises ValueError: If validation fails + + """ + # Check non-empty if not v: raise ValueError("At least one dataset configuration is required") - return v - @field_validator("configs") - @classmethod - def unique_config_names(cls, v): - """Ensure config names are unique.""" + # Check unique names names = [config.config_name for config in v] if len(names) != len(set(names)): raise ValueError("Configuration names must be unique") - return v - @field_validator("configs") - @classmethod - def at_most_one_default(cls, v): - """Ensure at most one config is marked as default.""" - defaults = [config for config in v if config.default] - if len(defaults) > 1: + # Check at most one default + defaults = sum(1 for config in v if config.default) + if defaults > 1: raise ValueError("At most one configuration can be marked as default") + return v + # Computed properties for better discoverability + @computed_field # type: ignore[prop-decorator] + @cached_property + def default_config(self) -> DatasetConfig | None: + """ + Get the default configuration if one exists. + + :return: The default DatasetConfig or None if no default is set + + """ + for config in self.configs: + if config.default: + return config + return None + + @computed_field # type: ignore[prop-decorator] + @cached_property + def config_names(self) -> list[str]: + """ + Get all configuration names. + + :return: List of all config_name values + + """ + return [config.config_name for config in self.configs] + + # Utility methods (not serialized) def get_config_by_name(self, name: str) -> DatasetConfig | None: - """Get a configuration by name.""" + """ + Get a configuration by name. + + :param name: The configuration name to search for + :return: The matching DatasetConfig or None if not found + + """ for config in self.configs: if config.config_name == name: return config return None def get_configs_by_type(self, dataset_type: DatasetType) -> list[DatasetConfig]: - """Get all configurations of a specific type.""" + """ + Get all configurations of a specific type. + + :param dataset_type: The DatasetType to filter by + :return: List of matching DatasetConfig objects + + """ return [ config for config in self.configs if config.dataset_type == dataset_type ] - def get_default_config(self) -> DatasetConfig | None: - """Get the default configuration if one exists.""" - defaults = [config for config in self.configs if config.default] - return defaults[0] if defaults else None - def get_data_configs(self) -> list[DatasetConfig]: - """Get all non-metadata configurations.""" + """ + Get all non-metadata configurations. 
+ + :return: List of DatasetConfig objects excluding metadata types + + """ return [ config for config in self.configs @@ -193,7 +265,12 @@ def get_data_configs(self) -> list[DatasetConfig]: ] def get_metadata_configs(self) -> list[DatasetConfig]: - """Get all metadata configurations.""" + """ + Get all metadata configurations. + + :return: List of DatasetConfig objects with metadata type + + """ return [ config for config in self.configs @@ -211,10 +288,16 @@ class ExtractedMetadata(BaseModel): values: set[str] = Field(..., description="Unique values found") extraction_method: str = Field(..., description="How the metadata was extracted") - model_config = ConfigDict( - # Allow sets in JSON serialization - json_encoders={set: list} - ) + @field_serializer("values", mode="plain") + def serialize_values(self, value: set[str]) -> list[str]: + """ + Serialize set as sorted list for JSON compatibility. + + :param value: Set of string values + :return: Sorted list of strings + + """ + return sorted(value) class MetadataRelationship(BaseModel): @@ -232,73 +315,42 @@ class MetadataRelationship(BaseModel): # ============================================================================ -class ComparativeAnalysis(BaseModel): +class PropertyMapping(BaseModel): """ - Reference to a comparative dataset that includes this dataset. - - Comparative datasets relate samples across multiple source datasets. - This model specifies which comparative dataset references the current - dataset and through which field (via_field). - - Attributes: - repo: HuggingFace repository ID of the comparative dataset - dataset: Config name of the comparative dataset - via_field: Field in the comparative dataset containing composite - identifiers that reference this dataset's samples. - Format: "repo_id;config_name;sample_id" - - Example: - ```python - # In BrentLab/callingcards config - ComparativeAnalysis( - repo="BrentLab/yeast_comparative_analysis", - dataset="dto", - via_field="binding_id" - ) - # Means: dto dataset has a binding_id field with values like: - # "BrentLab/callingcards;annotated_features;123" - ``` + Mapping specification for a single property. - """ + :ivar field: Optional field name for field-level properties. + When specified, looks in this field's definitions. + When omitted, uses repo/config-level resolution. + :ivar path: Optional dot-notation path to the property value. + For repo/config-level: relative to datacard/config root + (e.g., "experimental_conditions.media.carbon_source" or "description") + For field-level: relative to the field's definitions dict + (e.g., "temperature_celsius" resolves within each sample's definition) + When omitted with field specified, creates a column alias. + :ivar expression: Optional SQL expression for derived/computed fields. + When specified, creates a computed column. + Cannot be used with field or path. + :ivar dtype: Optional data type specification for type conversion. + Supported values: 'string', 'numeric', 'bool'. + When specified, extracted values are converted to this type. - repo: str = Field(..., description="Comparative dataset repository ID") - dataset: str = Field(..., description="Comparative dataset config name") - via_field: str = Field( - ..., description="Field containing composite sample identifiers" - ) + Examples:: + # Repo/config-level property (explicit path from datacard root) + PropertyMapping(path="experimental_conditions.media.carbon_source.compound") -class PropertyMapping(BaseModel): - """ - Mapping specification for a single property. 
+ # Repo/config-level property outside experimental_conditions + PropertyMapping(path="description") + + # Field-level property with path (relative to field definitions) + PropertyMapping(field="condition", path="temperature_celsius") - Attributes: - path: Optional dot-notation path to the property value. - For repo/config-level: relative to experimental_conditions - For field-level: relative to field definitions - When omitted with field specified, creates a column alias. - field: Optional field name for field-level properties. - When specified, looks in this field's definitions. - When omitted, looks in repo/config-level experimental_conditions. - expression: Optional SQL expression for derived/computed fields. - When specified, creates a computed column. - Cannot be used with field or path. - dtype: Optional data type specification for type conversion. - Supported values: 'string', 'numeric', 'bool'. - When specified, extracted values are converted to this type. - - Examples: - Field-level property with path: - PropertyMapping(field="condition", path="media.carbon_source") - - Repo/config-level property: - PropertyMapping(path="temperature_celsius") - - Field-level column alias (no path): - PropertyMapping(field="condition") - - Derived field with expression: - PropertyMapping(expression="dto_fdr < 0.05") + # Field-level column alias (no path) + PropertyMapping(field="condition") + + # Derived field with expression + PropertyMapping(expression="dto_fdr < 0.05") """ @@ -311,33 +363,33 @@ class PropertyMapping(BaseModel): None, description="Data type for conversion: 'string', 'numeric', or 'bool'" ) - @field_validator("path") + @field_validator("path", "field", "expression", mode="before") @classmethod - def validate_path(cls, v: str | None) -> str | None: - """Ensure path is not just whitespace if provided.""" - if v is not None and not v.strip(): - raise ValueError("path cannot be empty or whitespace") - return v.strip() if v else None + def strip_whitespace(cls, v: str | None) -> str | None: + """ + Strip whitespace and validate non-empty strings. - @field_validator("field") - @classmethod - def validate_field(cls, v: str | None) -> str | None: - """Ensure field is not empty string if provided.""" - if v is not None and not v.strip(): - raise ValueError("field cannot be empty or whitespace") - return v.strip() if v else None + :param v: String value to validate + :return: Stripped string or None + :raises ValueError: If string is empty or only whitespace - @field_validator("expression") - @classmethod - def validate_expression(cls, v: str | None) -> str | None: - """Ensure expression is not empty string if provided.""" - if v is not None and not v.strip(): - raise ValueError("expression cannot be empty or whitespace") - return v.strip() if v else None + """ + if v is None: + return None + v = v.strip() + if not v: + raise ValueError("Value cannot be empty or whitespace") + return v @model_validator(mode="after") - def validate_at_least_one_specified(self) -> "PropertyMapping": - """Ensure at least one field type is specified and mutually exclusive.""" + def validate_field_types(self) -> "PropertyMapping": + """ + Ensure at least one field type is specified and mutually exclusive. 
+ + :return: The validated PropertyMapping instance + :raises ValueError: If validation constraints are violated + + """ if self.expression is not None: if self.field is not None or self.path is not None: raise ValueError( @@ -355,104 +407,181 @@ class DatasetVirtualDBConfig(BaseModel): """ VirtualDB configuration for a specific dataset within a repository. - Attributes: - sample_id: Mapping for the sample identifier field (required for - primary datasets) - comparative_analyses: Optional list of comparative datasets that - reference this dataset - properties: Property mappings for this specific dataset (field names to - PropertyMapping) - - Example: - ```yaml - # In BrentLab/callingcards config + Additional property mappings can be provided as extra fields and will be + automatically parsed as PropertyMapping objects. + + :ivar sample_id: Mapping for the sample identifier field (required for + primary datasets) + :ivar links: For comparative datasets, map link_field -> list of + [repo_id, config_name] pairs specifying which primary datasets + are linked through each link field. + + Example - Primary dataset:: + annotated_features: sample_id: field: sample_id - comparative_analyses: - - repo: BrentLab/yeast_comparative_analysis - dataset: dto - via_field: binding_id regulator_locus_tag: field: regulator_locus_tag - dto_fdr: # Field from comparative dataset, optional renaming + + Example - Comparative dataset:: + + dto: + # Field mappings - use this to rename fields + dto_fdr: field: dto_fdr - ``` + dto_pvalue: + field: empirical_pvalue # renames empirical_pvalue to dto_pvalue + # Links to primary datasets + links: + binding_id: + - [BrentLab/harbison_2004, harbison_2004] + - [BrentLab/callingcards, annotated_features] + perturbation_id: + - [BrentLab/kemmeren_2014, kemmeren_2014] """ sample_id: PropertyMapping | None = Field( None, description="Mapping for sample identifier field" ) - comparative_analyses: list[ComparativeAnalysis] = Field( - default_factory=list, - description="Comparative datasets referencing this dataset", + db_name: str | None = Field( + None, + description=( + "Short name for this dataset in the SQL interface. " + "Falls back to the config_name (YAML dict key) if not " + "specified. Must be a valid SQL identifier." + ), + ) + links: dict[str, list[list[str]]] = Field( + default_factory=dict, + description="For comparative datasets: map link_field -> " + "[repo_id, config_name] pairs", ) - # Allow additional property mappings via extra fields + model_config = ConfigDict(extra="allow") + @field_validator("links", mode="after") + @classmethod + def validate_links( + cls, v: dict[str, list[list[str]]] + ) -> dict[str, list[list[str]]]: + """ + Validate that each link is a [repo_id, config_name] pair. + + :param v: Links dictionary + :return: Validated links + :raises ValueError: If any link is not a valid pair + + """ + for link_field, datasets in v.items(): + for i, dataset_pair in enumerate(datasets): + if not isinstance(dataset_pair, list) or len(dataset_pair) != 2: + raise ValueError( + f"Link {i} for link_field '{link_field}' must be " + f"[repo_id, config_name], got: {dataset_pair}" + ) + return v + + @field_validator("db_name", mode="after") + @classmethod + def validate_db_name(cls, v: str | None) -> str | None: + """ + Validate db_name is a valid SQL identifier and not reserved. 
+ + :param v: db_name value + :return: Validated db_name + :raises ValueError: If db_name is invalid + + """ + if v is None: + return None + import re + + if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", v): + raise ValueError( + f"db_name '{v}' is not a valid SQL identifier. " + "Use only letters, digits, and underscores, " + "starting with a letter or underscore." + ) + reserved = {"samples"} + if v.lower() in reserved: + raise ValueError(f"db_name '{v}' is reserved for internal use.") + return v + @model_validator(mode="before") @classmethod - def parse_property_mappings(cls, data: Any) -> Any: - """Parse extra fields as PropertyMapping objects.""" + def parse_property_mappings(cls, data: Any) -> dict[str, Any]: + """ + Parse extra fields as PropertyMapping objects. + + :param data: Raw input data + :return: Processed data with PropertyMapping objects + :raises ValueError: If PropertyMapping validation fails + + """ if not isinstance(data, dict): return data - # Process all fields except sample_id and comparative_analyses result = {} for key, value in data.items(): - if key in ("sample_id", "comparative_analyses"): - # These are typed fields, let Pydantic handle them + # Known typed fields - let Pydantic handle them + if key in ("sample_id", "links", "db_name"): result[key] = value + # Dict values should be PropertyMappings elif isinstance(value, dict): - # Assume it's a PropertyMapping try: result[key] = PropertyMapping.model_validate(value) except Exception as e: raise ValueError( f"Invalid PropertyMapping for field '{key}': {e}" ) from e + # Already parsed PropertyMapping or other type else: - # Already parsed or wrong type result[key] = value return result + @property + def property_mappings(self) -> dict[str, PropertyMapping]: + """ + Get all property mappings from extra fields. + + :return: Dictionary of property names to PropertyMapping objects + + """ + if not self.model_extra: + return {} + + return { + key: value + for key, value in self.model_extra.items() + if isinstance(value, PropertyMapping) + } + class RepositoryConfig(BaseModel): """ - Configuration for a single repository. Eg BrentLab/harbison_2004. - - Attributes: - properties: Repo-wide property mappings that apply to all datasets - dataset: Dataset-specific configurations including sample_id, - comparative_analyses, and property mappings - - Example: - ```python - config = RepositoryConfig( - properties={ - "temperature_celsius": PropertyMapping(path="temperature_celsius") - }, - dataset={ - "dataset_name": DatasetVirtualDBConfig( - sample_id=PropertyMapping(field="sample_id"), - comparative_analyses=[ - ComparativeAnalysis( - repo="BrentLab/yeast_comparative_analysis", - dataset="dto", - via_field="binding_id" - ) - ], - # Additional property mappings via extra fields - **{"carbon_source": PropertyMapping( - field="condition", - path="media.carbon_source" - )} - ) - } - ) - ``` + Configuration for a single repository. 
+ + For example: BrentLab/harbison_2004 + + :ivar properties: Repo-wide property mappings that apply to all datasets + :ivar dataset: Dataset-specific configurations including sample_id, + comparative_analyses, and property mappings + + Example:: + + BrentLab/harbison_2004: + temperature_celsius: + path: temperature_celsius + dataset: + harbison_2004: + sample_id: + field: sample_id + carbon_source: + field: condition + path: media.carbon_source """ @@ -465,14 +594,21 @@ class RepositoryConfig(BaseModel): @model_validator(mode="before") @classmethod - def parse_structure(cls, data: Any) -> Any: - """Parse raw dict structure into typed objects.""" + def parse_structure(cls, data: Any) -> dict[str, Any]: + """ + Parse raw dict structure into typed objects. + + :param data: Raw input data + :return: Processed data with typed objects + :raises ValueError: If validation fails + + """ if not isinstance(data, dict): return data - # Extract and parse dataset section - dataset_section = data.get("dataset") + # Parse dataset section parsed_datasets: dict[str, DatasetVirtualDBConfig] | None = None + dataset_section = data.get("dataset") if dataset_section: if not isinstance(dataset_section, dict): @@ -483,11 +619,6 @@ def parse_structure(cls, data: Any) -> Any: if not isinstance(config_dict, dict): raise ValueError(f"Dataset '{dataset_name}' must contain a dict") - # Parse DatasetVirtualDBConfig - # The config_dict may contain: - # - sample_id (PropertyMapping) - # - comparative_analyses (list[ComparativeAnalysis]) - # - Other fields as PropertyMappings (via extra="allow") try: parsed_datasets[dataset_name] = ( DatasetVirtualDBConfig.model_validate(config_dict) @@ -518,16 +649,14 @@ class MetadataConfig(BaseModel): Specifies optional alias mappings for normalizing factor levels across heterogeneous datasets, plus property path mappings for each repository. - Attributes: - factor_aliases: Optional mappings of standardized names to actual values. - Example: {"carbon_source": - {"glucose": ["D-glucose", "dextrose"]}} - missing_value_labels: Labels for missing values by property name - description: Human-readable descriptions for each property - repositories: Dict mapping repository IDs to their configurations + :ivar factor_aliases: Optional mappings of standardized names to actual values. 
Example: {"carbon_source": {"glucose": ["D-glucose", "dextrose"]}}
+    :ivar missing_value_labels: Labels for missing values by property name
+    :ivar description: Human-readable descriptions for each property
+    :ivar repositories: Dict mapping repository IDs to their configurations
+
+    Example::
-    Example:
-        ```yaml
         repositories:
           BrentLab/harbison_2004:
             dataset:
@@ -544,6 +673,18 @@ class MetadataConfig(BaseModel):
                 carbon_source:
                   path: media.carbon_source
+
+          # Comparative dataset with a renamed field and links
+          BrentLab/yeast_comparative_analysis:
+            dataset:
+              dto:
+                dto_fdr:
+                  field: dto_fdr
+                # renames dto_empirical_pvalue to dto_pvalue
+                dto_pvalue:
+                  field: dto_empirical_pvalue
+                links:
+                  binding_id:
+                    - [BrentLab/harbison_2004, harbison_2004]
+
         factor_aliases:
           carbon_source:
             glucose: ["D-glucose", "dextrose"]
@@ -554,11 +695,10 @@ class MetadataConfig(BaseModel):
         description:
          carbon_source: "Carbon source in growth media"
-        ```

    """

-    factor_aliases: dict[str, dict[str, list[Any]]] = Field(
+    factor_aliases: FactorAliases = Field(
         default_factory=dict,
         description="Optional alias mappings for normalizing factor levels",
     )
@@ -574,74 +714,83 @@
         ..., description="Repository configurations keyed by repo ID"
     )

-    @field_validator("missing_value_labels", mode="before")
+    @field_validator("missing_value_labels", "description", mode="before")
     @classmethod
-    def validate_missing_value_labels(cls, v: Any) -> dict[str, str]:
-        """Validate missing value labels structure, filtering out None values."""
-        if not v:
-            return {}
-        if not isinstance(v, dict):
-            raise ValueError("missing_value_labels must be a dict")
-        # Filter out None values that may come from empty YAML values
-        return {k: val for k, val in v.items() if val is not None}
+    def filter_none_values(cls, v: dict[str, str] | None) -> dict[str, str]:
+        """
+        Filter out None values that may come from empty YAML values.

-    @field_validator("description", mode="before")
-    @classmethod
-    def validate_description(cls, v: Any) -> dict[str, str]:
-        """Validate description structure, filtering out None values."""
+        :param v: Dictionary that may contain None values
+        :return: Dictionary with None values filtered out
+
+        """
         if not v:
             return {}
-        if not isinstance(v, dict):
-            raise ValueError("description must be a dict")
-        # Filter out None values that may come from empty YAML values
+        # Pydantic will validate it's a dict, we just filter None values
         return {k: val for k, val in v.items() if val is not None}

-    @field_validator("factor_aliases")
+    @field_validator("factor_aliases", mode="after")
     @classmethod
-    def validate_factor_aliases(
-        cls, v: dict[str, dict[str, list[Any]]]
-    ) -> dict[str, dict[str, list[Any]]]:
-        """Validate factor alias structure."""
-        # Empty is OK - aliases are optional
-        if not v:
-            return v
+    def validate_factor_aliases(cls, v: FactorAliases) -> FactorAliases:
+        """
+        Validate factor alias structure and value types.
-        for prop_name, aliases in v.items():
-            if not isinstance(aliases, dict):
-                raise ValueError(
-                    f"Property '{prop_name}' aliases must be a dict, "
-                    f"got {type(aliases).__name__}"
-                )
+        :param v: Factor aliases dictionary
+        :return: Validated factor aliases
+        :raises ValueError: If any alias has an empty value list
-            # Validate each alias mapping
+        """
+        for prop_name, aliases in v.items():
             for alias_name, actual_values in aliases.items():
-                if not isinstance(actual_values, list):
-                    raise ValueError(
-                        f"Alias '{alias_name}' for '{prop_name}' must map "
-                        f"to a list of values"
-                    )
                 if not actual_values:
                     raise ValueError(
                         f"Alias '{alias_name}' for '{prop_name}' cannot "
                         f"have empty value list"
                     )
-                for val in actual_values:
-                    if not isinstance(val, (str, int, float, bool)):
-                        raise ValueError(
-                            f"Alias '{alias_name}' for '{prop_name}' contains "
-                            f"invalid value type: {type(val).__name__}"
-                        )
         return v

+    @model_validator(mode="after")
+    def validate_unique_db_names(self) -> "MetadataConfig":
+        """
+        Validate that all resolved db_names are unique across datasets.
+
+        Each dataset resolves to db_name or config_name. These must be unique to avoid
+        SQL view name collisions.
+
+        :return: The validated MetadataConfig instance
+        :raises ValueError: If duplicate db_names are found
+
+        """
+        seen: dict[str, str] = {}
+        for repo_id, repo_config in self.repositories.items():
+            if not repo_config.dataset:
+                continue
+            for config_name, dataset_config in repo_config.dataset.items():
+                resolved = dataset_config.db_name or config_name
+                key = resolved.lower()
+                if key in seen:
+                    raise ValueError(
+                        f"Duplicate db_name '{resolved}': used by "
+                        f"'{seen[key]}' and "
+                        f"'{repo_id}/{config_name}'"
+                    )
+                seen[key] = f"{repo_id}/{config_name}"
+        return self
+
     @model_validator(mode="before")
     @classmethod
-    def parse_repositories(cls, data: Any) -> Any:
-        """Parse repository configurations from 'repositories' key."""
+    def parse_repositories(cls, data: Any) -> dict[str, Any]:
+        """
+        Parse repository configurations from 'repositories' key.
+ + :param data: Raw configuration data + :return: Processed configuration with parsed repositories + :raises ValueError: If repositories are invalid or missing + + """ if not isinstance(data, dict): return data - # Extract repositories from 'repositories' key repositories_data = data.get("repositories", {}) if not repositories_data: @@ -650,9 +799,7 @@ def parse_repositories(cls, data: Any) -> Any: "with at least one repository" ) - if not isinstance(repositories_data, dict): - raise ValueError("'repositories' key must contain a dict") - + # Parse each repository config repositories = {} for repo_id, repo_config in repositories_data.items(): try: @@ -676,20 +823,19 @@ def from_yaml(cls, path: Path | str) -> "MetadataConfig": :param path: Path to YAML configuration file :return: Validated MetadataConfig instance + :raises ValidationError: If configuration is invalid :raises FileNotFoundError: If file doesn't exist - :raises ValueError: If configuration is invalid + :raises ValueError: If YAML file does not contain a dictionary """ - path = Path(path) - - if not path.exists(): - raise FileNotFoundError(f"Configuration file not found: {path}") - - with open(path) as f: + with open(Path(path)) as f: data = yaml.safe_load(f) if not isinstance(data, dict): - raise ValueError("Configuration must be a YAML dict") + raise ValueError( + f"Configuration file must contain a YAML dictionary, " + f"got {type(data).__name__} instead" + ) return cls.model_validate(data) @@ -727,8 +873,6 @@ def get_property_mappings( # Override with dataset-specific properties if repo_config.dataset and config_name in repo_config.dataset: dataset_config = repo_config.dataset[config_name] - # DatasetVirtualDBConfig stores property mappings in model_extra - if hasattr(dataset_config, "model_extra") and dataset_config.model_extra: - mappings.update(dataset_config.model_extra) + mappings.update(dataset_config.property_mappings) return mappings diff --git a/tfbpapi/models_deprecated.py b/tfbpapi/models_deprecated.py new file mode 100644 index 0000000..6888579 --- /dev/null +++ b/tfbpapi/models_deprecated.py @@ -0,0 +1,732 @@ +""" +Pydantic models for dataset card validation and metadata configuration. + +These models provide minimal structure for parsing HuggingFace dataset cards while +remaining flexible enough to accommodate diverse experimental systems. Most fields use +extra="allow" to accept domain-specific additions without requiring code changes. + +Also includes models for VirtualDB metadata normalization configuration. + +""" + +from enum import Enum +from pathlib import Path +from typing import Any + +import yaml # type: ignore[import-untyped] +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + + +class DatasetType(str, Enum): + """Supported dataset types.""" + + GENOMIC_FEATURES = "genomic_features" + ANNOTATED_FEATURES = "annotated_features" + GENOME_MAP = "genome_map" + METADATA = "metadata" + COMPARATIVE = "comparative" + + +class FeatureInfo(BaseModel): + """ + Information about a dataset feature/column. + + Minimal required fields with flexible dtype handling. + + """ + + name: str = Field(..., description="Column name in the data") + dtype: str | dict[str, Any] = Field( + ..., + description="Data type (string, int64, float64, etc.) or class_label dict", + ) + description: str = Field(..., description="Description of the field") + role: str | None = Field( + default=None, + description="Optional semantic role. 
'experimental_condition' " + "has special behavior.", + ) + definitions: dict[str, Any] | None = Field( + default=None, + description="For experimental_condition fields: definitions per value", + ) + + +class PartitioningInfo(BaseModel): + """Partitioning configuration for datasets.""" + + enabled: bool = Field(default=False, description="Whether partitioning is enabled") + partition_by: list[str] | None = Field( + default=None, description="Partition column names" + ) + path_template: str | None = Field( + default=None, description="Path template for partitioned files" + ) + + +class DatasetInfo(BaseModel): + """Dataset structure information.""" + + features: list[FeatureInfo] = Field(..., description="Feature definitions") + partitioning: PartitioningInfo | None = Field( + default=None, description="Partitioning configuration" + ) + + +class DataFileInfo(BaseModel): + """Information about data files.""" + + split: str = Field(default="train", description="Dataset split name") + path: str = Field(..., description="Path to data file(s)") + + +class DatasetConfig(BaseModel): + """ + Configuration for a dataset within a repository. + + Uses extra="allow" to accept arbitrary experimental_conditions and other fields. + + """ + + config_name: str = Field(..., description="Unique configuration identifier") + description: str = Field(..., description="Human-readable description") + dataset_type: DatasetType = Field(..., description="Type of dataset") + default: bool = Field( + default=False, description="Whether this is the default config" + ) + applies_to: list[str] | None = Field( + default=None, description="Configs this metadata applies to" + ) + metadata_fields: list[str] | None = Field( + default=None, description="Fields for embedded metadata extraction" + ) + data_files: list[DataFileInfo] = Field(..., description="Data file information") + dataset_info: DatasetInfo = Field(..., description="Dataset structure information") + + model_config = ConfigDict(extra="allow") + + @field_validator("applies_to") + @classmethod + def applies_to_only_for_metadata(cls, v, info): + """Validate that applies_to is only used for metadata or comparative configs.""" + if v is not None: + dataset_type = info.data.get("dataset_type") + if dataset_type not in (DatasetType.METADATA, DatasetType.COMPARATIVE): + raise ValueError( + "applies_to field is only valid " + "for metadata and comparative dataset types" + ) + return v + + @field_validator("metadata_fields") + @classmethod + def metadata_fields_validation(cls, v): + """Validate metadata_fields usage.""" + if v is not None and len(v) == 0: + raise ValueError("metadata_fields cannot be empty list, use None instead") + return v + + +class DatasetCard(BaseModel): + """ + Complete dataset card model. + + Uses extra="allow" to accept arbitrary top-level metadata and + experimental_conditions. 
+ + """ + + configs: list[DatasetConfig] = Field(..., description="Dataset configurations") + + model_config = ConfigDict(extra="allow") + + @field_validator("configs") + @classmethod + def configs_not_empty(cls, v): + """Ensure at least one config is present.""" + if not v: + raise ValueError("At least one dataset configuration is required") + return v + + @field_validator("configs") + @classmethod + def unique_config_names(cls, v): + """Ensure config names are unique.""" + names = [config.config_name for config in v] + if len(names) != len(set(names)): + raise ValueError("Configuration names must be unique") + return v + + @field_validator("configs") + @classmethod + def at_most_one_default(cls, v): + """Ensure at most one config is marked as default.""" + defaults = [config for config in v if config.default] + if len(defaults) > 1: + raise ValueError("At most one configuration can be marked as default") + return v + + def get_config_by_name(self, name: str) -> DatasetConfig | None: + """Get a configuration by name.""" + for config in self.configs: + if config.config_name == name: + return config + return None + + def get_configs_by_type(self, dataset_type: DatasetType) -> list[DatasetConfig]: + """Get all configurations of a specific type.""" + return [ + config for config in self.configs if config.dataset_type == dataset_type + ] + + def get_default_config(self) -> DatasetConfig | None: + """Get the default configuration if one exists.""" + defaults = [config for config in self.configs if config.default] + return defaults[0] if defaults else None + + def get_data_configs(self) -> list[DatasetConfig]: + """Get all non-metadata configurations.""" + return [ + config + for config in self.configs + if config.dataset_type != DatasetType.METADATA + ] + + def get_metadata_configs(self) -> list[DatasetConfig]: + """Get all metadata configurations.""" + return [ + config + for config in self.configs + if config.dataset_type == DatasetType.METADATA + ] + + +class ExtractedMetadata(BaseModel): + """Metadata extracted from datasets.""" + + config_name: str = Field(..., description="Source configuration name") + field_name: str = Field( + ..., description="Field name the metadata was extracted from" + ) + values: set[str] = Field(..., description="Unique values found") + extraction_method: str = Field(..., description="How the metadata was extracted") + + model_config = ConfigDict( + # Allow sets in JSON serialization + json_encoders={set: list} + ) + + +class MetadataRelationship(BaseModel): + """Relationship between a data config and its metadata.""" + + data_config: str = Field(..., description="Data configuration name") + metadata_config: str = Field(..., description="Metadata configuration name") + relationship_type: str = Field( + ..., description="Type of relationship (explicit, embedded)" + ) + + +# ============================================================================ +# VirtualDB Metadata Configuration Models +# ============================================================================ + + +class ComparativeAnalysis(BaseModel): + """ + Reference to a comparative dataset that includes this dataset. + + Comparative datasets relate samples across multiple source datasets. + This model specifies which comparative dataset references the current + dataset and through which field (via_field). 
+ + Attributes: + repo: HuggingFace repository ID of the comparative dataset + dataset: Config name of the comparative dataset + via_field: Field in the comparative dataset containing composite + identifiers that reference this dataset's samples. + Format: "repo_id;config_name;sample_id" + + Example: + ```python + # In BrentLab/callingcards config + ComparativeAnalysis( + repo="BrentLab/yeast_comparative_analysis", + dataset="dto", + via_field="binding_id" + ) + # Means: dto dataset has a binding_id field with values like: + # "BrentLab/callingcards;annotated_features;123" + ``` + + """ + + repo: str = Field(..., description="Comparative dataset repository ID") + dataset: str = Field(..., description="Comparative dataset config name") + via_field: str = Field( + ..., description="Field containing composite sample identifiers" + ) + + +class PropertyMapping(BaseModel): + """ + Mapping specification for a single property. + + Attributes: + path: Optional dot-notation path to the property value. + For repo/config-level: relative to experimental_conditions + For field-level: relative to field definitions + When omitted with field specified, creates a column alias. + field: Optional field name for field-level properties. + When specified, looks in this field's definitions. + When omitted, looks in repo/config-level experimental_conditions. + expression: Optional SQL expression for derived/computed fields. + When specified, creates a computed column. + Cannot be used with field or path. + dtype: Optional data type specification for type conversion. + Supported values: 'string', 'numeric', 'bool'. + When specified, extracted values are converted to this type. + + Examples: + Field-level property with path: + PropertyMapping(field="condition", path="media.carbon_source") + + Repo/config-level property: + PropertyMapping(path="temperature_celsius") + + Field-level column alias (no path): + PropertyMapping(field="condition") + + Derived field with expression: + PropertyMapping(expression="dto_fdr < 0.05") + + """ + + field: str | None = Field(None, description="Field name for field-level properties") + path: str | None = Field(None, description="Dot-notation path to property") + expression: str | None = Field( + None, description="SQL expression for derived fields" + ) + dtype: str | None = Field( + None, description="Data type for conversion: 'string', 'numeric', or 'bool'" + ) + + @field_validator("path") + @classmethod + def validate_path(cls, v: str | None) -> str | None: + """Ensure path is not just whitespace if provided.""" + if v is not None and not v.strip(): + raise ValueError("path cannot be empty or whitespace") + return v.strip() if v else None + + @field_validator("field") + @classmethod + def validate_field(cls, v: str | None) -> str | None: + """Ensure field is not empty string if provided.""" + if v is not None and not v.strip(): + raise ValueError("field cannot be empty or whitespace") + return v.strip() if v else None + + @field_validator("expression") + @classmethod + def validate_expression(cls, v: str | None) -> str | None: + """Ensure expression is not empty string if provided.""" + if v is not None and not v.strip(): + raise ValueError("expression cannot be empty or whitespace") + return v.strip() if v else None + + @model_validator(mode="after") + def validate_at_least_one_specified(self) -> "PropertyMapping": + """Ensure at least one field type is specified and mutually exclusive.""" + if self.expression is not None: + if self.field is not None or self.path is not None: + 
raise ValueError( + "expression cannot be used with field or path - " + "derived fields are computed, not extracted" + ) + elif self.field is None and self.path is None: + raise ValueError( + "At least one of 'field', 'path', or 'expression' must be specified" + ) + return self + + +class DatasetVirtualDBConfig(BaseModel): + """ + VirtualDB configuration for a specific dataset within a repository. + + Attributes: + sample_id: Mapping for the sample identifier field (required for + primary datasets) + comparative_analyses: Optional list of comparative datasets that + reference this dataset + properties: Property mappings for this specific dataset (field names to + PropertyMapping) + + Example: + ```yaml + # In BrentLab/callingcards config + annotated_features: + sample_id: + field: sample_id + comparative_analyses: + - repo: BrentLab/yeast_comparative_analysis + dataset: dto + via_field: binding_id + regulator_locus_tag: + field: regulator_locus_tag + dto_fdr: # Field from comparative dataset, optional renaming + field: dto_fdr + ``` + + """ + + sample_id: PropertyMapping | None = Field( + None, description="Mapping for sample identifier field" + ) + comparative_analyses: list[ComparativeAnalysis] = Field( + default_factory=list, + description="Comparative datasets referencing this dataset", + ) + # Allow additional property mappings via extra fields + model_config = ConfigDict(extra="allow") + + @model_validator(mode="before") + @classmethod + def parse_property_mappings(cls, data: Any) -> Any: + """Parse extra fields as PropertyMapping objects.""" + if not isinstance(data, dict): + return data + + # Process all fields except sample_id and comparative_analyses + result = {} + for key, value in data.items(): + if key in ("sample_id", "comparative_analyses"): + # These are typed fields, let Pydantic handle them + result[key] = value + elif isinstance(value, dict): + # Assume it's a PropertyMapping + try: + result[key] = PropertyMapping.model_validate(value) + except Exception as e: + raise ValueError( + f"Invalid PropertyMapping for field '{key}': {e}" + ) from e + else: + # Already parsed or wrong type + result[key] = value + + return result + + +class RepositoryConfig(BaseModel): + """ + Configuration for a single repository. Eg BrentLab/harbison_2004. 
+ + Attributes: + properties: Repo-wide property mappings that apply to all datasets + dataset: Dataset-specific configurations including sample_id, + comparative_analyses, and property mappings + + Example: + ```python + config = RepositoryConfig( + properties={ + "temperature_celsius": PropertyMapping(path="temperature_celsius") + }, + dataset={ + "dataset_name": DatasetVirtualDBConfig( + sample_id=PropertyMapping(field="sample_id"), + comparative_analyses=[ + ComparativeAnalysis( + repo="BrentLab/yeast_comparative_analysis", + dataset="dto", + via_field="binding_id" + ) + ], + # Additional property mappings via extra fields + **{"carbon_source": PropertyMapping( + field="condition", + path="media.carbon_source" + )} + ) + } + ) + ``` + + """ + + properties: dict[str, PropertyMapping] = Field( + default_factory=dict, description="Repo-wide property mappings" + ) + dataset: dict[str, DatasetVirtualDBConfig] | None = Field( + None, description="Dataset-specific configurations" + ) + + @model_validator(mode="before") + @classmethod + def parse_structure(cls, data: Any) -> Any: + """Parse raw dict structure into typed objects.""" + if not isinstance(data, dict): + return data + + # Extract and parse dataset section + dataset_section = data.get("dataset") + parsed_datasets: dict[str, DatasetVirtualDBConfig] = {} + + if dataset_section: + if not isinstance(dataset_section, dict): + raise ValueError("'dataset' key must contain a dict") + for dataset_name, config_dict in dataset_section.items(): + if not isinstance(config_dict, dict): + raise ValueError(f"Dataset '{dataset_name}' must contain a dict") + + # Parse DatasetVirtualDBConfig + # The config_dict may contain: + # - sample_id (PropertyMapping) + # - comparative_analyses (list[ComparativeAnalysis]) + # - Other fields as PropertyMappings (via extra="allow") + try: + parsed_datasets[dataset_name] = ( + DatasetVirtualDBConfig.model_validate(config_dict) + ) + except Exception as e: + raise ValueError( + f"Invalid configuration for dataset '{dataset_name}': {e}" + ) from e + + # Parse repo-wide properties (all keys except 'dataset') + parsed_properties = {} + for key, value in data.items(): + if key == "dataset": + continue + + try: + parsed_properties[key] = PropertyMapping.model_validate(value) + except Exception as e: + raise ValueError(f"Invalid repo-wide property '{key}': {e}") from e + + return {"properties": parsed_properties, "dataset": parsed_datasets} + + +class MetadataConfig(BaseModel): + """ + Configuration for building standardized metadata tables. + + Specifies optional alias mappings for normalizing factor levels across + heterogeneous datasets, plus property path mappings for each repository. + + Attributes: + factor_aliases: Optional mappings of standardized names to actual values. 
+ Example: {"carbon_source": + {"glucose": ["D-glucose", "dextrose"]}} + missing_value_labels: Labels for missing values by property name + description: Human-readable descriptions for each property + repositories: Dict mapping repository IDs to their configurations + + Example: + ```yaml + repositories: + BrentLab/harbison_2004: + dataset: + harbison_2004: + carbon_source: + field: condition + path: media.carbon_source + + BrentLab/kemmeren_2014: + temperature: + path: temperature_celsius + dataset: + kemmeren_2014: + carbon_source: + path: media.carbon_source + + factor_aliases: + carbon_source: + glucose: ["D-glucose", "dextrose"] + galactose: ["D-galactose", "Galactose"] + + missing_value_labels: + carbon_source: "unspecified" + + description: + carbon_source: "Carbon source in growth media" + ``` + + """ + + factor_aliases: dict[str, dict[str, list[Any]]] = Field( + default_factory=dict, + description="Optional alias mappings for normalizing factor levels", + ) + missing_value_labels: dict[str, str] = Field( + default_factory=dict, + description="Labels for missing values by property name", + ) + description: dict[str, str] = Field( + default_factory=dict, + description="Human-readable descriptions for each property", + ) + repositories: dict[str, RepositoryConfig] = Field( + ..., description="Repository configurations keyed by repo ID" + ) + + @field_validator("missing_value_labels", mode="before") + @classmethod + def validate_missing_value_labels(cls, v: Any) -> dict[str, str]: + """Validate missing value labels structure, filtering out None values.""" + if not v: + return {} + if not isinstance(v, dict): + raise ValueError("missing_value_labels must be a dict") + # Filter out None values that may come from empty YAML values + return {k: val for k, val in v.items() if val is not None} + + @field_validator("description", mode="before") + @classmethod + def validate_description(cls, v: Any) -> dict[str, str]: + """Validate description structure, filtering out None values.""" + if not v: + return {} + if not isinstance(v, dict): + raise ValueError("description must be a dict") + # Filter out None values that may come from empty YAML values + return {k: val for k, val in v.items() if val is not None} + + @field_validator("factor_aliases") + @classmethod + def validate_factor_aliases( + cls, v: dict[str, dict[str, list[Any]]] + ) -> dict[str, dict[str, list[Any]]]: + """Validate factor alias structure.""" + # Empty is OK - aliases are optional + if not v: + return v + + for prop_name, aliases in v.items(): + if not isinstance(aliases, dict): + raise ValueError( + f"Property '{prop_name}' aliases must be a dict, " + f"got {type(aliases).__name__}" + ) + + # Validate each alias mapping + for alias_name, actual_values in aliases.items(): + if not isinstance(actual_values, list): + raise ValueError( + f"Alias '{alias_name}' for '{prop_name}' must map " + f"to a list of values" + ) + if not actual_values: + raise ValueError( + f"Alias '{alias_name}' for '{prop_name}' cannot " + f"have empty value list" + ) + for val in actual_values: + if not isinstance(val, (str, int, float, bool)): + raise ValueError( + f"Alias '{alias_name}' for '{prop_name}' contains " + f"invalid value type: {type(val).__name__}" + ) + + return v + + @model_validator(mode="before") + @classmethod + def parse_repositories(cls, data: Any) -> Any: + """Parse repository configurations from 'repositories' key.""" + if not isinstance(data, dict): + return data + + # Extract repositories from 'repositories' key + 
repositories_data = data.get("repositories", {}) + + if not repositories_data: + raise ValueError( + "Configuration must have a 'repositories' key " + "with at least one repository" + ) + + if not isinstance(repositories_data, dict): + raise ValueError("'repositories' key must contain a dict") + + repositories = {} + for repo_id, repo_config in repositories_data.items(): + try: + repositories[repo_id] = RepositoryConfig.model_validate(repo_config) + except Exception as e: + raise ValueError( + f"Invalid configuration for repository '{repo_id}': {e}" + ) from e + + return { + "factor_aliases": data.get("factor_aliases", {}), + "missing_value_labels": data.get("missing_value_labels", {}), + "description": data.get("description", {}), + "repositories": repositories, + } + + @classmethod + def from_yaml(cls, path: Path | str) -> "MetadataConfig": + """ + Load and validate configuration from YAML file. + + :param path: Path to YAML configuration file + :return: Validated MetadataConfig instance + :raises FileNotFoundError: If file doesn't exist + :raises ValueError: If configuration is invalid + + """ + path = Path(path) + + if not path.exists(): + raise FileNotFoundError(f"Configuration file not found: {path}") + + with open(path) as f: + data = yaml.safe_load(f) + + if not isinstance(data, dict): + raise ValueError("Configuration must be a YAML dict") + + return cls.model_validate(data) + + def get_repository_config(self, repo_id: str) -> RepositoryConfig | None: + """ + Get configuration for a specific repository. + + :param repo_id: Repository ID (e.g., "BrentLab/harbison_2004") + :return: RepositoryConfig instance or None if not found + + """ + return self.repositories.get(repo_id) + + def get_property_mappings( + self, repo_id: str, config_name: str + ) -> dict[str, PropertyMapping]: + """ + Get merged property mappings for a repo/dataset combination. + + Merges repo-wide and dataset-specific mappings, with dataset-specific taking + precedence. 
+ + :param repo_id: Repository ID + :param config_name: Dataset/config name + :return: Dict mapping property names to PropertyMapping objects + + """ + repo_config = self.get_repository_config(repo_id) + if not repo_config: + return {} + + # Start with repo-wide properties + mappings: dict[str, PropertyMapping] = dict(repo_config.properties) + + # Override with dataset-specific properties + if repo_config.dataset and config_name in repo_config.dataset: + dataset_config = repo_config.dataset[config_name] + # DatasetVirtualDBConfig stores property mappings in model_extra + if hasattr(dataset_config, "model_extra") and dataset_config.model_extra: + mappings.update(dataset_config.model_extra) + + return mappings diff --git a/tfbpapi/tests/test_metadata_config_models.py b/tfbpapi/tests/test_metadata_config_models.py index 1697930..9516e3d 100644 --- a/tfbpapi/tests/test_metadata_config_models.py +++ b/tfbpapi/tests/test_metadata_config_models.py @@ -35,19 +35,19 @@ def test_invalid_empty_path(self): """Test that empty path is rejected.""" with pytest.raises(ValidationError) as exc_info: PropertyMapping(path="") - assert "path cannot be empty" in str(exc_info.value) + assert "cannot be empty" in str(exc_info.value) def test_invalid_whitespace_path(self): """Test that whitespace-only path is rejected.""" with pytest.raises(ValidationError) as exc_info: PropertyMapping(path=" ") - assert "path cannot be empty" in str(exc_info.value) + assert "cannot be empty" in str(exc_info.value) def test_invalid_empty_field(self): """Test that empty field string is rejected.""" with pytest.raises(ValidationError) as exc_info: PropertyMapping(field="", path="media.carbon_source") - assert "field cannot be empty" in str(exc_info.value) + assert "cannot be empty" in str(exc_info.value) def test_path_whitespace_stripped(self): """Test that path whitespace is stripped.""" @@ -89,23 +89,6 @@ def test_invalid_expression_with_path(self): assert "expression cannot be used with field or path" in str(exc_info.value) -class TestComparativeAnalysis: - """Tests for ComparativeAnalysis model.""" - - def test_valid_comparative_analysis(self): - """Test valid comparative analysis configuration.""" - from tfbpapi.models import ComparativeAnalysis - - ca = ComparativeAnalysis( - repo="BrentLab/yeast_comparative_analysis", - dataset="dto", - via_field="binding_id", - ) - assert ca.repo == "BrentLab/yeast_comparative_analysis" - assert ca.dataset == "dto" - assert ca.via_field == "binding_id" - - class TestDatasetVirtualDBConfig: """Tests for DatasetVirtualDBConfig model.""" @@ -117,26 +100,34 @@ def test_valid_config_with_sample_id(self): assert config.sample_id is not None assert config.sample_id.field == "sample_id" - def test_valid_config_with_comparative_analyses(self): - """Test valid dataset config with comparative analyses.""" + def test_valid_config_with_field_mappings_and_links(self): + """Test valid dataset config with field mappings and links for comparative + datasets.""" from tfbpapi.models import DatasetVirtualDBConfig config_dict = { "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/yeast_comparative_analysis", - "dataset": "dto", - "via_field": "binding_id", - } - ], + "dto_fdr": {"field": "dto_fdr"}, + # Field mapping for aliasing: dto_pvalue displays dto_empirical_pvalue + "dto_pvalue": {"field": "dto_empirical_pvalue"}, + "links": { + "binding_id": [ + ["BrentLab/harbison_2004", "harbison_2004"], + ["BrentLab/callingcards", "annotated_features"], + ] + }, } config = 
DatasetVirtualDBConfig.model_validate(config_dict) assert config.sample_id is not None - assert len(config.comparative_analyses) == 1 - assert ( - config.comparative_analyses[0].repo == "BrentLab/yeast_comparative_analysis" - ) + # Check field mapping for aliasing via property_mappings + assert "dto_pvalue" in config.property_mappings + assert config.property_mappings["dto_pvalue"].field == "dto_empirical_pvalue" + assert "binding_id" in config.links + assert len(config.links["binding_id"]) == 2 + assert config.links["binding_id"][0] == [ + "BrentLab/harbison_2004", + "harbison_2004", + ] def test_config_with_extra_property_mappings(self): """Test that extra fields are parsed as PropertyMappings.""" @@ -153,6 +144,73 @@ def test_config_with_extra_property_mappings(self): assert "regulator_locus_tag" in config.model_extra assert "dto_fdr" in config.model_extra + def test_valid_db_name(self): + """Test valid db_name is accepted.""" + from tfbpapi.models import DatasetVirtualDBConfig + + config = DatasetVirtualDBConfig.model_validate( + {"db_name": "harbison", "sample_id": {"field": "sample_id"}} + ) + assert config.db_name == "harbison" + + def test_db_name_none_by_default(self): + """Test db_name defaults to None.""" + from tfbpapi.models import DatasetVirtualDBConfig + + config = DatasetVirtualDBConfig.model_validate( + {"sample_id": {"field": "sample_id"}} + ) + assert config.db_name is None + + def test_db_name_invalid_sql_identifier(self): + """Test that invalid SQL identifiers are rejected.""" + from tfbpapi.models import DatasetVirtualDBConfig + + with pytest.raises(ValidationError) as exc_info: + DatasetVirtualDBConfig.model_validate( + {"db_name": "123bad", "sample_id": {"field": "sample_id"}} + ) + assert "not a valid SQL identifier" in str(exc_info.value) + + def test_db_name_with_spaces_rejected(self): + """Test that db_name with spaces is rejected.""" + from tfbpapi.models import DatasetVirtualDBConfig + + with pytest.raises(ValidationError) as exc_info: + DatasetVirtualDBConfig.model_validate( + {"db_name": "my table", "sample_id": {"field": "sample_id"}} + ) + assert "not a valid SQL identifier" in str(exc_info.value) + + def test_db_name_reserved_samples(self): + """Test that 'samples' is reserved and rejected.""" + from tfbpapi.models import DatasetVirtualDBConfig + + with pytest.raises(ValidationError) as exc_info: + DatasetVirtualDBConfig.model_validate( + {"db_name": "samples", "sample_id": {"field": "sample_id"}} + ) + assert "reserved" in str(exc_info.value) + + def test_db_name_reserved_case_insensitive(self): + """Test that reserved name check is case-insensitive.""" + from tfbpapi.models import DatasetVirtualDBConfig + + with pytest.raises(ValidationError) as exc_info: + DatasetVirtualDBConfig.model_validate( + {"db_name": "Samples", "sample_id": {"field": "sample_id"}} + ) + assert "reserved" in str(exc_info.value) + + def test_db_name_underscores_allowed(self): + """Test that underscores are allowed in db_name.""" + from tfbpapi.models import DatasetVirtualDBConfig + + config = DatasetVirtualDBConfig.model_validate( + {"db_name": "_my_table_2", "sample_id": {"field": "sample_id"}} + ) + assert config.db_name == "_my_table_2" + class TestRepositoryConfig: """Tests for RepositoryConfig model.""" @@ -469,7 +527,9 @@ def test_invalid_yaml_structure(self, tmp_path): with pytest.raises(ValueError) as exc_info: MetadataConfig.from_yaml(config_path) - assert "Configuration must be a YAML dict" in str(exc_info.value) + assert "Configuration file must contain a YAML 
dictionary" in str( + exc_info.value + ) def test_nested_alias_property_names(self, tmp_path): """Test that alias property names can use dot notation.""" @@ -512,3 +572,145 @@ def test_nested_alias_property_names(self, tmp_path): assert config.factor_aliases["carbon_source.specifications"]["no_aa"] == [ "without_amino_acids" ] + + def test_unique_db_names_valid(self): + """Test that unique db_names across datasets pass validation.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "dataset_a": { + "db_name": "alpha", + "sample_id": {"field": "sample_id"}, + }, + "dataset_b": { + "db_name": "beta", + "sample_id": {"field": "sample_id"}, + }, + } + } + } + } + config = MetadataConfig.model_validate(config_data) + repo = config.get_repository_config("BrentLab/repo1") + assert repo.dataset["dataset_a"].db_name == "alpha" + assert repo.dataset["dataset_b"].db_name == "beta" + + def test_duplicate_db_names_rejected(self): + """Test that duplicate db_names are rejected.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "dataset_a": { + "db_name": "same_name", + "sample_id": {"field": "sample_id"}, + }, + "dataset_b": { + "db_name": "same_name", + "sample_id": {"field": "sample_id"}, + }, + } + } + } + } + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "Duplicate db_name" in str(exc_info.value) + + def test_duplicate_db_name_case_insensitive(self): + """Test that db_name uniqueness is case-insensitive.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "dataset_a": { + "db_name": "Alpha", + "sample_id": {"field": "sample_id"}, + }, + "dataset_b": { + "db_name": "alpha", + "sample_id": {"field": "sample_id"}, + }, + } + } + } + } + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "Duplicate db_name" in str(exc_info.value) + + def test_db_name_falls_back_to_config_name(self): + """Test that config_name is used when db_name is not set.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "harbison_2004": { + "sample_id": {"field": "sample_id"}, + } + } + }, + "BrentLab/repo2": { + "dataset": { + "kemmeren_2014": { + "sample_id": {"field": "sample_id"}, + } + } + }, + } + } + # Should pass -- different config_names used as fallback + config = MetadataConfig.model_validate(config_data) + assert config is not None + + def test_db_name_collides_with_config_name(self): + """Test that db_name colliding with another config_name is rejected.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "harbison": { + "sample_id": {"field": "sample_id"}, + } + } + }, + "BrentLab/repo2": { + "dataset": { + "kemmeren": { + "db_name": "harbison", + "sample_id": {"field": "sample_id"}, + } + } + }, + } + } + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "Duplicate db_name" in str(exc_info.value) + + def test_duplicate_db_names_across_repos(self): + """Test that db_name uniqueness spans across repositories.""" + config_data = { + "repositories": { + "BrentLab/repo1": { + "dataset": { + "ds1": { + "db_name": "shared", + "sample_id": {"field": "sample_id"}, + } + } + }, + "BrentLab/repo2": { + "dataset": { + "ds2": { + "db_name": "shared", + "sample_id": {"field": "sample_id"}, + } + } + }, + } + } + with pytest.raises(ValidationError) as exc_info: + MetadataConfig.model_validate(config_data) + assert "Duplicate db_name" 
in str(exc_info.value) diff --git a/tfbpapi/tests/test_models.py b/tfbpapi/tests/test_models.py index 1771c4d..4506131 100644 --- a/tfbpapi/tests/test_models.py +++ b/tfbpapi/tests/test_models.py @@ -476,7 +476,7 @@ def test_get_default_config(self): ), ] ) - default = card.get_default_config() + default = card.default_config assert default is not None assert default.config_name == "data2" diff --git a/tfbpapi/tests/test_virtual_db.py b/tfbpapi/tests/test_virtual_db.py index 1293bf9..e62b840 100644 --- a/tfbpapi/tests/test_virtual_db.py +++ b/tfbpapi/tests/test_virtual_db.py @@ -1,695 +1,782 @@ """ -Tests for VirtualDB unified query interface. +Tests for the SQL-first VirtualDB interface. -Tests configuration loading, schema discovery, querying, filtering, and caching. +Uses local Parquet fixtures and monkeypatches ``_resolve_parquet_files`` +and ``_cached_datacard`` so no network access is needed. """ -import tempfile from pathlib import Path +from unittest.mock import MagicMock +import duckdb import pandas as pd import pytest import yaml # type: ignore -from tfbpapi.virtual_db import VirtualDB, get_nested_value, normalize_value +from tfbpapi.virtual_db import VirtualDB + +# ------------------------------------------------------------------ +# Fixtures +# ------------------------------------------------------------------ + + +def _write_parquet(path: Path, df: pd.DataFrame) -> str: + """Write a DataFrame to a Parquet file using DuckDB.""" + conn = duckdb.connect(":memory:") + conn.execute(f"COPY (SELECT * FROM df) TO '{path}' (FORMAT PARQUET)") + conn.close() + return str(path) + + +@pytest.fixture() +def parquet_dir(tmp_path): + """ + Create Parquet files for two primary datasets and one comparative. + + harbison has a ``condition`` column (like the real dataset) rather + than ``carbon_source`` / ``temperature_celsius`` as raw columns. + Those are derived from DataCard field definitions via config + property mappings. + + kemmeren has no ``condition`` column; carbon_source and + temperature_celsius come from config-level (path-only) mappings + that resolve to constants from the DataCard. + + Returns dict mapping (repo_id, config_name) -> [parquet_path]. 
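+
+    Shape of the returned mapping (paths land under pytest's tmp_path;
+    the path below is illustrative)::
+
+        {("BrentLab/harbison", "harbison_2004"): [".../harbison.parquet"]}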
+ + """ + # harbison: 4 samples; samples 1-3 have 2 target measurements each, + # sample 4 has 2 targets but condition "Unknown" has no definition + # for carbon_source (tests missing_value_labels fallback) + harbison_df = pd.DataFrame( + { + "sample_id": [1, 1, 2, 2, 3, 3, 4, 4], + "regulator_locus_tag": [ + "YBR049C", + "YBR049C", + "YDR463W", + "YDR463W", + "YBR049C", + "YBR049C", + "YDR463W", + "YDR463W", + ], + "regulator_symbol": [ + "REB1", + "REB1", + "STP1", + "STP1", + "REB1", + "REB1", + "STP1", + "STP1", + ], + "condition": [ + "YPD", + "YPD", + "Galactose", + "Galactose", + "Acid", + "Acid", + "Unknown", + "Unknown", + ], + "target_locus_tag": [ + "YAL001C", + "YAL002W", + "YAL001C", + "YAL003W", + "YAL002W", + "YAL003W", + "YAL001C", + "YAL002W", + ], + "effect": [1.5, 0.8, 2.1, 0.3, 1.2, 0.9, 0.6, 1.0], + "pvalue": [0.01, 0.4, 0.001, 0.9, 0.05, 0.3, 0.2, 0.7], + } + ) + # kemmeren: 2 samples, each with 2 targets = 4 rows + # No condition column; carbon_source comes from path-only mapping + kemmeren_df = pd.DataFrame( + { + "sample_id": [10, 10, 11, 11], + "regulator_locus_tag": [ + "YBR049C", + "YBR049C", + "YDR463W", + "YDR463W", + ], + "regulator_symbol": [ + "REB1", + "REB1", + "STP1", + "STP1", + ], + "target_locus_tag": [ + "YAL001C", + "YAL002W", + "YAL001C", + "YAL003W", + ], + "effect": [1.1, 0.7, 1.8, 0.5], + "pvalue": [0.02, 0.5, 0.003, 0.7], + } + ) + dto_df = pd.DataFrame( + { + "binding_id": [ + "BrentLab/harbison;harbison_2004;1", + "BrentLab/harbison;harbison_2004;2", + "BrentLab/harbison;harbison_2004;3", + ], + "perturbation_id": [ + "BrentLab/kemmeren;kemmeren_2014;10", + "BrentLab/kemmeren;kemmeren_2014;11", + "BrentLab/kemmeren;kemmeren_2014;10", + ], + "dto_empirical_pvalue": [0.001, 0.05, 0.8], + "dto_fdr": [0.01, 0.1, 0.9], + } + ) + files = {} + h_path = tmp_path / "harbison.parquet" + files[("BrentLab/harbison", "harbison_2004")] = [ + _write_parquet(h_path, harbison_df) + ] -class TestHelperFunctions: - """Tests for helper functions.""" + k_path = tmp_path / "kemmeren.parquet" + files[("BrentLab/kemmeren", "kemmeren_2014")] = [ + _write_parquet(k_path, kemmeren_df) + ] - def test_get_nested_value_simple(self): - """Test simple nested dict navigation.""" - data = {"media": {"name": "YPD"}} - result = get_nested_value(data, "media.name") - assert result == "YPD" + d_path = tmp_path / "dto.parquet" + files[("BrentLab/comp", "dto")] = [_write_parquet(d_path, dto_df)] - def test_get_nested_value_missing_key(self): - """Test that missing keys return None.""" - data = {"media": {"name": "YPD"}} - result = get_nested_value(data, "media.carbon_source") - assert result is None + return files - def test_get_nested_value_list_extraction(self): - """Test extracting property from list of dicts.""" - data = { - "media": { - "carbon_source": [{"compound": "glucose"}, {"compound": "galactose"}] + +@pytest.fixture() +def config_path(tmp_path): + """Create a YAML config file for the test datasets.""" + config = { + "factor_aliases": { + "carbon_source": { + "glucose": ["D-glucose", "dextrose"], + "galactose": ["D-galactose"], } + }, + "missing_value_labels": {"carbon_source": "unspecified"}, + "repositories": { + "BrentLab/harbison": { + "dataset": { + "harbison_2004": { + "db_name": "harbison", + "sample_id": {"field": "sample_id"}, + "regulator_locus_tag": { + "field": "regulator_locus_tag", + }, + "regulator_symbol": { + "field": "regulator_symbol", + }, + # field+path: derive from condition definitions + "carbon_source": { + "field": "condition", + "path": 
"media.carbon_source.compound", + }, + "temperature_celsius": { + "field": "condition", + "path": "temperature_celsius", + "dtype": "numeric", + }, + # field-only rename + "environmental_condition": { + "field": "condition", + }, + } + } + }, + "BrentLab/kemmeren": { + # repo-level path-only mappings (constants) + # Paths include experimental_conditions prefix + # to match real datacard model_extra structure + "carbon_source": { + "path": ("experimental_conditions" ".media.carbon_source.compound"), + }, + "temperature_celsius": { + "path": ("experimental_conditions" ".temperature_celsius"), + "dtype": "numeric", + }, + "dataset": { + "kemmeren_2014": { + "db_name": "kemmeren", + "sample_id": {"field": "sample_id"}, + "regulator_locus_tag": { + "field": "regulator_locus_tag", + }, + "regulator_symbol": { + "field": "regulator_symbol", + }, + } + }, + }, + "BrentLab/comp": { + "dataset": { + "dto": { + "dto_pvalue": {"field": "dto_empirical_pvalue"}, + "dto_fdr": {"field": "dto_fdr"}, + "links": { + "binding_id": [ + [ + "BrentLab/harbison", + "harbison_2004", + ], + ], + "perturbation_id": [ + [ + "BrentLab/kemmeren", + "kemmeren_2014", + ], + ], + }, + } + } + }, + }, + } + p = tmp_path / "config.yaml" + with open(p, "w") as f: + yaml.dump(config, f) + return p + + +# metadata_fields per dataset (mirrors what the DataCard would return) +METADATA_FIELDS = { + "harbison_2004": [ + "regulator_locus_tag", + "regulator_symbol", + "condition", + ], + "kemmeren_2014": [ + "regulator_locus_tag", + "regulator_symbol", + ], +} + +# Field definitions from DataCard (condition field for harbison) +HARBISON_CONDITION_DEFS = { + "YPD": { + "temperature_celsius": 30, + "media": { + "carbon_source": [ + {"compound": "D-glucose"}, + ], + }, + }, + "Galactose": { + "temperature_celsius": 30, + "media": { + "carbon_source": [ + {"compound": "D-galactose"}, + ], + }, + }, + "Acid": { + "temperature_celsius": 30, + "media": { + "carbon_source": [ + {"compound": "D-glucose"}, + ], + }, + }, +} + +# Experimental conditions from DataCard (kemmeren -- config-level) +KEMMEREN_EXP_CONDITIONS = { + "temperature_celsius": 30, + "media": { + "carbon_source": [ + {"compound": "D-glucose"}, + ], + }, +} + + +def _make_mock_datacard(repo_id): + """Create a mock DataCard for testing.""" + card = MagicMock() + + if repo_id == "BrentLab/harbison": + config_mock = MagicMock() + config_mock.metadata_fields = METADATA_FIELDS["harbison_2004"] + card.get_config.return_value = config_mock + card.get_field_definitions.return_value = HARBISON_CONDITION_DEFS + card.get_experimental_conditions.return_value = {} + elif repo_id == "BrentLab/kemmeren": + config_mock = MagicMock() + config_mock.metadata_fields = METADATA_FIELDS["kemmeren_2014"] + # model_extra at config level (no experimental_conditions + # at this level for kemmeren) + config_mock.model_extra = {} + card.get_config.return_value = config_mock + card.get_field_definitions.return_value = {} + # model_extra at top level with experimental_conditions + # wrapper -- matches real DataCard structure + dataset_card_mock = MagicMock() + dataset_card_mock.model_extra = { + "experimental_conditions": KEMMEREN_EXP_CONDITIONS, } - result = get_nested_value(data, "media.carbon_source.compound") - assert result == ["glucose", "galactose"] - - def test_get_nested_value_non_dict(self): - """Test that non-dict input returns None.""" - result = get_nested_value("not a dict", "path") # type: ignore - assert result is None - - def test_normalize_value_exact_match(self): - """Test exact alias 
match.""" - aliases = {"glucose": ["D-glucose", "dextrose"]} - result = normalize_value("D-glucose", aliases) - assert result == "glucose" - - def test_normalize_value_case_insensitive(self): - """Test case-insensitive matching.""" - aliases = {"glucose": ["D-glucose", "dextrose"]} - result = normalize_value("DEXTROSE", aliases) - assert result == "glucose" - - def test_normalize_value_no_match(self): - """Test pass-through when no alias matches.""" - aliases = {"glucose": ["D-glucose"]} - result = normalize_value("maltose", aliases) - assert result == "maltose" - - def test_normalize_value_no_aliases(self): - """Test pass-through when no aliases provided.""" - result = normalize_value("D-glucose", None) - assert result == "D-glucose" - - def test_normalize_value_missing_value_label(self): - """Test missing value handling.""" - result = normalize_value(None, None, "unspecified") - assert result == "unspecified" - - def test_normalize_value_missing_value_no_label(self): - """Test missing value without label.""" - result = normalize_value(None, None) - assert result == "None" + card.dataset_card = dataset_card_mock + else: + config_mock = MagicMock() + config_mock.metadata_fields = None + card.get_config.return_value = config_mock + card.get_field_definitions.return_value = {} + card.get_experimental_conditions.return_value = {} + + return card + + +@pytest.fixture() +def vdb(config_path, parquet_dir, monkeypatch): + """Return a VirtualDB with _resolve_parquet_files and _cached_datacard monkeypatched + for local testing.""" + import tfbpapi.virtual_db as vdb_module + + v = VirtualDB(config_path) + + def _fake_resolve(self, repo_id, config_name): + return parquet_dir.get((repo_id, config_name), []) + + monkeypatch.setattr(VirtualDB, "_resolve_parquet_files", _fake_resolve) + monkeypatch.setattr( + vdb_module, + "_cached_datacard", + lambda repo_id, token=None: _make_mock_datacard(repo_id), + ) + return v + + +# ------------------------------------------------------------------ +# Tests: Initialisation and config +# ------------------------------------------------------------------ class TestVirtualDBConfig: """Tests for VirtualDB configuration loading.""" - def create_test_config(self, **overrides): - """Helper to create test configuration file.""" - config = { - "factor_aliases": { - "carbon_source": { - "glucose": ["D-glucose", "dextrose"], - "galactose": ["D-galactose", "Galactose"], - } - }, - "missing_value_labels": {"carbon_source": "unspecified"}, - "description": {"carbon_source": "Carbon source in growth media"}, - "repositories": { - "BrentLab/test_repo": { - "temperature_celsius": {"path": "temperature_celsius"}, - "dataset": { - "test_dataset": { - "carbon_source": { - "field": "condition", - "path": "media.carbon_source.compound", - } - } - }, - } - }, - } - config.update(overrides) - return config - - def test_init_with_valid_config(self): - """Test VirtualDB initialization with valid config.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - yaml.dump(self.create_test_config(), f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - assert vdb.config is not None - assert vdb.token is None - assert len(vdb.cache) == 0 - finally: - Path(config_path).unlink() - - def test_init_with_token(self): - """Test VirtualDB initialization with HF token.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - yaml.dump(self.create_test_config(), f) - config_path = f.name - - try: - vdb = VirtualDB(config_path, 
token="test_token") - assert vdb.token == "test_token" - finally: - Path(config_path).unlink() - - def test_init_missing_config_file(self): - """Test error when config file doesn't exist.""" + def test_init_loads_config(self, config_path, monkeypatch): + """Test that config loads without error.""" + v = VirtualDB(config_path) + assert v.config is not None + assert v.token is None + + def test_init_with_token(self, config_path, monkeypatch): + """Test token is stored.""" + v = VirtualDB(config_path, token="tok123") + assert v.token == "tok123" + + def test_init_missing_file(self): + """Test FileNotFoundError for missing config.""" with pytest.raises(FileNotFoundError): VirtualDB("/nonexistent/path.yaml") - def test_repr(self): - """Test string representation.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - yaml.dump(self.create_test_config(), f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - repr_str = repr(vdb) - assert "VirtualDB" in repr_str - assert "1 repositories" in repr_str - assert "1 datasets configured" in repr_str - assert "0 views cached" in repr_str - finally: - Path(config_path).unlink() - - -class TestSchemaDiscovery: - """Tests for schema discovery methods.""" - - def create_multi_dataset_config(self): - """Create config with multiple datasets.""" - return { - "factor_aliases": {}, - "repositories": { - "BrentLab/repo1": { - "temperature_celsius": {"path": "temperature_celsius"}, - "dataset": { - "dataset1": { - "carbon_source": { - "field": "condition", - "path": "media.carbon_source", - } - } - }, - }, - "BrentLab/repo2": { - "nitrogen_source": {"path": "media.nitrogen_source"}, - "dataset": { - "dataset2": { - "carbon_source": {"path": "media.carbon_source"}, - "temperature_celsius": {"path": "temperature_celsius"}, - } - }, - }, - }, - } + def test_repr_before_views(self, config_path): + """Test repr before views are registered.""" + v = VirtualDB(config_path) + r = repr(v) + assert "VirtualDB" in r + assert "views not yet registered" in r + + def test_repr_after_views(self, vdb): + """Test repr after views are registered.""" + vdb.tables() # triggers view registration + r = repr(vdb) + assert "VirtualDB" in r + assert "views)" in r + + def test_db_name_map(self, config_path): + """Test that _db_name_map resolves db_name correctly.""" + v = VirtualDB(config_path) + assert "harbison" in v._db_name_map + assert "kemmeren" in v._db_name_map + assert "dto" in v._db_name_map + assert v._db_name_map["harbison"] == ( + "BrentLab/harbison", + "harbison_2004", + ) - def test_get_fields_all_datasets(self): - """Test getting all fields across all datasets.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - yaml.dump(self.create_multi_dataset_config(), f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - fields = vdb.get_fields() - assert "carbon_source" in fields - assert "temperature_celsius" in fields - assert "nitrogen_source" in fields - assert fields == sorted(fields) # Should be sorted - finally: - Path(config_path).unlink() - - def test_get_fields_specific_dataset(self): - """Test getting fields for specific dataset.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - yaml.dump(self.create_multi_dataset_config(), f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - fields = vdb.get_fields("BrentLab/repo1", "dataset1") - assert "carbon_source" in fields - assert "temperature_celsius" in fields - # nitrogen_source is in 
repo2, not repo1 - assert "nitrogen_source" not in fields - finally: - Path(config_path).unlink() - - def test_get_fields_invalid_partial_args(self): - """Test error when only one of repo_id/config_name provided.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - yaml.dump(self.create_multi_dataset_config(), f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - with pytest.raises(ValueError, match="Both repo_id and config_name"): - vdb.get_fields(repo_id="BrentLab/repo1") - finally: - Path(config_path).unlink() - - def test_get_common_fields(self): - """Test getting fields common to all datasets.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - yaml.dump(self.create_multi_dataset_config(), f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - common = vdb.get_common_fields() - # Both datasets have carbon_source and temperature_celsius - assert "carbon_source" in common - assert "temperature_celsius" in common - # nitrogen_source is only in repo2 - assert "nitrogen_source" not in common - finally: - Path(config_path).unlink() - - -class TestCaching: - """Tests for view materialization and caching.""" - - def create_simple_config(self): - """Create simple config for testing.""" - return { - "factor_aliases": {}, - "repositories": { - "BrentLab/test_repo": { - "dataset": { - "test_dataset": { - "carbon_source": {"path": "media.carbon_source"} - } - } - } - }, - } - def test_invalidate_cache_all(self): - """Test invalidating all cache.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - yaml.dump(self.create_simple_config(), f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - # Manually add to cache - vdb.cache[("BrentLab/test_repo", "test_dataset")] = pd.DataFrame() - assert len(vdb.cache) == 1 - - vdb.invalidate_cache() - assert len(vdb.cache) == 0 - finally: - Path(config_path).unlink() - - def test_invalidate_cache_specific(self): - """Test invalidating specific dataset cache.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - yaml.dump(self.create_simple_config(), f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - # Add multiple entries to cache - vdb.cache[("BrentLab/test_repo", "test_dataset")] = pd.DataFrame() - vdb.cache[("BrentLab/other_repo", "other_dataset")] = pd.DataFrame() - assert len(vdb.cache) == 2 - - vdb.invalidate_cache([("BrentLab/test_repo", "test_dataset")]) - assert len(vdb.cache) == 1 - assert ("BrentLab/other_repo", "other_dataset") in vdb.cache - finally: - Path(config_path).unlink() - - -class TestFiltering: - """Tests for filter application logic.""" - - def test_apply_filters_exact_match(self): - """Test exact value matching in filters.""" - df = pd.DataFrame( - { - "sample_id": ["s1", "s2", "s3"], - "carbon_source": ["glucose", "galactose", "glucose"], - } +# ------------------------------------------------------------------ +# Tests: View registration +# ------------------------------------------------------------------ + + +class TestViewRegistration: + """Tests for lazy view creation.""" + + def test_raw_views_created(self, vdb): + """Test that raw per-dataset views exist.""" + views = vdb.tables() + assert "harbison" in views + assert "kemmeren" in views + # Comparative datasets only get _expanded, not a bare view + assert "dto" not in views + assert "dto_expanded" in views + + def test_raw_view_has_all_rows(self, vdb): + """Test raw view returns 
measurement-level data.""" + df = vdb.query("SELECT COUNT(*) AS n FROM harbison") + # 4 samples x 2 targets each = 8 rows + assert df["n"].iloc[0] == 8 + + def test_raw_view_has_measurement_columns(self, vdb): + """Test raw view includes measurement columns.""" + fields = vdb.get_fields("harbison") + assert "target_locus_tag" in fields + assert "effect" in fields + assert "pvalue" in fields + + def test_raw_view_has_condition_column(self, vdb): + """Test harbison raw view has condition and derived columns.""" + fields = vdb.get_fields("harbison") + assert "condition" in fields + # Derived columns are available via join to _meta + assert "carbon_source" in fields + assert "temperature_celsius" in fields + + def test_meta_views_created(self, vdb): + """Test that _meta views exist for primary datasets.""" + views = vdb.tables() + assert "harbison_meta" in views + assert "kemmeren_meta" in views + # Comparative datasets should NOT have _meta views + assert "dto_meta" not in views + + def test_meta_view_one_row_per_sample(self, vdb): + """Test _meta view has one row per sample_id.""" + df = vdb.query("SELECT COUNT(*) AS n FROM harbison_meta") + # 4 distinct samples + assert df["n"].iloc[0] == 4 + + def test_meta_view_excludes_measurement_columns(self, vdb): + """Test _meta view has only metadata columns.""" + fields = vdb.get_fields("harbison_meta") + assert "sample_id" in fields + assert "regulator_locus_tag" in fields + # Measurement columns should NOT be in _meta + assert "target_locus_tag" not in fields + assert "effect" not in fields + assert "pvalue" not in fields + + def test_meta_view_has_derived_carbon_source(self, vdb): + """Test harbison_meta has carbon_source from field+path.""" + fields = vdb.get_fields("harbison_meta") + assert "carbon_source" in fields + df = vdb.query( + "SELECT sample_id, carbon_source " "FROM harbison_meta ORDER BY sample_id" + ) + values = dict(zip(df["sample_id"], df["carbon_source"])) + # YPD -> D-glucose -> glucose (aliased) + assert values[1] == "glucose" + # Galactose -> D-galactose -> galactose (aliased) + assert values[2] == "galactose" + # Acid -> D-glucose -> glucose (aliased) + assert values[3] == "glucose" + # Unknown -> no definition -> missing_value_labels fallback + assert values[4] == "unspecified" + + def test_meta_view_has_derived_temperature(self, vdb): + """Test harbison_meta has temperature_celsius from field+path.""" + fields = vdb.get_fields("harbison_meta") + assert "temperature_celsius" in fields + df = vdb.query( + "SELECT DISTINCT temperature_celsius " + "FROM harbison_meta " + "WHERE temperature_celsius IS NOT NULL" ) + # Conditions with definitions have temperature_celsius=30; + # "Unknown" has no definition so gets NULL + assert len(df) == 1 + assert df["temperature_celsius"].iloc[0] == 30.0 + + def test_meta_view_has_field_rename(self, vdb): + """Test harbison_meta has environmental_condition alias.""" + fields = vdb.get_fields("harbison_meta") + assert "environmental_condition" in fields + df = vdb.query( + "SELECT DISTINCT environmental_condition " + "FROM harbison_meta ORDER BY environmental_condition" + ) + values = sorted(df["environmental_condition"].tolist()) + assert values == ["Acid", "Galactose", "Unknown", "YPD"] + + def test_meta_view_path_only_constant(self, vdb): + """Test kemmeren_meta has carbon_source from path-only.""" + fields = vdb.get_fields("kemmeren_meta") + assert "carbon_source" in fields + df = vdb.query("SELECT DISTINCT carbon_source FROM kemmeren_meta") + # Constant resolved from 
experimental_conditions + # D-glucose -> glucose (aliased) + assert len(df) == 1 + assert df["carbon_source"].iloc[0] == "glucose" + + def test_meta_view_path_only_numeric(self, vdb): + """Test kemmeren_meta has temperature_celsius as numeric.""" + df = vdb.query("SELECT DISTINCT temperature_celsius " "FROM kemmeren_meta") + assert len(df) == 1 + assert df["temperature_celsius"].iloc[0] == 30.0 + + def test_comparative_expanded_view(self, vdb): + """Test that dto_expanded view is created.""" + views = vdb.tables() + assert "dto_expanded" in views + + def test_expanded_view_has_parsed_columns(self, vdb): + """Test that expanded view has _source and _id columns.""" + df = vdb.query("SELECT * FROM dto_expanded LIMIT 1") + assert "binding_id_source" in df.columns + assert "binding_id_id" in df.columns + assert "perturbation_id_source" in df.columns + assert "perturbation_id_id" in df.columns + + def test_expanded_view_source_aliased(self, vdb): + """Test that _source columns use db_name aliases.""" + df = vdb.query("SELECT DISTINCT binding_id_source " "FROM dto_expanded") + assert "harbison" in df["binding_id_source"].tolist() + + def test_expanded_view_perturbation_source_aliased(self, vdb): + """Test that perturbation_id_source uses db_name alias.""" + df = vdb.query("SELECT DISTINCT perturbation_id_source " "FROM dto_expanded") + assert "kemmeren" in df["perturbation_id_source"].tolist() + + def test_expanded_view_id_values(self, vdb): + """Test that _id columns contain the sample_id component.""" + df = vdb.query( + "SELECT DISTINCT binding_id_id " "FROM dto_expanded ORDER BY binding_id_id" + ) + assert set(df["binding_id_id"]) == {"1", "2", "3"} - # Create minimal VirtualDB instance - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/test": { - "dataset": { - "test": {"carbon_source": {"path": "media.carbon_source"}} - } - } - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - filtered = vdb._apply_filters( - df, {"carbon_source": "glucose"}, "BrentLab/test", "test" - ) - assert len(filtered) == 2 - assert all(filtered["carbon_source"] == "glucose") - finally: - Path(config_path).unlink() - - def test_apply_filters_numeric_range(self): - """Test numeric range filtering.""" - df = pd.DataFrame( - {"sample_id": ["s1", "s2", "s3"], "temperature_celsius": [25, 30, 37]} + +# ------------------------------------------------------------------ +# Tests: Factor aliases in _meta views +# ------------------------------------------------------------------ + + +class TestFactorAliases: + """Tests that factor aliases are applied in _meta views.""" + + def test_alias_applied_in_meta(self, vdb): + """Test that aliases are applied at _meta level too.""" + df = vdb.query( + "SELECT DISTINCT carbon_source " "FROM harbison_meta ORDER BY carbon_source" ) + values = df["carbon_source"].tolist() + assert "glucose" in values + assert "galactose" in values + assert "D-glucose" not in values - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/test": { - "dataset": { - "test": { - "temperature_celsius": {"path": "temperature_celsius"} - } - } - } - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - - # Test >= operator - filtered = vdb._apply_filters( - df, {"temperature_celsius": (">=", 30)}, "BrentLab/test", "test" - ) - assert len(filtered) == 2 - assert 
all(filtered["temperature_celsius"] >= 30) - - # Test between operator - filtered = vdb._apply_filters( - df, - {"temperature_celsius": ("between", 28, 32)}, - "BrentLab/test", - "test", - ) - assert len(filtered) == 1 - assert filtered.iloc[0]["temperature_celsius"] == 30 - finally: - Path(config_path).unlink() - - def test_apply_filters_with_alias_expansion(self): - """Test filter with alias expansion.""" - df = pd.DataFrame( - { - "sample_id": ["s1", "s2", "s3"], - "carbon_source": ["glucose", "D-glucose", "galactose"], - } + +# ------------------------------------------------------------------ +# Tests: query() public API +# ------------------------------------------------------------------ + + +class TestQuery: + """Tests for the query() method.""" + + def test_raw_sql(self, vdb): + """Test basic SQL execution.""" + df = vdb.query("SELECT * FROM harbison WHERE sample_id = 1") + # 2 rows: sample 1 has two target measurements + assert len(df) == 2 + assert all(df["sample_id"] == 1) + + def test_parameterized_query(self, vdb): + """Test query with named parameters.""" + df = vdb.query( + "SELECT * FROM harbison WHERE sample_id = $sid", + sid=1, ) + # 2 rows: sample 1 has two target measurements + assert len(df) == 2 + assert all(df["sample_id"] == 1) + + def test_query_returns_dataframe(self, vdb): + """Test that query always returns a DataFrame.""" + df = vdb.query("SELECT 1 AS x") + assert isinstance(df, pd.DataFrame) - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "factor_aliases": { - "carbon_source": {"glucose": ["D-glucose", "dextrose", "glucose"]} - }, - "repositories": { - "BrentLab/test": { - "dataset": { - "test": {"carbon_source": {"path": "media.carbon_source"}} - } - } - }, - } - yaml.dump(config, f) - config_path = f.name - try: - vdb = VirtualDB(config_path) - filtered = vdb._apply_filters( - df, {"carbon_source": "glucose"}, "BrentLab/test", "test" - ) - # Should match both "glucose" and "D-glucose" due to alias expansion - assert len(filtered) == 2 - finally: - Path(config_path).unlink() +# ------------------------------------------------------------------ +# Tests: prepare() and prepared queries +# ------------------------------------------------------------------ -class TestExtraction: - """Tests for metadata extraction methods.""" +class TestPrepare: + """Tests for the prepare() method.""" - def test_add_field_metadata(self): - """Test adding field-level metadata to DataFrame.""" - df = pd.DataFrame({"sample_id": ["s1", "s2"], "condition": ["YPD", "YPG"]}) + def test_prepare_and_query(self, vdb): + """Test registering and using a prepared query.""" + vdb.prepare( + "by_condition", + "SELECT * FROM harbison " "WHERE condition = $cond", + ) + df = vdb.query("by_condition", cond="YPD") + # 2 rows: sample 1 with YPD has 2 targets + assert len(df) == 2 + assert all(df["condition"] == "YPD") + + def test_prepare_name_collision_with_view(self, vdb): + """Test that prepare rejects names colliding with views.""" + with pytest.raises(ValueError, match="collides with"): + vdb.prepare("harbison", "SELECT 1") + + def test_prepare_overwrite(self, vdb): + """Test that re-preparing the same name overwrites.""" + vdb.prepare("q1", "SELECT 1 AS x") + vdb.prepare("q1", "SELECT 2 AS x") + df = vdb.query("q1") + assert df["x"].iloc[0] == 2 + + +# ------------------------------------------------------------------ +# Tests: tables() and describe() +# ------------------------------------------------------------------ + + +class TestDiscovery: + 
"""Tests for tables(), describe(), get_fields().""" + + def test_tables_sorted(self, vdb): + """Test that tables() returns sorted view names.""" + views = vdb.tables() + assert views == sorted(views) + + def test_describe_single(self, vdb): + """Test describe for a single view.""" + df = vdb.describe("harbison") + assert "column_name" in df.columns + assert "column_type" in df.columns + assert "table" in df.columns + assert all(df["table"] == "harbison") + col_names = df["column_name"].tolist() + assert "sample_id" in col_names + assert "condition" in col_names + + def test_describe_all(self, vdb): + """Test describe for all views.""" + df = vdb.describe() + tables = df["table"].unique().tolist() + assert "harbison" in tables + assert "kemmeren" in tables + + def test_get_fields_single(self, vdb): + """Test get_fields for a specific view.""" + fields = vdb.get_fields("harbison") + assert "sample_id" in fields + assert "condition" in fields + assert fields == sorted(fields) + + def test_get_fields_all(self, vdb): + """Test get_fields across all views.""" + fields = vdb.get_fields() + assert "sample_id" in fields + # comparative fields + assert "dto_empirical_pvalue" in fields + + def test_get_common_fields(self, vdb): + """Test common fields across primary _meta views.""" + common = vdb.get_common_fields() + # Both harbison_meta and kemmeren_meta share these + assert "sample_id" in common + assert "carbon_source" in common + assert "temperature_celsius" in common + assert "regulator_locus_tag" in common + + +# ------------------------------------------------------------------ +# Tests: get_nested_value helper +# ------------------------------------------------------------------ + + +class TestGetNestedValue: + """Tests for the get_nested_value module-level helper.""" + + def test_simple_path(self): + from tfbpapi.virtual_db import get_nested_value + + data = {"media": {"name": "YPD"}} + assert get_nested_value(data, "media.name") == "YPD" - field_metadata = { - "YPD": {"carbon_source": ["glucose"], "growth_media": ["YPD"]}, - "YPG": {"carbon_source": ["glycerol"], "growth_media": ["YPG"]}, + def test_list_extraction(self): + from tfbpapi.virtual_db import get_nested_value + + data = { + "media": { + "carbon_source": [ + {"compound": "D-glucose"}, + ], + }, } + result = get_nested_value(data, "media.carbon_source.compound") + assert result == ["D-glucose"] - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/test": { - "dataset": { - "test": {"carbon_source": {"path": "media.carbon_source"}} - } - } - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - result = vdb._add_field_metadata(df, field_metadata) - - assert "carbon_source" in result.columns - assert "growth_media" in result.columns - assert ( - result.loc[result["condition"] == "YPD", "carbon_source"].iloc[0] - == "glucose" - ) - assert ( - result.loc[result["condition"] == "YPG", "carbon_source"].iloc[0] - == "glycerol" - ) - finally: - Path(config_path).unlink() + def test_missing_key(self): + from tfbpapi.virtual_db import get_nested_value + assert get_nested_value({"a": 1}, "b") is None -class TestQuery: - """Tests for query method - requires mocking HfQueryAPI.""" - - def test_query_empty_result(self): - """Test query with no matching datasets.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/test": { - "dataset": { - "test": 
{"carbon_source": {"path": "media.carbon_source"}} - } - } - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - # Query with non-configured dataset should return empty - result = vdb.query(datasets=[("BrentLab/other", "other")]) - assert isinstance(result, pd.DataFrame) - assert result.empty - finally: - Path(config_path).unlink() - - -class TestComparativeDatasets: - """Tests for comparative dataset field-based joins.""" - - def test_parse_composite_identifier(self): - """Test parsing composite identifiers.""" - composite_id = "BrentLab/harbison_2004;harbison_2004;sample_42" - repo, config, sample = VirtualDB._parse_composite_identifier(composite_id) - assert repo == "BrentLab/harbison_2004" - assert config == "harbison_2004" - assert sample == "sample_42" - - def test_parse_composite_identifier_invalid(self): - """Test that invalid composite IDs raise errors.""" - with pytest.raises(ValueError, match="Invalid composite ID format"): - VirtualDB._parse_composite_identifier("invalid:format") - - def test_get_comparative_fields_for_dataset(self): - """Test getting comparative fields mapping.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary": { - "dataset": { - "primary_data": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comparative", - "dataset": "comp_data", - "via_field": "binding_id", - } - ], - } - } - }, - "BrentLab/comparative": { - "dataset": { - "comp_data": { - "dto_fdr": {"field": "dto_fdr"}, - "dto_pvalue": {"field": "dto_empirical_pvalue"}, - } - } - }, - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - field_mapping = vdb._get_comparative_fields_for_dataset( - "BrentLab/primary", "primary_data" - ) - - # Should have dto_fdr and dto_pvalue, but NOT binding_id (via_field) - assert "dto_fdr" in field_mapping - assert "dto_pvalue" in field_mapping - assert "binding_id" not in field_mapping - - # Check mapping structure - assert field_mapping["dto_fdr"]["comp_repo"] == "BrentLab/comparative" - assert field_mapping["dto_fdr"]["comp_dataset"] == "comp_data" - assert field_mapping["dto_fdr"]["via_field"] == "binding_id" - finally: - Path(config_path).unlink() - - def test_get_comparative_fields_no_links(self): - """Test that datasets without comparative links return empty mapping.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary": { - "dataset": { - "primary_data": {"sample_id": {"field": "sample_id"}} + def test_deep_missing(self): + from tfbpapi.virtual_db import get_nested_value + + assert get_nested_value({"a": {"b": 1}}, "a.c") is None + + def test_non_dict_input(self): + from tfbpapi.virtual_db import get_nested_value + + assert get_nested_value("not a dict", "a.b") is None # type: ignore + + +# ------------------------------------------------------------------ +# Tests: edge cases +# ------------------------------------------------------------------ + + +class TestEdgeCases: + """Edge case and error handling tests.""" + + def test_no_parquet_files(self, tmp_path, monkeypatch): + """Test graceful handling when no parquet files are found.""" + config = { + "repositories": { + "BrentLab/empty": { + "dataset": { + "empty_data": { + "sample_id": {"field": "sample_id"}, } } } } + } + p = tmp_path / "config.yaml" + with open(p, "w") as f: yaml.dump(config, f) - config_path = 
f.name - - try: - vdb = VirtualDB(config_path) - field_mapping = vdb._get_comparative_fields_for_dataset( - "BrentLab/primary", "primary_data" - ) - assert field_mapping == {} - finally: - Path(config_path).unlink() - - def test_get_comparative_analyses(self): - """Test getting comparative analysis relationships.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary": { - "dataset": { - "primary_data": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comparative", - "dataset": "comp_data", - "via_field": "binding_id", - } - ], - } - } - }, - "BrentLab/comparative": { - "dataset": {"comp_data": {"dto_fdr": {"field": "dto_fdr"}}} - }, - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - info = vdb.get_comparative_analyses() - - # Check primary to comparative mapping - assert "BrentLab/primary/primary_data" in info["primary_to_comparative"] - links = info["primary_to_comparative"]["BrentLab/primary/primary_data"] - assert len(links) == 1 - assert links[0]["comparative_repo"] == "BrentLab/comparative" - assert links[0]["comparative_dataset"] == "comp_data" - assert links[0]["via_field"] == "binding_id" - - # Check comparative fields - assert "BrentLab/comparative/comp_data" in info["comparative_fields"] - assert ( - "dto_fdr" - in info["comparative_fields"]["BrentLab/comparative/comp_data"] - ) - finally: - Path(config_path).unlink() - - def test_get_comparative_analyses_filtered(self): - """Test filtering comparative analyses by repo and config.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: - config = { - "repositories": { - "BrentLab/primary1": { - "dataset": { - "data1": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comp", - "dataset": "comp_data", - "via_field": "id1", - } - ], - } - } - }, - "BrentLab/primary2": { - "dataset": { - "data2": { - "sample_id": {"field": "sample_id"}, - "comparative_analyses": [ - { - "repo": "BrentLab/comp", - "dataset": "comp_data", - "via_field": "id2", - } - ], - } - } - }, - } - } - yaml.dump(config, f) - config_path = f.name - - try: - vdb = VirtualDB(config_path) - # Get all - all_info = vdb.get_comparative_analyses() - assert len(all_info["primary_to_comparative"]) == 2 + v = VirtualDB(p) - # Filter by repo and config - filtered = vdb.get_comparative_analyses("BrentLab/primary1", "data1") - assert len(filtered["primary_to_comparative"]) == 1 - assert "BrentLab/primary1/data1" in filtered["primary_to_comparative"] + def _fake_resolve(self, repo_id, config_name): + return [] - # Filter by repo only - repo_filtered = vdb.get_comparative_analyses("BrentLab/primary2") - assert len(repo_filtered["primary_to_comparative"]) == 1 - assert "BrentLab/primary2/data2" in repo_filtered["primary_to_comparative"] - finally: - Path(config_path).unlink() + monkeypatch.setattr(VirtualDB, "_resolve_parquet_files", _fake_resolve) + # Should not raise; just have no views + views = v.tables() + assert "empty_data" not in views -# Note: Full integration tests with real HuggingFace datasets would go here -# but are excluded as they require network access and specific test datasets. -# These tests cover the core logic and would be supplemented with integration -# tests using the actual sample config and real datasets like harbison_2004. 
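+    def test_lazy_false_with_views_registered_rejected(self, config_path):
+        """Test that views_registered=True with lazy=False is rejected.
+
+        Illustrative addition: exercises the ValueError documented in
+        VirtualDB.__init__ for this parameter combination.
+        """
+        with pytest.raises(ValueError, match="views_registered"):
+            VirtualDB(config_path, views_registered=True, lazy=False)
+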
+    def test_lazy_init(self, config_path):
+        """Test that DuckDB connection is not created until needed."""
+        v = VirtualDB(config_path)
+        assert v._conn is None
+        assert not v._views_registered
diff --git a/tfbpapi/virtual_db.py b/tfbpapi/virtual_db.py
index f6dd12e..4fd6f3c 100644
--- a/tfbpapi/virtual_db.py
+++ b/tfbpapi/virtual_db.py
@@ -1,39 +1,48 @@
 """
-VirtualDB provides a unified query interface across heterogeneous datasets.
-
-This module enables cross-dataset queries with standardized field names and values,
-mapping varying experimental condition structures to a common schema through external
-YAML configuration.
-
-Key Components:
-- VirtualDB: Main interface for unified cross-dataset queries
-- Helper functions: get_nested_value(), normalize_value() for metadata extraction
-- Configuration-driven schema via models.MetadataConfig
-
-Example Usage:
-    >>> from tfbpapi.datainfo import VirtualDB
-    >>> vdb = VirtualDB("config.yaml")
-    >>>
-    >>> # Discover available fields
-    >>> fields = vdb.get_fields()
-    >>> print(fields)  # ["carbon_source", "temperature_celsius", ...]
-    >>>
-    >>> # Query across datasets
-    >>> df = vdb.query(
-    ...     filters={"carbon_source": "glucose", "temperature_celsius": 30},
-    ...     fields=["sample_id", "carbon_source", "temperature_celsius"]
-    ... )
-    >>>
-    >>> # Get complete data with measurements
-    >>> df = vdb.query(
-    ...     filters={"carbon_source": "glucose"},
-    ...     complete=True
-    ... )
+VirtualDB provides a SQL query interface across heterogeneous datasets.
+
+A developer creates HuggingFace repos with datacards; the datacard specification
+used by tfbpapi is documented at
+https://brentlab.github.io/tfbpapi/huggingface_datacard/. The developer then writes
+a VirtualDB configuration file that describes which HuggingFace repos and datasets
+to use, a set of common fields, any datasets that contain comparative analytics,
+and more. VirtualDB then uses DuckDB to construct tables and views, which are
+lazily created over Parquet files that are cached locally. VirtualDB uses the
+information in the datacard to create metadata views that describe sample-level
+features. Derived columns are attached to both the metadata and full data views.
+Any comparative analysis datasets are also parsed and joined to the primary
+datasets' metadata views. The expectation is that a developer will use this
+interface to write SQL queries against the views to provide an API to downstream
+users and applications.
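+
+A minimal configuration sketch (keys mirror those used in this package's
+test fixtures; see the datacard documentation above for the full schema)::
+
+    repositories:
+      BrentLab/harbison:
+        dataset:
+          harbison_2004:
+            db_name: harbison
+            sample_id:
+              field: sample_id
+            carbon_source:
+              field: condition
+              path: media.carbon_source.compound
+    factor_aliases:
+      carbon_source:
+        glucose: [D-glucose, dextrose]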
+
+Example Usage::
+
+    from tfbpapi.virtual_db import VirtualDB
+
+    vdb = VirtualDB("config.yaml", token=token)
+
+    # Discover views
+    vdb.tables()
+    vdb.describe("harbison")
+
+    # Raw SQL
+    df = vdb.query("SELECT * FROM harbison WHERE sample_id = 42")
+
+    # Parameterized SQL
+    df = vdb.query(
+        "SELECT * FROM harbison_meta WHERE carbon_source = $cs",
+        cs="glucose",
+    )
+
+    # Prepared queries
+    vdb.prepare("sig", "SELECT * FROM harbison_meta LIMIT $n")
+    df = vdb.query("sig", n=10)
 """

 from __future__ import annotations

+import logging
+from functools import lru_cache
 from pathlib import Path
 from typing import Any

@@ -41,1305 +50,1046 @@
+import duckdb
 import pandas as pd

 from tfbpapi.datacard import DataCard
-from tfbpapi.errors import DataCardError
-from tfbpapi.hf_cache_manager import HfCacheManager
-from tfbpapi.models import MetadataConfig, PropertyMapping
+from tfbpapi.models import MetadataConfig
+
+logger = logging.getLogger(__name__)


-def get_nested_value(data: dict, path: str) -> Any:
+def get_nested_value(data: dict | list, path: str) -> Any:
     """
     Navigate nested dict/list using dot notation.

     Handles missing intermediate keys gracefully by returning None.
-    Supports extracting properties from lists of dicts.
+    When an intermediate value is a list of dicts, extracts the
+    remaining path from each item and returns a list of results.

-    :param data: Dictionary to navigate
+    :param data: Dictionary or list of dicts to navigate
     :param path: Dot-separated path (e.g., "media.carbon_source.compound")
-    :return: Value at path or None if not found
-
-    Examples:
-        Simple nested dict:
-            get_nested_value({"media": {"name": "YPD"}}, "media.name")
-            Returns: "YPD"
-
-        List of dicts - extract property from each item:
-            get_nested_value(
-                {"media": {"carbon_source": [{"compound": "glucose"},
-                                             {"compound": "galactose"}]}},
-                "media.carbon_source.compound"
-            )
-            Returns: ["glucose", "galactose"]
+    :return: Value at path, list of values, or None if not found
+
+    :raises TypeError: If an unexpected type is encountered during navigation of the
+        dict/list structure according to the provided path.
+
+    Example -- dict input::
+
+        >>> get_nested_value({"media": {"name": "YPD"}}, "media.name")
+        'YPD'
+
+    Example -- list-of-dicts at an intermediate node::
+
+        >>> data = {
+        ...     "media": {
+        ...         "carbon_source": [
+        ...             {"compound": "glucose"},
+        ...         ]
+        ...     }
+        ...
} + >>> get_nested_value(data, "media.carbon_source.compound") + ['glucose'] """ - if not isinstance(data, dict): + if not isinstance(data, (dict, list)): return None + # If top-level data is a list, extract path from each item + if isinstance(data, list): + results = [] + for item in data: + if isinstance(item, dict): + val = get_nested_value(item, path) + if val is not None: + results.append(val) + return results if results else None + keys = path.split(".") current = data for i, key in enumerate(keys): if isinstance(current, dict): if key not in current: + logger.warning( + "Key '%s' not found at path '%s' (current keys: %s)", + key, + ".".join(keys[: i + 1]), + list(current.keys()), + ) return None current = current[key] elif isinstance(current, list): - # If current is a list and we have more keys, - # extract property from each item - if i < len(keys): - # Extract the remaining path from each list item - remaining_path = ".".join(keys[i:]) - results = [] - for item in current: - if isinstance(item, dict): - val = get_nested_value(item, remaining_path) - if val is not None: - results.append(val) - return results if results else None + # Extract the remaining path from each list item + remaining_path = ".".join(keys[i:]) + results = [] + for item in current: + if isinstance(item, dict): + val = get_nested_value(item, remaining_path) + if val is not None: + results.append(val) + return results if results else None else: - return None + error_msg = ( + f"Unexpected type '{type(current).__name__}' at " + f"path '{'.'.join(keys[:i])}'; expected dict or " + f"list of dicts" + ) + logger.error(error_msg) + raise TypeError(error_msg) return current -def normalize_value( - actual_value: Any, - aliases: dict[str, list[Any]] | None, - missing_value_label: str | None = None, -) -> str: +@lru_cache(maxsize=32) +def _cached_datacard(repo_id: str, token: str | None = None) -> Any: """ - Normalize a value using optional alias mappings (case-insensitive). - - Returns the alias name if a match is found, otherwise returns the - original value as a string. Handles missing values by returning - the configured missing_value_label. - - :param actual_value: The value from the data to normalize - :param aliases: Optional dict mapping alias names to lists of actual values. - Example: {"glucose": ["D-glucose", "dextrose"]} - :param missing_value_label: Label to use for None/missing values - :return: Alias name if match found, missing_value_label if None, - otherwise str(actual_value) + Return a cached DataCard instance. 
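+
+    Results are memoized via ``functools.lru_cache`` keyed on the
+    ``(repo_id, token)`` argument pair, so repeated view registration
+    reuses a single DataCard per repository.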
-    Examples:
-        With aliases - exact match:
-            normalize_value("D-glucose", {"glucose": ["D-glucose", "dextrose"]})
-            Returns: "glucose"
-
-        With aliases - case-insensitive match:
-            normalize_value("DEXTROSE", {"glucose": ["D-glucose", "dextrose"]})
-            Returns: "glucose"
-
-        Missing value:
-            normalize_value(None, None, "unspecified")
-            Returns: "unspecified"
-
-        No alias match - pass through:
-            normalize_value("maltose", {"glucose": ["D-glucose"]})
-            Returns: "maltose"
+    :param repo_id: HuggingFace repository ID
+    :param token: Optional HuggingFace token
+    :return: DataCard instance
     """
-    # Handle None/missing values
-    if actual_value is None:
-        return missing_value_label if missing_value_label else "None"
-
-    if aliases is None:
-        return str(actual_value)
-
-    # Convert to string for comparison (case-insensitive)
-    actual_str = str(actual_value).lower()
-
-    # Check each alias mapping
-    for alias_name, actual_values in aliases.items():
-        for val in actual_values:
-            if str(val).lower() == actual_str:
-                return alias_name
-
-    # No match found - pass through original value
-    return str(actual_value)
+    return DataCard(repo_id, token=token)


 class VirtualDB:
     """
-    Unified query interface across heterogeneous datasets.
-
-    VirtualDB provides a virtual database layer over multiple HuggingFace datasets,
-    allowing cross-dataset queries with standardized field names and normalized values.
-    Each configured dataset becomes a view with a common schema defined by external
-    YAML configuration.
+    A query interface across heterogeneous datasets.

-    The YAML configuration specifies:
-    1. Property mappings: How to extract each field from dataset structures
-    2. Factor aliases: Normalize varying terminologies to standard values
-    3. Missing value labels: Handle missing data consistently
-    4. Descriptions: Document each field's semantics
+    DuckDB views are lazily registered over Parquet files on first
+    ``query()`` call. The user writes SQL against named views.

-    Attributes:
-        config: MetadataConfig instance with all configuration
-        token: Optional HuggingFace token for private datasets
-        cache: Dict mapping (repo_id, config_name) to cached DataFrame views
+    :ivar config: Validated MetadataConfig
+    :ivar token: Optional HuggingFace token
     """

-    def __init__(self, config_path: Path | str, token: str | None = None):
+    def __init__(
+        self,
+        config_path: Path | str,
+        token: str | None = None,
+        duckdb_connection: duckdb.DuckDBPyConnection | None = None,
+        views_registered: bool = False,
+        lazy: bool = True,
+    ):
         """
-        Initialize VirtualDB with configuration and optional auth token.
+        Initialize VirtualDB with configuration.

         :param config_path: Path to YAML configuration file
         :param token: Optional HuggingFace token for private datasets
-        :raises FileNotFoundError: If config file doesn't exist
-        :raises ValueError: If configuration is invalid
+        :param duckdb_connection: Optional DuckDB connection. If provided, views will
+            be registered on this connection instead of a new in-memory database.
+            Note that this makes it possible to use a persistent database file. If
+            this is not provided, the DuckDB connection is in-memory.
+        :param views_registered: If True, skip view registration (assumes views are
+            already registered on the provided duckdb_connection). This is useful when
+            reusing a connection across multiple VirtualDB instances with the same
+            config.
+        :param lazy: If True, delay DuckDB connection and view registration until the
+            first query. Set to False to register views immediately on initialization.
+            This is intended for use when creating a persistent DuckDB connection. If
+            the views are registered immediately on initialization, then any other
+            VirtualDB instance initialized with the same DuckDB connection and config
+            will find the views already registered and available for querying.
+        :raises FileNotFoundError: If config file does not exist
+        :raises ValueError: If configuration is invalid or if views_registered=True is
+            set when lazy=False
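+
+        Example -- sharing one persistent connection across instances (a
+        sketch; the database file name is arbitrary)::
+
+            conn = duckdb.connect("tfbp.duckdb")
+            writer = VirtualDB("config.yaml", duckdb_connection=conn, lazy=False)
+            reader = VirtualDB(
+                "config.yaml", duckdb_connection=conn, views_registered=True
+            )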
        """
+        if not lazy and views_registered:
+            raise ValueError(
+                "Cannot set views_registered=True when lazy=False. "
+                "If lazy=False, views will be registered immediately on initialization."
+            )
         self.config = MetadataConfig.from_yaml(config_path)
         self.token = token
-        self.cache: dict[tuple[str, str], pd.DataFrame] = {}
-        # Build mapping of comparative dataset references
-        self._comparative_links = self._build_comparative_links()
-
-    def get_fields(
-        self, repo_id: str | None = None, config_name: str | None = None
-    ) -> list[str]:
-        """
-        Get list of queryable fields.
-        :param repo_id: Optional repository ID to filter to specific dataset
-        :param config_name: Optional config name (required if repo_id provided)
-        :return: List of field names
+        # If no connection is provided, do not create one yet; the connection is
+        # created on demand by self._ensure_sql_views().
+        self._conn: duckdb.DuckDBPyConnection | None = duckdb_connection
+        self._views_registered = views_registered

-        Examples:
-            All fields across all datasets:
-                fields = vdb.get_fields()
+        # db_name -> (repo_id, config_name)
+        self._db_name_map = self._build_db_name_map()

-            Fields for specific dataset:
-                fields = vdb.get_fields("BrentLab/harbison_2004", "harbison_2004")
+        # Prepared queries: name -> sql
+        self._prepared_queries: dict[str, str] = {}

-        """
-        if repo_id is not None and config_name is not None:
-            # Get fields for specific dataset
-            mappings = self.config.get_property_mappings(repo_id, config_name)
-            return sorted(mappings.keys())
+        # If not lazy, create the DuckDB connection and register views immediately.
+        if not lazy:
+            self._ensure_sql_views()

-        if repo_id is not None or config_name is not None:
-            raise ValueError(
-                "Both repo_id and config_name must be provided, or neither"
-            )
+    @property
+    def _db(self) -> duckdb.DuckDBPyConnection:
+        """Return the DuckDB connection, asserting it is initialized."""
+        assert self._conn is not None, (
+            "DuckDB connection not initialized. " "Call _ensure_sql_views() first."
+ ) + return self._conn - # Get all fields across all datasets - all_fields: set[str] = set() - for repo_id, repo_config in self.config.repositories.items(): - # Add repo-wide fields - all_fields.update(repo_config.properties.keys()) - # Add dataset-specific fields - if repo_config.dataset: - for dataset_config in repo_config.dataset.values(): - # DatasetVirtualDBConfig stores property mappings in model_extra - if ( - hasattr(dataset_config, "model_extra") - and dataset_config.model_extra - ): - all_fields.update(dataset_config.model_extra.keys()) - # Also include special fields if they exist - if dataset_config.sample_id: - all_fields.add("sample_id") - - return sorted(all_fields) + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ - def get_common_fields(self) -> list[str]: + def query(self, sql: str, **params: Any) -> pd.DataFrame: """ - Get fields present in ALL configured datasets. + Execute SQL or a prepared query and return a DataFrame. - :return: List of field names common to all datasets + If *sql* matches a registered prepared-query name the stored + SQL template is used instead. Keyword arguments are passed as + named parameters to DuckDB. - Example: - common = vdb.get_common_fields() - # ["carbon_source", "temperature_celsius"] + :param sql: Raw SQL string **or** name of a prepared query + :param params: Named parameters (DuckDB ``$name`` syntax) + :return: Query result as a pandas DataFrame - """ - if not self.config.repositories: - return [] + Examples:: - # Get field sets for each dataset - dataset_fields: list[set[str]] = [] - for repo_id, repo_config in self.config.repositories.items(): - if repo_config.dataset: - for config_name in repo_config.dataset.keys(): - mappings = self.config.get_property_mappings(repo_id, config_name) - dataset_fields.append(set(mappings.keys())) + # Raw SQL + df = vdb.query("SELECT * FROM harbison LIMIT 5") - if not dataset_fields: - return [] + # With parameters + df = vdb.query( + "SELECT * FROM harbison_meta WHERE carbon_source = $cs", + cs="glucose", + ) - # Return intersection - common = set.intersection(*dataset_fields) - return sorted(common) + # Prepared query + vdb.prepare("top", "SELECT * FROM harbison_meta LIMIT $n") + df = vdb.query("top", n=10) - def get_unique_values( - self, field: str, by_dataset: bool = False - ) -> list[str] | dict[str, list[str]]: """ - Get unique values for a field across datasets (with normalization). - - :param field: Field name to get values for - :param by_dataset: If True, return dict keyed by dataset identifier - :return: List of unique normalized values, or dict if by_dataset=True - - Examples: - All unique values: - values = vdb.get_unique_values("carbon_source") - # ["glucose", "galactose", "raffinose"] - - Values by dataset: - values = vdb.get_unique_values("carbon_source", by_dataset=True) - # {"BrentLab/harbison_2004": ["glucose", "galactose"], - # "BrentLab/kemmeren_2014": ["glucose", "raffinose"]} - + self._ensure_sql_views() + + # param `sql` may be a prepared query name, a raw sql statement, or + # a parameterized sql statement that is not prepared. If it exists as a key + # in the _prepared_queries dict, we use the prepared sql. Otherwise, we + # use the sql as passed to query(). 
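+        # e.g. after vdb.prepare("top", "SELECT * FROM harbison_meta LIMIT $n"),
+        # query("top", n=5) resolves to the stored template, while a raw
+        # statement such as query("SELECT 1") falls through unchanged.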
+ resolved = self._prepared_queries.get(sql, sql) + if params: + return self._db.execute(resolved, params).fetchdf() + return self._db.execute(resolved).fetchdf() + + def prepare(self, name: str, sql: str, overwrite: bool = False) -> None: """ - if by_dataset: - result: dict[str, list[str]] = {} - else: - all_values: set[str] = set() - - # Query each dataset that has this field - for repo_id, repo_config in self.config.repositories.items(): - if repo_config.dataset: - for config_name in repo_config.dataset.keys(): - mappings = self.config.get_property_mappings(repo_id, config_name) - if field not in mappings: - continue - - # Build metadata table for this dataset - metadata_df = self._build_metadata_table(repo_id, config_name) - if metadata_df.empty or field not in metadata_df.columns: - continue - - # Get unique values (already normalized) - unique_vals = metadata_df[field].dropna().unique().tolist() - - if by_dataset: - dataset_key = f"{repo_id}/{config_name}" - result[dataset_key] = sorted(unique_vals) - else: - all_values.update(unique_vals) - - if by_dataset: - return result - else: - return sorted(all_values) + Register a named parameterized query for later use. + + Parameters use DuckDB ``$name`` syntax. Re-preparing an + existing query name silently replaces its SQL. + + :param name: Query name (must not collide with a view name) + :param sql: SQL template with ``$name`` parameters + :param overwrite: If True, allow *name* even if it collides + with an existing view + :raises ValueError: If *name* collides with an existing view + and ``overwrite`` is False + + Example:: + + vdb.prepare("glucose_regs", ''' + SELECT regulator_symbol, COUNT(*) AS n + FROM harbison_meta + WHERE carbon_source = $cs + GROUP BY regulator_symbol + HAVING n >= $min_n + ''') + df = vdb.query("glucose_regs", cs="glucose", min_n=2) - def get_comparative_analyses( - self, repo_id: str | None = None, config_name: str | None = None - ) -> dict[str, Any]: """ - Get information about comparative analysis relationships. - - Returns information about which comparative datasets are available - and how they link to primary datasets. Useful for discovering - what cross-dataset analyses can be performed. - - :param repo_id: Optional repository ID to filter to specific repo - :param config_name: Optional config name (requires repo_id) - :return: Dictionary with two keys: - - "primary_to_comparative": Maps primary datasets to their - comparative analyses - - "comparative_fields": Maps comparative datasets to fields - available for joining - :raises ValueError: If config_name provided without repo_id - - Examples: - Get all comparative analysis relationships: - info = vdb.get_comparative_analyses() - - Get relationships for specific primary dataset: - info = vdb.get_comparative_analyses( - "BrentLab/callingcards", "annotated_features" - ) + self._ensure_sql_views() + if name in self._list_views() and not overwrite: + error_msg = ( + f"Prepared-query name '{name}' collides with " + f"an existing view. Choose a different name or set " + f"overwrite=True."
+ ) + logger.error(error_msg) + raise ValueError(error_msg) + self._prepared_queries[name] = sql + def tables(self) -> list[str]: """ - if config_name and not repo_id: - raise ValueError("repo_id required when config_name is specified") - - primary_to_comparative: dict[str, list[dict[str, str]]] = {} - comparative_fields: dict[str, list[str]] = {} - - # Filter links based on parameters - if repo_id and config_name: - # Specific dataset requested - links_to_process = { - (repo_id, config_name): self._comparative_links.get( - (repo_id, config_name), {} - ) - } - elif repo_id: - # All configs in specific repo - links_to_process = { - k: v for k, v in self._comparative_links.items() if k[0] == repo_id - } - else: - # All links - links_to_process = self._comparative_links + Return sorted list of registered view names. - # Build primary to comparative mapping - for (prim_repo, prim_config), link_info in links_to_process.items(): - if "comparative_analyses" not in link_info: - continue - - dataset_key = f"{prim_repo}/{prim_config}" - primary_to_comparative[dataset_key] = [] + :return: Sorted list of view names - for ca in link_info["comparative_analyses"]: - primary_to_comparative[dataset_key].append( - { - "comparative_repo": ca["repo"], - "comparative_dataset": ca["dataset"], - "via_field": ca["via_field"], - } - ) + """ + self._ensure_sql_views() + return sorted(self._list_views()) - # Track which fields are available from comparative datasets - comp_key = f"{ca['repo']}/{ca['dataset']}" - if comp_key not in comparative_fields: - # Get fields from the comparative dataset - # First try config mappings - comp_fields = self.get_fields(ca["repo"], ca["dataset"]) - - # If no mappings, get actual fields from DataCard - if not comp_fields: - try: - card = DataCard(ca["repo"], token=self.token) - config = card.get_config(ca["dataset"]) - if config and config.dataset_info: - comp_fields = [ - f.name for f in config.dataset_info.features - ] - except Exception: - comp_fields = [] - - comparative_fields[comp_key] = comp_fields - - return { - "primary_to_comparative": primary_to_comparative, - "comparative_fields": comparative_fields, - } - - def query( - self, - filters: dict[str, Any] | None = None, - datasets: list[tuple[str, str]] | None = None, - fields: list[str] | None = None, - complete: bool = False, - ) -> pd.DataFrame: + def describe(self, table: str | None = None) -> pd.DataFrame: """ - Query VirtualDB with optional filters and field selection. - - :param filters: Dict of field:value pairs to filter on - :param datasets: List of (repo_id, config_name) tuples to query (None = all) - :param fields: List of field names to return (None = all) - :param complete: If True, return measurement-level data; if False, sample-level - :return: DataFrame with query results - - Examples: - Basic query across all datasets: - df = vdb.query(filters={"carbon_source": "glucose"}) - - Query specific datasets with field selection: - df = vdb.query( - filters={"carbon_source": "glucose", "temperature_celsius": 30}, - datasets=[("BrentLab/harbison_2004", "harbison_2004")], - fields=["sample_id", "carbon_source", "temperature_celsius"] - ) + Describe column names and types for one or all views. 
- Complete data with measurements: - df = vdb.query( - filters={"carbon_source": "glucose"}, - complete=True - ) + :param table: View name, or None for all views + :return: DataFrame with columns ``table``, ``column_name``, + ``column_type`` """ - # Determine which datasets to query - if datasets is None: - # Query all configured datasets - datasets = [] - for repo_id, repo_config in self.config.repositories.items(): - if repo_config.dataset: - for config_name in repo_config.dataset.keys(): - datasets.append((repo_id, config_name)) - - if not datasets: - return pd.DataFrame() - - # Query each dataset - results: list[pd.DataFrame] = [] - for repo_id, config_name in datasets: - # Build metadata table - metadata_df = self._build_metadata_table(repo_id, config_name) - if metadata_df.empty: - continue + self._ensure_sql_views() + if table is not None: + df = self._db.execute(f"DESCRIBE {table}").fetchdf() + df.insert(0, "table", table) + return df - # Separate filters into primary and comparative - primary_filters = {} - comparative_filters = {} - if filters: - # Get comparative field mapping - comp_field_mapping = self._get_comparative_fields_for_dataset( - repo_id, config_name - ) - for field, value in filters.items(): - if field in comp_field_mapping: - comparative_filters[field] = value - else: - primary_filters[field] = value - - # Apply primary filters first - if primary_filters: - metadata_df = self._apply_filters( - metadata_df, primary_filters, repo_id, config_name - ) + frames = [] + for view in sorted(self._list_views()): + df = self._db.execute(f"DESCRIBE {view}").fetchdf() + df.insert(0, "table", view) + frames.append(df) + if not frames: + return pd.DataFrame(columns=["table", "column_name", "column_type"]) + return pd.concat(frames, ignore_index=True) - # Enrich with comparative data if needed - # IMPORTANT: Do this BEFORE getting complete data so comparative fields - # are joined at the sample level, not measurement level - # This happens when: fields are requested from comparative datasets - # OR when filtering on comparative fields - if fields or comparative_filters: - comp_field_mapping = self._get_comparative_fields_for_dataset( - repo_id, config_name - ) - if fields: - requested_comp_fields = [ - f for f in fields if f in comp_field_mapping - ] - # Also need fields that are filtered on - filtered_comp_fields = [ - f for f in comparative_filters.keys() if f in comp_field_mapping - ] - all_comp_fields = list( - set(requested_comp_fields + filtered_comp_fields) - ) - if all_comp_fields: - metadata_df = self._enrich_with_comparative_data( - metadata_df, repo_id, config_name, all_comp_fields - ) - - # Apply comparative filters after enrichment - if comparative_filters: - metadata_df = self._apply_filters( - metadata_df, comparative_filters, repo_id, config_name - ) - - # If complete=True, join with full data - # Do this AFTER comparative enrichment so DTO fields are already added - if complete: - sample_ids = metadata_df["sample_id"].tolist() - if sample_ids: - full_df = self._get_complete_data( - repo_id, config_name, sample_ids, metadata_df - ) - if not full_df.empty: - metadata_df = full_df - - # Select requested fields - if fields: - # Keep sample_id and any dataset identifier columns - keep_cols = ["sample_id"] - if "dataset_id" in metadata_df.columns: - keep_cols.append("dataset_id") - # Add requested fields that exist - for field in fields: - if field in metadata_df.columns and field not in keep_cols: - keep_cols.append(field) - metadata_df = metadata_df[keep_cols].copy() 
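A quick sketch of these inspection helpers in use; the view names are assumptions carried over from the tutorial configuration:

```python
# Public views only; internal "__"-prefixed views are hidden.
print(vdb.tables())
# e.g. ['dto_expanded', 'harbison_2004', 'harbison_2004_meta']

# Columns and DuckDB types for a single view...
print(vdb.describe("harbison_2004_meta"))

# ...or for every registered view at once.
schemas = vdb.describe()
print(schemas[["table", "column_name", "column_type"]].head())
```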
- - # Add dataset identifier - if "dataset_id" not in metadata_df.columns: - metadata_df = metadata_df.copy() - metadata_df["dataset_id"] = f"{repo_id}/{config_name}" - - results.append(metadata_df) - - if not results: - return pd.DataFrame() - - # Concatenate results, filling NaN for missing columns - return pd.concat(results, ignore_index=True, sort=False) - - def materialize_views(self, datasets: list[tuple[str, str]] | None = None) -> None: + def get_fields(self, table: str | None = None) -> list[str]: """ - Build and cache metadata DataFrames for faster subsequent queries. - - :param datasets: List of (repo_id, config_name) tuples to materialize - (None = materialize all) + Return column names for a view or all unique columns. - Example: - vdb.materialize_views() # Cache all datasets - vdb.materialize_views([("BrentLab/harbison_2004", "harbison_2004")]) + :param table: View name, or None for all views + :return: Sorted list of column names """ - if datasets is None: - # Materialize all configured datasets - datasets = [] - for repo_id, repo_config in self.config.repositories.items(): - if repo_config.dataset: - for config_name in repo_config.dataset.keys(): - datasets.append((repo_id, config_name)) - - for repo_id, config_name in datasets: - # Build and cache - self._build_metadata_table(repo_id, config_name, use_cache=False) - - def invalidate_cache(self, datasets: list[tuple[str, str]] | None = None) -> None: + self._ensure_sql_views() + if table is not None: + cols = self._db.execute( + f"SELECT column_name FROM information_schema.columns " + f"WHERE table_name = '{table}'" + ).fetchdf() + return sorted(cols["column_name"].tolist()) + + all_cols: set[str] = set() + for view in self._list_views(): + cols = self._db.execute( + f"SELECT column_name FROM information_schema.columns " + f"WHERE table_name = '{view}'" + ).fetchdf() + all_cols.update(cols["column_name"].tolist()) + return sorted(all_cols) + + def get_common_fields(self) -> list[str]: """ - Clear cached metadata DataFrames. + Return columns present in ALL primary ``_meta`` views. - :param datasets: List of (repo_id, config_name) tuples to invalidate - (None = invalidate all) + Primary dataset views are those without ``links`` in their + config (i.e. not comparative datasets). - Example: - vdb.invalidate_cache() # Clear all cache - vdb.invalidate_cache([("BrentLab/harbison_2004", "harbison_2004")]) + :return: Sorted list of common column names """ - if datasets is None: - self.cache.clear() - else: - for dataset_key in datasets: - if dataset_key in self.cache: - del self.cache[dataset_key] + self._ensure_sql_views() + meta_views = self._get_primary_meta_view_names() + if not meta_views: + return [] - def _build_comparative_links(self) -> dict[tuple[str, str], dict[str, Any]]: - """ - Build mapping of primary datasets to their comparative dataset references. 
+ sets = [] + for view in meta_views: + cols = self._db.execute( + f"SELECT column_name FROM information_schema.columns " + f"WHERE table_name = '{view}'" + ).fetchdf() + sets.append(set(cols["column_name"].tolist())) - """ - links: dict[tuple[str, str], dict[str, Any]] = {} + common = set.intersection(*sets) + return sorted(common) - for repo_id, repo_config in self.config.repositories.items(): - if not repo_config.dataset: + # ------------------------------------------------------------------ + # Lazy initialisation + # ------------------------------------------------------------------ + + def _ensure_sql_views(self) -> None: + """Create the DuckDB connection (if needed) and register all views on first call.""" + if self._views_registered: + return + # Respect a caller-provided connection; only create one if none exists. + if self._conn is None: + self._conn = duckdb.connect(":memory:") + self._register_all_views() + self._views_registered = True + + def _register_all_views(self) -> None: + """Orchestrate view registration in dependency order.""" + # 1. Raw per-dataset views (internal __<db_name>_parquet + # plus public <db_name> for primary datasets only) + for db_name, (repo_id, config_name) in self._db_name_map.items(): + comparative = self._is_comparative(repo_id, config_name) + self._register_raw_view( + db_name, + repo_id, + config_name, + parquet_only=comparative, + ) - continue + # 2. Metadata views for primary datasets (<db_name>_meta) + # This is based on the metadata defined in the datacard, + # and includes any additional derived columns based on the + # VirtualDB config passed in at initialization. Note that + # this is joined onto the raw view in the next step. + for db_name, (repo_id, config_name) in self._db_name_map.items(): + if not self._is_comparative(repo_id, config_name): + self._register_meta_view(db_name, repo_id, config_name) + + # 3. Replace primary raw views with a join to <db_name>_meta so + # derived columns (e.g. carbon_source) are available + for db_name, (repo_id, config_name) in self._db_name_map.items(): + if not self._is_comparative(repo_id, config_name): + self._enrich_raw_view(db_name) + + # 4. Comparative expanded views (pre-parsed composite IDs) + # These build directly on __<db_name>_parquet since + # comparative datasets have no _meta or enriched raw view. + for db_name, (repo_id, config_name) in self._db_name_map.items(): + repo_cfg = self.config.repositories.get(repo_id) + if not repo_cfg or not repo_cfg.dataset: continue + ds_cfg = repo_cfg.dataset.get(config_name) + if ds_cfg and ds_cfg.links: + self._register_comparative_expanded_view(db_name, ds_cfg) - for config_name, dataset_config in repo_config.dataset.items(): - if dataset_config.comparative_analyses: - links[(repo_id, config_name)] = { - "comparative_analyses": [ - { - "repo": ca.repo, - "dataset": ca.dataset, - "via_field": ca.via_field, - } - for ca in dataset_config.comparative_analyses - ] - } - - return links - - def _get_comparative_fields_for_dataset( - self, repo_id: str, config_name: str - ) -> dict[str, dict[str, str]]: - """ - Get mapping of comparative fields available for a primary dataset.
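The four passes above yield a small, predictable catalog per dataset. As a hedged illustration with the tutorial's primary `harbison_2004` config and a comparative `dto` config, the connection would end up with roughly the following views:

```python
# Expected catalog after _register_all_views() (names illustrative):
#
#   __harbison_2004_parquet   internal: raw Parquet scan (hidden from tables())
#   harbison_2004_meta        public: one row per sample_id, plus derived columns
#   harbison_2004             public: raw columns joined with the _meta extras
#   __dto_parquet             internal: raw Parquet scan
#   dto_expanded              public: raw columns plus parsed composite-ID columns
print(vdb.tables())  # only the non-"__" names appear
```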
- - :param repo_id: Primary dataset repository ID - :param config_name: Primary dataset config name - :return: Dict mapping field_name to comparative dataset info - {field_name: { - "comp_repo": comparative_repo_id, - "comp_dataset": comparative_dataset_name, - "via_field": field_with_composite_ids - }} - - Example: - For callingcards dataset linked to DTO via binding_id: - { - "dto_fdr": { - "comp_repo": "BrentLab/yeast_comparative_analysis", - "comp_dataset": "dto", - "via_field": "binding_id" - }, - "dto_empirical_pvalue": {...} - } + # ------------------------------------------------------------------ + # db_name mapping + # ------------------------------------------------------------------ + def _build_db_name_map(self) -> dict[str, tuple[str, str]]: """ - field_mapping: dict[str, dict[str, str]] = {} - - # Get comparative analyses for this dataset - links = self._comparative_links.get((repo_id, config_name), {}) - if "comparative_analyses" not in links: - return field_mapping - - # For each comparative dataset, get its fields - for ca in links["comparative_analyses"]: - comp_repo = ca["repo"] - comp_dataset = ca["dataset"] - via_field = ca["via_field"] - - # Get fields from comparative dataset - comp_fields = self.get_fields(comp_repo, comp_dataset) - - # If no fields from config, try DataCard - if not comp_fields: - try: - from tfbpapi.datacard import DataCard - - card = DataCard(comp_repo, token=self.token) - config = card.get_config(comp_dataset) - if config and config.dataset_info: - comp_fields = [f.name for f in config.dataset_info.features] - except Exception: - comp_fields = [] - - # Map each field to this comparative dataset - for field_name in comp_fields: - # Skip the via_field itself (it's the join key) - if field_name == via_field: - continue - - field_mapping[field_name] = { - "comp_repo": comp_repo, - "comp_dataset": comp_dataset, - "via_field": via_field, - } - - return field_mapping - - def _enrich_with_comparative_data( - self, - primary_df: pd.DataFrame, - repo_id: str, - config_name: str, - requested_fields: list[str], - ) -> pd.DataFrame: + Build mapping from resolved db_name to (repo_id, config_name). + + :return: Dict mapping db_name -> (repo_id, config_name) + """ - Enrich primary dataset with fields from comparative datasets. + mapping: dict[str, tuple[str, str]] = {} + for repo_id, repo_cfg in self.config.repositories.items(): + if not repo_cfg.dataset: + continue + for config_name, ds_cfg in repo_cfg.dataset.items(): + resolved = ds_cfg.db_name or config_name + mapping[resolved] = (repo_id, config_name) + return mapping - :param primary_df: Primary dataset DataFrame with sample_id column - :param repo_id: Primary dataset repository ID - :param config_name: Primary dataset config name - :param requested_fields: List of field names requested by user - :return: DataFrame enriched with comparative fields + # ------------------------------------------------------------------ + # Parquet file resolution + # ------------------------------------------------------------------ + def _resolve_parquet_files(self, repo_id: str, config_name: str) -> list[str]: """ - # Get mapping of which fields come from which comparative datasets - comp_field_mapping = self._get_comparative_fields_for_dataset( - repo_id, config_name - ) + Download (or locate cached) Parquet files for a dataset config. - if not comp_field_mapping: - return primary_df + Uses ``huggingface_hub.snapshot_download`` with the file patterns + from the DataCard. 
- # Find which requested fields are from comparative datasets - comp_fields_to_fetch = [f for f in requested_fields if f in comp_field_mapping] + :param repo_id: HuggingFace repository ID + :param config_name: Dataset configuration name + :return: List of absolute paths to Parquet files - if not comp_fields_to_fetch: - return primary_df + """ + card = DataCard(repo_id, token=self.token) + config = card.get_config(config_name) + if not config: + logger.warning( + "Config '%s' not found in repo '%s'", + config_name, + repo_id, + ) + return [] - # Group fields by comparative dataset to minimize queries - by_comp_dataset: dict[tuple[str, str, str], list[str]] = {} - for field in comp_fields_to_fetch: - info = comp_field_mapping[field] - key = (info["comp_repo"], info["comp_dataset"], info["via_field"]) - if key not in by_comp_dataset: - by_comp_dataset[key] = [] - by_comp_dataset[key].append(field) + file_patterns = [df.path for df in config.data_files] - # For each comparative dataset, load and join - result_df = primary_df.copy() + from huggingface_hub import snapshot_download - for (comp_repo, comp_dataset, via_field), fields in by_comp_dataset.items(): - try: - # Load comparative dataset using HfCacheManager - # but query the raw data table instead of metadata view - from tfbpapi.hf_cache_manager import HfCacheManager + downloaded_path = snapshot_download( + repo_id=repo_id, + repo_type="dataset", + allow_patterns=file_patterns, + token=self.token, + ) - comp_cache_mgr = HfCacheManager( - comp_repo, duckdb_conn=duckdb.connect(":memory:"), token=self.token + parquet_files: list[str] = [] + for pattern in file_patterns: + file_path = Path(downloaded_path) / pattern + if file_path.exists() and file_path.suffix == ".parquet": + parquet_files.append(str(file_path)) + elif "*" in pattern: + base = Path(downloaded_path) + parquet_files.extend( + str(f) for f in base.glob(pattern) if f.suffix == ".parquet" ) + else: + parent_dir = Path(downloaded_path) / Path(pattern).parent + if parent_dir.exists(): + parquet_files.extend(str(f) for f in parent_dir.glob("*.parquet")) - # Get the config to load data - comp_config = comp_cache_mgr.get_config(comp_dataset) - if not comp_config: - continue - - # Load the data (this will download and register parquet files) - result = comp_cache_mgr._get_metadata_for_config(comp_config) - if not result.get("success", False): - continue - - # Now query the raw data table directly (not the metadata view) - # The raw table name is config_name without "metadata_" prefix - select_fields = [via_field] + fields - columns = ", ".join(select_fields) - - # Query the actual parquet data by creating a view from the files - try: - # Get file paths that were loaded - import glob - - from huggingface_hub import snapshot_download - - cache_dir = snapshot_download( - repo_id=comp_repo, - repo_type="dataset", - allow_patterns=f"{comp_dataset}/**/*.parquet", - token=self.token, - ) - - parquet_files = glob.glob( - f"{cache_dir}/{comp_dataset}/**/*.parquet", recursive=True - ) - - if not parquet_files: - continue - - # Create a temporary view from parquet files - temp_view = f"temp_{comp_dataset}_raw" - files_sql = ", ".join([f"'{f}'" for f in parquet_files]) - comp_cache_mgr.duckdb_conn.execute( - f"CREATE OR REPLACE VIEW {temp_view} AS " - f"SELECT * FROM read_parquet([{files_sql}])" - ) - - # Query the view - sql = f"SELECT {columns} FROM {temp_view}" - comp_df = comp_cache_mgr.duckdb_conn.execute(sql).fetchdf() - - except Exception: - # If direct parquet loading fails, skip this 
comparative dataset - continue - - if comp_df.empty: - continue - - # Parse composite identifiers to extract sample_id - # via_field contains values like - # "BrentLab/harbison_2004;harbison_2004;123" - # We need to extract the third component and match on - # current repo/config - def extract_sample_id(composite_id: str) -> str | None: - """Extract sample_id if composite matches current dataset.""" - if pd.isna(composite_id): - return None - try: - parts = composite_id.split(";") - if len(parts) != 3: - return None - # Check if this composite ID references our dataset - if parts[0] == repo_id and parts[1] == config_name: - return parts[2] - return None - except Exception: - return None - - comp_df["_join_sample_id"] = comp_df[via_field].apply(extract_sample_id) - - # Convert _join_sample_id to match primary_df sample_id dtype - # This handles cases where sample_id is int but composite has string - if "_join_sample_id" in comp_df.columns: - primary_dtype = primary_df["sample_id"].dtype - if pd.api.types.is_integer_dtype(primary_dtype): - # Convert to numeric, coercing errors to NaN - comp_df["_join_sample_id"] = pd.to_numeric( - comp_df["_join_sample_id"], errors="coerce" - ) - elif pd.api.types.is_string_dtype(primary_dtype): - comp_df["_join_sample_id"] = comp_df["_join_sample_id"].astype( - str - ) - - # Filter to only rows that match our dataset - comp_df = comp_df[comp_df["_join_sample_id"].notna()].copy() - - if comp_df.empty: - continue - - # Drop the via_field column (we don't need it in results) - comp_df = comp_df.drop(columns=[via_field]) - - # Merge with primary data - result_df = result_df.merge( - comp_df, left_on="sample_id", right_on="_join_sample_id", how="left" - ) return parquet_files - # Drop the temporary join column - result_df = result_df.drop(columns=["_join_sample_id"]) + # ------------------------------------------------------------------ + # View registration helpers + # ------------------------------------------------------------------ - except Exception: - # If enrichment fails for this comparative dataset, continue - continue + def _register_raw_view( + self, + db_name: str, + repo_id: str, + config_name: str, + *, + parquet_only: bool = False, + ) -> None: + """ + Register a raw DuckDB view over Parquet files. - return result_df + Creates an internal ``__<db_name>_parquet`` view that reads - @staticmethod - def _parse_composite_identifier(composite_id: str) -> tuple[str, str, str]: - """ - Parse composite sample identifier into components. + directly from the Parquet files. For primary datasets, also + creates a public ``<db_name>`` view (initially identical) + that may later be replaced by ``_enrich_raw_view``. - :param composite_id: Composite ID in format "repo_id;config_name;sample_id" - :return: Tuple of (repo_id, config_name, sample_id) + For comparative datasets, only the internal parquet view is + created; the public view is the ``<db_name>_expanded`` view instead. + :param db_name: View name + :param repo_id: Repository ID + :param config_name: Configuration name + :param parquet_only: If True, only create the internal + ``__<db_name>_parquet`` view (no public ``<db_name>``).
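A self-contained sketch of the SQL this helper issues; it writes a throwaway Parquet file first so the `read_parquet` view actually binds, and mirrors the internal/public naming convention described above:

```python
import duckdb

conn = duckdb.connect(":memory:")

# Write a tiny Parquet file so read_parquet() has something to bind to.
conn.execute(
    "COPY (SELECT 1 AS sample_id, 0.5 AS score) "
    "TO 'demo.parquet' (FORMAT PARQUET)"
)

files = ["demo.parquet"]  # stand-in for the resolved HuggingFace cache paths
files_sql = ", ".join(f"'{f}'" for f in files)

# Internal parquet-backed view, then the public alias for a primary dataset.
conn.execute(
    f"CREATE OR REPLACE VIEW __demo_parquet AS "
    f"SELECT * FROM read_parquet([{files_sql}])"
)
conn.execute("CREATE OR REPLACE VIEW demo AS SELECT * FROM __demo_parquet")
print(conn.execute("SELECT * FROM demo").fetchdf())
```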
- Example: - _parse_composite_identifier( - "BrentLab/harbison_2004;harbison_2004;sample_42" + """ + files = self._resolve_parquet_files(repo_id, config_name) + if not files: + logger.warning( + "No parquet files for %s/%s -- skipping view '%s'", + repo_id, + config_name, + db_name, ) - Returns: ("BrentLab/harbison_2004", "harbison_2004", "sample_42") + return - """ - parts = composite_id.split(";") - if len(parts) != 3: - raise ValueError( - f"Invalid composite ID format: {composite_id}. " - "Expected 'repo_id;config_name;sample_id'" + files_sql = ", ".join(f"'{f}'" for f in files) + parquet_sql = f"SELECT * FROM read_parquet([{files_sql}])" + self._db.execute( + f"CREATE OR REPLACE VIEW __{db_name}_parquet AS " f"{parquet_sql}" + ) + if not parquet_only: + self._db.execute( + f"CREATE OR REPLACE VIEW {db_name} AS " + f"SELECT * FROM __{db_name}_parquet" ) - return parts[0], parts[1], parts[2] - def _build_metadata_table( - self, repo_id: str, config_name: str, use_cache: bool = True - ) -> pd.DataFrame: + def _register_meta_view(self, db_name: str, repo_id: str, config_name: str) -> None: """ - Build metadata table for a single dataset. + Register a ``_meta`` view with one row per sample_id. - Extracts sample-level metadata from experimental conditions hierarchy and field - definitions, with normalization and missing value handling. + Includes raw metadata columns from the DataCard plus any derived columns from + config property mappings (resolved against DataCard definitions with factor + aliases applied). + :param db_name: Base view name for the primary dataset :param repo_id: Repository ID :param config_name: Configuration name - :param use_cache: Whether to use/update cache - :return: DataFrame with one row per sample_id """ - cache_key = (repo_id, config_name) - - # Check cache - if use_cache and cache_key in self.cache: - return self.cache[cache_key] + parquet_view = f"__{db_name}_parquet" + if not self._view_exists(parquet_view): + return + + meta_cols = self._resolve_metadata_fields(repo_id, config_name) + prop_result = self._resolve_property_columns(repo_id, config_name) + + if prop_result is not None: + derived_exprs, prop_raw_cols = prop_result + # Raw cols = metadata_fields + any source fields needed + # by property mappings + if meta_cols is not None: + raw = list(dict.fromkeys(["sample_id"] + meta_cols + prop_raw_cols)) + else: + raw = list(dict.fromkeys(["sample_id"] + prop_raw_cols)) - try: - # Load DataCard and CacheManager - card = DataCard(repo_id, token=self.token) - cache_mgr = HfCacheManager( - repo_id, duckdb_conn=duckdb.connect(":memory:"), token=self.token - ) + raw_sql = ", ".join(raw) - # Get property mappings - property_mappings = self.config.get_property_mappings(repo_id, config_name) - if not property_mappings: - return pd.DataFrame() + # Outer SELECT: raw cols + derived expressions + outer_parts = list(raw) + derived_exprs + outer_sql = ", ".join(outer_parts) - # Extract repo/config-level metadata - repo_metadata = self._extract_repo_level( - card, config_name, property_mappings + self._db.execute( + f"CREATE OR REPLACE VIEW {db_name}_meta AS " + f"SELECT DISTINCT {outer_sql} " + f"FROM (" + f"SELECT DISTINCT {raw_sql} " + f"FROM {parquet_view}" + f") AS __raw" ) - - # Extract field-level metadata - field_metadata = self._extract_field_level( - card, config_name, property_mappings + elif meta_cols is not None: + # Fallback: metadata_fields only, no property mappings + cols = list(dict.fromkeys(["sample_id"] + meta_cols)) + cols_sql = ", ".join(cols) + 
self._db.execute( + f"CREATE OR REPLACE VIEW {db_name}_meta AS " + f"SELECT DISTINCT {cols_sql} " + f"FROM {parquet_view}" ) - - # Get sample-level data from HuggingFace - config = card.get_config(config_name) - - # Check if this is a comparative dataset - from tfbpapi.models import DatasetType - - is_comparative = ( - config - and hasattr(config, "dataset_type") - and config.dataset_type == DatasetType.COMPARATIVE + else: + # No metadata_fields at all -- all columns are metadata + self._db.execute( + f"CREATE OR REPLACE VIEW {db_name}_meta AS " + f"SELECT DISTINCT * FROM {parquet_view}" ) - if config and hasattr(config, "metadata_fields") and config.metadata_fields: - # Select only metadata fields - columns = ", ".join(config.metadata_fields) - if not is_comparative and "sample_id" not in config.metadata_fields: - columns = f"sample_id, {columns}" - sql = f"SELECT DISTINCT {columns} FROM {config_name}" - else: - # No metadata_fields specified, select all - sql = f"SELECT DISTINCT * FROM {config_name}" - - df = cache_mgr.query(sql, config_name) - - # For non-comparative datasets: one row per sample_id - # For comparative datasets: keep all rows (each row is a relationship) - if not is_comparative and "sample_id" in df.columns: - df = df.groupby("sample_id").first().reset_index() - - # Add repo-level metadata as columns - for prop_name, values in repo_metadata.items(): - # Use first value (repo-level properties are constant) - df[prop_name] = values[0] if values else None - - # Add field-level metadata - if field_metadata: - df = self._add_field_metadata(df, field_metadata) - - # Apply dtype conversions to DataFrame columns - df = self._apply_column_dtypes(df, property_mappings) + def _enrich_raw_view(self, db_name: str) -> None: + """ + Replace a primary raw view with a join to its ``<db_name>_meta`` view. - # Cache result - if use_cache: - self.cache[cache_key] = df + If ``<db_name>_meta`` has derived columns not present in the + raw parquet view, recreates ``<db_name>`` as a join so derived + columns (e.g. ``carbon_source``) appear alongside measurement + data. - return df + :param db_name: Base view name for the primary dataset - except Exception as e: - # Log error for debugging with full traceback - import traceback + """ + meta_name = f"{db_name}_meta" + parquet_name = f"__{db_name}_parquet" + if not self._view_exists(meta_name) or not self._view_exists(parquet_name): + return + + raw_cols = set(self._get_view_columns(parquet_name)) + meta_cols = set(self._get_view_columns(meta_name)) + extra_cols = meta_cols - raw_cols + + if not extra_cols: + return + + extra_select = ", ".join(f"m.{c}" for c in sorted(extra_cols)) + self._db.execute( + f"CREATE OR REPLACE VIEW {db_name} AS " + f"SELECT r.*, {extra_select} " + f"FROM {parquet_name} r " + f"JOIN {meta_name} m USING (sample_id)" + ) - print(f"Error downloading metadata for {config_name}: {e}") - traceback.print_exc() - # Return empty DataFrame on error - return pd.DataFrame() + def _get_view_columns(self, view: str) -> list[str]: + """Return column names for a view.""" + df = self._db.execute( + f"SELECT column_name FROM information_schema.columns " + f"WHERE table_name = '{view}'" + ).fetchdf() + return df["column_name"].tolist() - def _apply_column_dtypes( - self, df: pd.DataFrame, property_mappings: dict[str, PropertyMapping] - ) -> pd.DataFrame: + def _resolve_metadata_fields( + self, repo_id: str, config_name: str + ) -> list[str] | None: """ - Apply dtype conversions to DataFrame columns based on property mappings.
+ Get the metadata_fields list from the DataCard config. - :param df: DataFrame to apply conversions to - :param property_mappings: Property mappings with dtype specifications - :return: DataFrame with converted column dtypes + :param repo_id: Repository ID + :param config_name: Configuration name + :return: List of metadata field names, or None if not specified """ - for prop_name, mapping in property_mappings.items(): - # Skip if no dtype specified or column doesn't exist - if not mapping.dtype or prop_name not in df.columns: - continue + try: + card = _cached_datacard(repo_id, token=self.token) + config = card.get_config(config_name) + if config and config.metadata_fields: + return list(config.metadata_fields) + except Exception: + logger.debug( + "Could not resolve metadata_fields for %s/%s", + repo_id, + config_name, + ) + return None - # Convert column dtype - try: - if mapping.dtype == "numeric": - df[prop_name] = pd.to_numeric(df[prop_name], errors="coerce") - elif mapping.dtype == "bool": - df[prop_name] = df[prop_name].astype(bool) - elif mapping.dtype == "string": - df[prop_name] = df[prop_name].astype(str) - except (ValueError, TypeError): - # Conversion failed, leave as is - pass - - return df - - def _convert_dtype(self, value: Any, dtype: str) -> Any: + def _resolve_alias(self, col: str, value: str) -> str: """ - Convert value to specified data type. - - :param value: The value to convert to a given `dtype` - :param dtype: Target data type ("numeric", "bool", "string") + Apply factor alias to a value if one is configured. - :return: Converted value or None if conversion fails + :param col: Column name (e.g., "carbon_source") + :param value: Raw value (e.g., "D-glucose") + :return: Canonical alias (e.g., "glucose") or original value """ - if value is None: - return None - - try: - if dtype == "numeric": - # Try float first (handles both int and float) - return float(value) - elif dtype == "bool": - return bool(value) - elif dtype == "string": - return str(value) - else: - # Unknown dtype, pass through unchanged - return value - except (ValueError, TypeError): - # Conversion failed, return None - return None - - def _extract_repo_level( + aliases = self.config.factor_aliases.get(col) + if not aliases: + return value + lower_val = str(value).lower() + for canonical, actuals in aliases.items(): + if lower_val in [str(a).lower() for a in actuals]: + return canonical + return value + + def _resolve_property_columns( self, - card: DataCard, + repo_id: str, config_name: str, - property_mappings: dict[str, PropertyMapping], - ) -> dict[str, list[str]]: + ) -> tuple[list[str], list[str]] | None: """ - Extract and normalize repo/config-level metadata. + Build SQL column expressions for derived property columns. - :param card: DataCard instance + Resolves config property mappings against the DataCard to + produce SQL expressions that add derived columns to the + ``_meta`` view. + + :param repo_id: Repository ID :param config_name: Configuration name - :param property_mappings: Property mappings for this dataset - :return: Dict mapping property names to normalized values + :return: Tuple of (sql_expressions, raw_cols_needed) or None + if no property mappings are configured. + ``sql_expressions`` are SQL fragments like + ``"'glucose' AS carbon_source"`` or + ``"CASE WHEN ... END AS carbon_source"``. + ``raw_cols_needed`` are raw parquet column names that must + be present in the inner SELECT. 
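Given the tutorial's `factor_aliases` block (`glucose: [D-glucose, dextrose, glu]`), the matching is case-insensitive and falls back to the raw value when nothing matches; a sketch of the expected behavior:

```python
vdb._resolve_alias("carbon_source", "D-glucose")  # -> "glucose"
vdb._resolve_alias("carbon_source", "DEXTROSE")   # -> "glucose" (case-insensitive)
vdb._resolve_alias("carbon_source", "sucrose")    # -> "sucrose" (no alias match)
vdb._resolve_alias("nitrogen_source", "NH4")      # -> "NH4" (column has no aliases)
```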
""" - metadata: dict[str, list[str]] = {} + mappings = self.config.get_property_mappings(repo_id, config_name) + if not mappings: + return None - # Get experimental conditions - try: - conditions = card.get_experimental_conditions(config_name) - except DataCardError: - conditions = {} + expressions: list[str] = [] + raw_cols: set[str] = set() - if not conditions: - return metadata + try: + card = _cached_datacard(repo_id, token=self.token) + except Exception as exc: + logger.warning( + "Could not load DataCard for %s: %s", + repo_id, + exc, + ) + return None - # Extract each mapped property - for prop_name, mapping in property_mappings.items(): - # Skip field-level mappings - if mapping.field is not None: + for key, mapping in mappings.items(): + if mapping.expression is not None: + # Type D: expression + expressions.append(f"({mapping.expression}) AS {key}") continue - # Build full path - # Note: `conditions` is already the experimental_conditions dict, - # so we don't add the prefix - full_path = mapping.path - - # Get value at path - value = get_nested_value(conditions, full_path) # type: ignore - - # Handle missing values - missing_label = self.config.missing_value_labels.get(prop_name) - if value is None: - if missing_label: - metadata[prop_name] = [missing_label] + if mapping.field is not None and mapping.path is None: + # Type A: field-only (alias or no-op) + raw_cols.add(mapping.field) + if key == mapping.field: + # no-op -- column already present as raw col + pass + else: + expressions.append(f"{mapping.field} AS {key}") continue - # Ensure value is a list - actual_values = [value] if not isinstance(value, list) else value - - # Apply dtype conversion if specified - if mapping.dtype: - actual_values = [ - self._convert_dtype(v, mapping.dtype) for v in actual_values - ] + if mapping.field is not None and mapping.path is not None: + # Type B: field + path -- resolve from definitions + raw_cols.add(mapping.field) + expr = self._build_field_path_expr( + key, + mapping.field, + mapping.path, + mapping.dtype, + config_name, + card, + ) + if expr is not None: + expressions.append(expr) + continue - # Normalize using aliases - aliases = self.config.factor_aliases.get(prop_name) - normalized_values = [ - normalize_value(v, aliases, missing_label) for v in actual_values - ] + if mapping.field is None and mapping.path is not None: + # Type C: path-only -- constant from config + expr = self._build_path_only_expr( + key, + mapping.path, + mapping.dtype, + config_name, + card, + ) + if expr is not None: + expressions.append(expr) + continue - metadata[prop_name] = normalized_values + if not expressions and not raw_cols: + return None - return metadata + return expressions, sorted(raw_cols) - def _extract_field_level( + def _build_field_path_expr( self, - card: DataCard, + key: str, + field: str, + path: str, + dtype: str | None, config_name: str, - property_mappings: dict[str, PropertyMapping], - ) -> dict[str, dict[str, Any]]: + card: Any, + ) -> str | None: """ - Extract and normalize field-level metadata. + Build a SQL expression for a field+path property mapping. - :param card: DataCard instance + Resolves each definition value via ``get_nested_value``, + applies factor aliases, and returns either a constant or + a CASE WHEN expression. 
+ + :param key: Output column name + :param field: Source field in parquet (e.g., "condition") + :param path: Dot-notation path within definitions + :param dtype: Optional data type ("numeric", "string", "bool") :param config_name: Configuration name - :param property_mappings: Property mappings for this dataset - :return: Dict mapping field values to their normalized metadata + :param card: DataCard instance + :return: SQL expression string, or None on failure """ - field_metadata: dict[str, dict[str, Any]] = {} + try: + defs = card.get_field_definitions(config_name, field) + except Exception as exc: + logger.warning( + "Could not get definitions for field '%s' " "in config '%s': %s", + field, + config_name, + exc, + ) + return None - # Group property mappings by field - field_mappings: dict[str, dict[str, PropertyMapping]] = {} - for prop_name, mapping in property_mappings.items(): - # Only process if field is specified AND path exists - # (no path means it's just a column alias, not metadata extraction) - if mapping.field is not None and mapping.path is not None: - field_name = mapping.field - if field_name not in field_mappings: - field_mappings[field_name] = {} - field_mappings[field_name][prop_name] = mapping - - # Process each field that has mappings - for field_name, prop_mappings_dict in field_mappings.items(): - # Get field definitions - definitions = card.get_field_definitions(config_name, field_name) - if not definitions: - continue + if not defs: + return None - # Extract metadata for each field value - for field_value, definition in definitions.items(): - if field_value not in field_metadata: - field_metadata[field_value] = {} - - for prop_name, mapping in prop_mappings_dict.items(): - # Get value at path - value = get_nested_value(definition, mapping.path) # type: ignore - - # Handle missing values - missing_label = self.config.missing_value_labels.get(prop_name) - if value is None: - if missing_label: - field_metadata[field_value][prop_name] = [missing_label] - continue - - # Ensure value is a list - actual_values = [value] if not isinstance(value, list) else value - - # Apply dtype conversion if specified - if mapping.dtype: - actual_values = [ - self._convert_dtype(v, mapping.dtype) for v in actual_values - ] - - # Normalize using aliases - aliases = self.config.factor_aliases.get(prop_name) - normalized_values = [ - normalize_value(v, aliases, missing_label) - for v in actual_values - ] - - field_metadata[field_value][prop_name] = normalized_values - - return field_metadata - - def _add_field_metadata( - self, df: pd.DataFrame, field_metadata: dict[str, dict[str, Any]] - ) -> pd.DataFrame: - """ - Add columns from field-level metadata to DataFrame. 
+ # Resolve each definition value + value_map: dict[str, str] = {} + for def_key, definition in defs.items(): + raw = get_nested_value(definition, path) + if raw is None: + logger.debug( + "Path '%s' resolved to None for " "definition key '%s' (keys: %s)", + path, + def_key, + ( + list(definition.keys()) + if isinstance(definition, dict) + else type(definition).__name__ + ), + ) + continue + # Handle list results (e.g., carbon_source returns + # [{"compound": "D-glucose"}]) + if isinstance(raw, list): + raw = raw[0] if len(raw) == 1 else ", ".join(str(v) for v in raw) + resolved = self._resolve_alias(key, str(raw)) + value_map[str(def_key)] = resolved + + if not value_map: + return None - :param df: DataFrame with base sample metadata - :param field_metadata: Dict mapping field values to their properties - :return: DataFrame with additional property columns + # If all values are the same, emit a constant + unique_vals = set(value_map.values()) + if len(unique_vals) == 1: + val = next(iter(unique_vals)) + return self._literal_expr(key, val, dtype) + + # Otherwise, build CASE WHEN + whens = [] + for def_key, resolved in value_map.items(): + escaped_key = def_key.replace("'", "''") + escaped_val = resolved.replace("'", "''") + whens.append(f"WHEN {field} = '{escaped_key}' " f"THEN '{escaped_val}'") + case_sql = " ".join(whens) + missing = self.config.missing_value_labels.get(key) + if missing is not None: + escaped_missing = missing.replace("'", "''") + expr = f"CASE {case_sql} " f"ELSE '{escaped_missing}' END" + else: + expr = f"CASE {case_sql} ELSE NULL END" + if dtype == "numeric": + expr = f"CAST({expr} AS DOUBLE)" + return f"{expr} AS {key}" - """ - # For each field value, add its properties as columns - for field_value, properties in field_metadata.items(): - for prop_name, prop_values in properties.items(): - # Initialize column if needed - if prop_name not in df.columns: - df[prop_name] = None - - # Find rows where any column matches field_value - for col in df.columns: - if col in [prop_name, "sample_id", "dataset_id"]: - continue - mask = df[col] == field_value - if mask.any(): - # Set property value (take first from list) - value = prop_values[0] if prop_values else None - df.loc[mask, prop_name] = value - - return df - - def _apply_filters( + def _build_path_only_expr( self, - df: pd.DataFrame, - filters: dict[str, Any], - repo_id: str, + key: str, + path: str, + dtype: str | None, config_name: str, - ) -> pd.DataFrame: + card: Any, + ) -> str | None: """ - Apply filters to DataFrame with alias expansion and numeric handling. + Build a constant column expression for a path-only mapping. - :param df: DataFrame to filter - :param filters: Dict of field:value pairs - :param repo_id: Repository ID (for alias lookup) - :param config_name: Config name (for alias lookup) - :return: Filtered DataFrame + Resolves a single value from the DataCard's raw model_extra, + which preserves the full dict structure (including any + ``experimental_conditions`` wrapper). + + :param key: Output column name + :param path: Dot-notation path (may include + ``experimental_conditions.`` prefix) + :param dtype: Optional data type + :param config_name: Configuration name + :param card: DataCard instance + :return: SQL literal expression, or None on failure """ - for field, filter_value in filters.items(): - if field not in df.columns: - continue + # Build merged dict from top-level + config-level model_extra. 
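For a hypothetical `condition` field with two definition keys, the tutorial aliases, and a configured `carbon_source: "unspecified"` missing label, the expression assembled above would come out as:

```python
# _build_field_path_expr("carbon_source", "condition",
#                        "media.carbon_source.compound", None, card)
# would return something like:
expected = (
    "CASE WHEN condition = 'YPD' THEN 'glucose' "
    "WHEN condition = 'GAL' THEN 'galactose' "
    "ELSE 'unspecified' END AS carbon_source"
)
```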
+ # This preserves keys like "experimental_conditions" that + # get_experimental_conditions() would strip. + merged: dict[str, Any] = {} + try: + top_extra = card.dataset_card.model_extra + if isinstance(top_extra, dict): + merged.update(top_extra) + config_obj = card.get_config(config_name) + if config_obj and isinstance(config_obj.model_extra, dict): + merged.update(config_obj.model_extra) + except Exception: + logger.debug( + "Could not get model_extra for %s/%s", + card.repo_id if hasattr(card, "repo_id") else "?", + config_name, + ) + return None - # Handle numeric range filters - if isinstance(filter_value, tuple): - operator = filter_value[0] - if operator == "between" and len(filter_value) == 3: - df = df[ - (df[field] >= filter_value[1]) & (df[field] <= filter_value[2]) - ] - elif operator in (">=", ">", "<=", "<", "==", "!="): - if operator == ">=": - df = df[df[field] >= filter_value[1]] - elif operator == ">": - df = df[df[field] > filter_value[1]] - elif operator == "<=": - df = df[df[field] <= filter_value[1]] - elif operator == "<": - df = df[df[field] < filter_value[1]] - elif operator == "==": - df = df[df[field] == filter_value[1]] - elif operator == "!=": - df = df[df[field] != filter_value[1]] - else: - # Exact match with alias expansion - aliases = self.config.factor_aliases.get(field) - if aliases: - # Expand filter value to all aliases - expanded_values = [filter_value] - for alias_name, actual_values in aliases.items(): - if alias_name == filter_value: - # Add all actual values for this alias - expanded_values.extend([str(v) for v in actual_values]) - df = df[df[field].isin(expanded_values)] - else: - # No aliases, exact match - df = df[df[field] == filter_value] + if not merged: + return None - return df + raw = get_nested_value(merged, path) + if raw is None: + logger.debug( + "Path '%s' resolved to None in model_extra for " + "%s/%s. Available keys: %s", + path, + card.repo_id if hasattr(card, "repo_id") else "?", + config_name, + list(merged.keys()), + ) + return None + + if isinstance(raw, list): + raw = raw[0] if len(raw) == 1 else ", ".join(str(v) for v in raw) + + resolved = self._resolve_alias(key, str(raw)) + return self._literal_expr(key, resolved, dtype) + + @staticmethod + def _literal_expr(key: str, value: str, dtype: str | None) -> str: + """ + Build a SQL literal expression with optional type cast. - def _get_complete_data( + :param key: Column alias + :param value: Literal value + :param dtype: Optional type ("numeric", "string", "bool") + :return: SQL expression + + """ + escaped = value.replace("'", "''") + if dtype == "numeric": + return f"CAST('{escaped}' AS DOUBLE) AS {key}" + return f"'{escaped}' AS {key}" + + def _register_comparative_expanded_view( self, - repo_id: str, - config_name: str, - sample_ids: list[str], - metadata_df: pd.DataFrame, - ) -> pd.DataFrame: + db_name: str, + ds_cfg: Any, + ) -> None: """ - Get complete data (with measurements) for sample_ids. + Create ``<db_name>_expanded`` view with parsed composite ID cols. - Uses WHERE sample_id IN (...) approach for efficient retrieval. + For each link_field in the dataset config, adds two columns: - :param repo_id: Repository ID - :param config_name: Configuration name - :param sample_ids: List of sample IDs to retrieve - :param metadata_df: Metadata DataFrame to merge with - :return: DataFrame with measurements and metadata + - ``<link_field>_source`` -- the ``repo_id;config_name`` prefix, + aliased to the configured ``db_name`` when available. + - ``<link_field>_id`` -- the sample_id component.
+ + :param db_name: Base view name for the comparative dataset + :param ds_cfg: DatasetVirtualDBConfig with ``links`` """ - try: - cache_mgr = HfCacheManager( - repo_id, duckdb_conn=duckdb.connect(":memory:"), token=self.token + parquet_view = f"__{db_name}_parquet" + if not self._view_exists(parquet_view): + return + + extra_cols = [] + for link_field, primaries in ds_cfg.links.items(): + # _id column: third component of composite ID + id_col = f"{link_field}_id" + extra_cols.append(f"SPLIT_PART({link_field}, ';', 3) " f"AS {id_col}") + + # _source column: first two components, aliased + # to db_name when the pair is in the config + raw_expr = ( + f"SPLIT_PART({link_field}, ';', 1) || ';' " + f"|| SPLIT_PART({link_field}, ';', 2)" ) + whens = [] + for pair in primaries: + repo_id, config_name = pair[0], pair[1] + alias = self._get_db_name_for(repo_id, config_name) + if alias: + key = f"{repo_id};{config_name}".replace("'", "''") + whens.append(f"WHEN '{key}' THEN '{alias}'") + if whens: + case_sql = " ".join(whens) + source_expr = f"CASE {raw_expr} {case_sql} " f"ELSE {raw_expr} END" + else: + source_expr = raw_expr + source_col = f"{link_field}_source" + extra_cols.append(f"{source_expr} AS {source_col}") - # Build IN clause - sample_id_list = ", ".join([f"'{sid}'" for sid in sample_ids]) - sql = f""" - SELECT * - FROM {config_name} - WHERE sample_id IN ({sample_id_list}) - """ - - full_df = cache_mgr.query(sql, config_name) - - # Merge with metadata (metadata_df has normalized fields) - # Drop metadata columns from full_df to avoid duplicates - metadata_cols = [ - col - for col in metadata_df.columns - if col not in ["sample_id", "dataset_id"] - ] - full_df = full_df.drop( - columns=[c for c in metadata_cols if c in full_df.columns], - errors="ignore", - ) + if not extra_cols: + return - # Merge on sample_id - result = full_df.merge(metadata_df, on="sample_id", how="left") + cols_sql = ", ".join(extra_cols) + self._db.execute( + f"CREATE OR REPLACE VIEW {db_name}_expanded AS " + f"SELECT *, {cols_sql} FROM {parquet_view}" + ) - return result + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _is_comparative(self, repo_id: str, config_name: str) -> bool: + """Return True if the dataset has links (i.e. is comparative).""" + repo_cfg = self.config.repositories.get(repo_id) + if not repo_cfg or not repo_cfg.dataset: + return False + ds_cfg = repo_cfg.dataset.get(config_name) + return bool(ds_cfg and ds_cfg.links) + + def _list_views(self) -> list[str]: + """Return list of public views (excludes internal __ prefixed).""" + df = self._db.execute( + "SELECT table_name FROM information_schema.tables " + "WHERE table_schema = 'main' AND table_type = 'VIEW'" + ).fetchdf() + return [n for n in df["table_name"].tolist() if not n.startswith("__")] + + def _view_exists(self, name: str) -> bool: + """Check whether a view is registered (including internal).""" + df = self._db.execute( + "SELECT table_name FROM information_schema.tables " + "WHERE table_schema = 'main' AND table_type = 'VIEW' " + f"AND table_name = '{name}'" + ).fetchdf() + return len(df) > 0 + + def _get_primary_view_names(self) -> list[str]: + """ + Return db_names of primary (non-comparative) raw views. - except Exception: - return pd.DataFrame() + A primary dataset is one whose config has no ``links``. 
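Because the expanded view pre-parses composite IDs, cross-dataset filtering reduces to plain SQL. A sketch assuming the tutorial's `dto` comparative dataset with a `binding_id` link field and a `dto_fdr` column:

```python
df = vdb.query(
    """
    SELECT h.sample_id, h.carbon_source, d.dto_fdr
    FROM dto_expanded d
    JOIN harbison_2004_meta h
      ON d.binding_id_source = 'harbison_2004'
     AND d.binding_id_id = CAST(h.sample_id AS VARCHAR)
    WHERE d.dto_fdr <= $fdr
    """,
    fdr=0.05,
)
```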
+ + """ + names = [] + for db_name, (repo_id, config_name) in self._db_name_map.items(): + if not self._is_comparative(repo_id, config_name): + if self._view_exists(db_name): + names.append(db_name) + return sorted(names) + + def _get_primary_meta_view_names(self) -> list[str]: + """Return names of primary ``_meta`` views.""" + return [ + f"{n}_meta" + for n in self._get_primary_view_names() + if self._view_exists(f"{n}_meta") + ] + + def _get_db_name_for(self, repo_id: str, config_name: str) -> str | None: + """Resolve db_name for a (repo_id, config_name) pair.""" + for db_name, (r, c) in self._db_name_map.items(): + if r == repo_id and c == config_name: + return db_name + return None def __repr__(self) -> str: """String representation.""" n_repos = len(self.config.repositories) - n_datasets = sum( - len(rc.dataset) if rc.dataset else 0 - for rc in self.config.repositories.values() - ) - n_cached = len(self.cache) + n_datasets = len(self._db_name_map) + if self._views_registered: + n_views = len(self._list_views()) + return ( + f"VirtualDB({n_repos} repos, " + f"{n_datasets} datasets, " + f"{n_views} views)" + ) return ( - f"VirtualDB({n_repos} repositories, {n_datasets} datasets configured, " - f"{n_cached} views cached)" + f"VirtualDB({n_repos} repos, " + f"{n_datasets} datasets, views not yet registered)" )