From c6ad99d914bc3a375f451f7e92a8d03ca331d46e Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Tue, 14 Apr 2026 16:44:58 -0700 Subject: [PATCH 1/4] feat: export mapped variants during public dumps --- src/mavedb/scripts/export_public_data.py | 54 ++++++++++++++---------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/src/mavedb/scripts/export_public_data.py b/src/mavedb/scripts/export_public_data.py index 63400aeb..77d9ac4a 100644 --- a/src/mavedb/scripts/export_public_data.py +++ b/src/mavedb/scripts/export_public_data.py @@ -6,22 +6,8 @@ python3 -m mavedb.scripts.export_public_data ``` -This generates a ZIP archive named `mavedb-dump.zip` in the working directory. the ZIP file has the following contents: -- main.json: A JSON file providing metadata for all of the published experiment sets, experiments, and score sets -- LICENSE.txt: The text of the Creative Commons Zero license, which applies to all data included in the dump. -- variants/ - - [URN].counts.csv (for each variant URN): The score set's variant count columns, - sorted by variant number - - [URN].scores.csv (for each variant URN): The score set's variant count columns, - sorted by variant number - - [URN].annotations.csv (for each variant URN with mapped variants): The score set's variant annotations, sorted by - variant number. This file is only included for score sets with mapped variants, and includes VEP, gnomAD, and ClinGen annotations. - -In the exported JSON metadata, the root object's `experimentSets` property gives an array of experiment sets. -Experiments are nested in their parent experiment sets, and score sets in their parent experiments. - -The variant URNs used in filenames do not include the `urn:mavedb:` scheme identifier, so they look like -`00000001-a-1.counts.csv` and `00000001-a-1.scores.csv`, for instance. +This generates a ZIP archive named `mavedb-dump.YYYYMMDDHHMMSS.zip` in the working directory. 
+See `src/mavedb/scripts/resources/README.md` for a full description of the archive contents and file formats. Unpublished data and data sets licensed other than under the Creative Commons Zero license are not included in the dump, and user details are limited to ORCID IDs and names of contributors to published data sets. @@ -37,7 +23,7 @@ from fastapi.encoders import jsonable_encoder from sqlalchemy import select -from sqlalchemy.orm import Session, lazyload +from sqlalchemy.orm import Session, joinedload, lazyload from mavedb.lib.score_sets import get_score_set_variants_as_csv from mavedb.models.experiment import Experiment @@ -47,6 +33,7 @@ from mavedb.models.score_set import ScoreSet from mavedb.models.variant import Variant from mavedb.scripts.environment import script_environment, with_database_session +from mavedb.view_models import mapped_variant as mapped_variant_vm from mavedb.view_models.experiment_set import ExperimentSetPublicDump logger = logging.getLogger(__name__) @@ -114,6 +101,7 @@ def export_public_data(db: Session): # Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score # sets. experiment_sets = list(filter_experiment_sets(experiment_sets_query.all())) + logger.info(f"Found {len(experiment_sets)} published experiment sets with CC0-licensed score sets.") # TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator. 
# Issue: https://github.com/VariantEffect/mavedb-api/issues/192 @@ -129,7 +117,7 @@ def export_public_data(db: Session): timestamp_format = "%Y%m%d%H%M%S" zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip" - logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json") + logger.info(f"Writing {zip_file_name} with {len(score_set_ids)} score sets.") json_data = { "title": "MaveDB public data", "asOf": datetime.now(timezone.utc).isoformat(), @@ -140,21 +128,23 @@ def export_public_data(db: Session): # Write metadata for all data sets to a single JSON file. zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data))) - # Copy the CC0 license. - zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt") + # Copy the CC0 license and README. + resources_dir = os.path.join(os.path.dirname(__file__), "resources") + zipfile.write(os.path.join(resources_dir, "CC0_license.txt"), "LICENSE.txt") + zipfile.write(os.path.join(resources_dir, "README.md"), "README.md") # Write score and count files for each score set. num_score_sets = len(score_set_ids) for i, score_set_id in enumerate(score_set_ids): score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none() if score_set is not None and score_set.urn is not None: - logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}") + logger.info(f"[{i + 1}/{num_score_sets}] Exporting score set {score_set.urn}") csv_filename_base = score_set.urn.replace(":", "-") csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"], namespaced=True) zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str) - # Only generate the annotations CSV if mapped variants exist in the score set. + # Only generate annotation files if mapped variants exist in the score set. 
has_annotations = ( db.scalars( select(ScoreSet).where(ScoreSet.id == score_set_id).join(Variant).join(MappedVariant).limit(1) @@ -167,12 +157,32 @@ def export_public_data(db: Session): ) zipfile.writestr(f"csv/{csv_filename_base}.annotations.csv", csv_str) + # Write mapped variants JSON — mirrors GET /api/v1/score-sets/{urn}/mapped-variants. + mapped_variants = db.scalars( + select(MappedVariant) + .join(Variant, Variant.id == MappedVariant.variant_id) + .options(joinedload(MappedVariant.variant)) + .where(Variant.score_set_id == score_set_id) + ).all() + mapped_variant_views = [ + mapped_variant_vm.MappedVariant.model_validate(mv) for mv in mapped_variants + ] + zipfile.writestr( + f"mapped/{csv_filename_base}.mapped-variants.json", + json.dumps(jsonable_encoder(mapped_variant_views)), + ) + logger.info( + f"[{i + 1}/{num_score_sets}] Wrote annotations + {len(mapped_variants)} mapped variants" + ) + # Only generate the counts CSV if count columns are present. count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None if count_columns and len(count_columns) > 0: csv_str = get_score_set_variants_as_csv(db, score_set, ["counts"], namespaced=True) zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str) + logger.info(f"Export complete: {zip_file_name}") + if __name__ == "__main__": export_public_data() From 4da9003a96640d7c46bee9d6891a2a402451b354 Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Tue, 14 Apr 2026 16:45:06 -0700 Subject: [PATCH 2/4] docs: add README for MaveDB public data dump --- src/mavedb/scripts/resources/README.md | 236 +++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 src/mavedb/scripts/resources/README.md diff --git a/src/mavedb/scripts/resources/README.md b/src/mavedb/scripts/resources/README.md new file mode 100644 index 00000000..3dcd0247 --- /dev/null +++ b/src/mavedb/scripts/resources/README.md @@ -0,0 +1,236 @@ +# MaveDB Public Data Dump + +This archive 
contains a snapshot of publicly accessible variant effect data from MaveDB. +The `asOf` field in `main.json` records the exact date and time this dump was generated. + +### Useful links +- **MaveDB website:** https://www.mavedb.org +- **API documentation:** https://api.mavedb.org/docs +- **MaveDB documentation:** https://mavedb.org/docs/mavedb/index.html +- **Source code:** + - https://github.com/VariantEffect/mavedb-api + - https://github.com/VariantEffect/mavedb-ui + - https://github.com/VariantEffect/dcd_mapping2 + +--- + +## What's Included + +This dump includes only data that is: + +- **Published** — publicly released on MaveDB +- **CC0-licensed** — released under the Creative Commons CC0 1.0 Public Domain Dedication + +Unpublished data, private datasets, and datasets published under other licenses are excluded. + +--- + +## Archive Structure + +``` +mavedb-dump.YYYYMMDDHHMMSS.zip +├── README.md # This file +├── LICENSE.txt # Creative Commons CC0 1.0 license text +├── main.json # Metadata for all included datasets +├── csv/ +│ ├── {urn}.scores.csv # Variant effect scores (all score sets) +│ ├── {urn}.counts.csv # Variant counts (score sets with count data only) +│ └── {urn}.annotations.csv # Variant annotations from VEP, gnomAD, and ClinGen +│ # (score sets that have completed mapping only) +└── mapped/ + └── {urn}.mapped-variants.json # Mapped variant data including VRS alleles and HGVS + # (score sets that have completed mapping only) +``` + +`{urn}` is the score set URN with colons replaced by hyphens, e.g., `urn-mavedb-00000001-a-1`. + +--- + +## File Descriptions + +### `main.json` + +A JSON object containing MaveDB metadata with three top-level fields: + +- `title` — `"MaveDB public data"` +- `asOf` — ISO 8601 UTC timestamp indicating when this dump was generated +- `experimentSets` — Array of experiment set objects, each containing nested experiments and score + sets with full metadata (targets, publications, licenses, contributors, etc.) 
+ +The hierarchy mirrors the MaveDB data model: each **ExperimentSet** contains one or more +**Experiments**, each of which contains one or more **ScoreSets**. + +Score set metadata includes the `datasetColumns` field, which lists the names of the per-score-set +score and count columns that appear in the corresponding CSV files. + +### CSV column namespacing + +All CSV files exported from MaveDB use a namespaced column naming scheme. The namespace prefix +identifies which data source a column belongs to and is separated from the column name by a dot: + +| Prefix | Source | +|--------|--------| +| *(no prefix)* | Core identifiers — `accession`, `hgvs_nt`, `hgvs_pro`, `hgvs_splice` | +| `scores.` | Score columns defined by the score set author (e.g. `scores.score`) | +| `counts.` | Count columns defined by the score set author | +| `mavedb.` | Columns computed by the MaveDB mapping pipeline (post-mapped HGVS, VRS digest) | +| `vep.` | Ensembl Variant Effect Predictor annotations | +| `gnomad.` | gnomAD population frequency data | +| `clingen.` | ClinGen Allele Registry linkage | + +Missing or inapplicable values in all CSV files are represented as the string `NA`. + +### `csv/{urn}.scores.csv` + +Comma-separated file with variant effect scores. 
Contains the following fixed columns, followed by +score columns defined by each individual score set: + +| Column | Description | +|--------|-------------| +| `accession` | Full variant URN (e.g., `urn:mavedb:00000001-a-1#1`) | +| `hgvs_nt` | Assay-level nucleotide HGVS string in MAVE-HGVS format, if applicable | +| `hgvs_pro` | Assay-level protein HGVS string in MAVE-HGVS format, if applicable | +| `hgvs_splice` | Assay-level splice HGVS string in MAVE-HGVS format, if applicable | +| `scores.score` | The primary score column — always present | +| `scores.*` | Additional score columns defined by the score set author | + +The `hgvs_nt`, `hgvs_pro`, and `hgvs_splice` columns use **MAVE-HGVS format** — a constrained +subset of HGVS notation used by MaveDB. These strings are often expressed relative to the +assay's reference sequence (a transcript or protein), not the genome, and may not validate against +a standard HGVS parser. Score values are not normalized across score sets; each score set defines +its own scale and units. Refer to the score set's entry in `main.json` for the meaning of each +score column. + +### `csv/{urn}.counts.csv` + +Same structure as `scores.csv`, but with `counts.*` columns in place of score columns. Only +present for score sets that have count data. The count column names are listed in +`datasetColumns.countColumns` in `main.json`. + +### `csv/{urn}.annotations.csv` + +Variant annotation data from external databases, joined with post-mapped HGVS and VRS identifiers +produced by the MaveDB variant mapping pipeline. 
**Only present for score sets that have completed +the MaveDB mapping pipeline.** Exact columns: + +| Column | Description | +|--------|-------------| +| `accession` | Full variant URN — use this to join with `scores.csv` | +| `hgvs_nt` | Assay-level nucleotide HGVS (MAVE-HGVS format) | +| `hgvs_pro` | Assay-level protein HGVS (MAVE-HGVS format) | +| `hgvs_splice` | Assay-level splice HGVS (MAVE-HGVS format) | +| `mavedb.post_mapped_hgvs_g` | Post-mapped genomic HGVS on GRCh38 (g. notation) | +| `mavedb.post_mapped_hgvs_c` | Post-mapped coding HGVS (c. notation) | +| `mavedb.post_mapped_hgvs_p` | Post-mapped protein HGVS (p. notation) | +| `mavedb.post_mapped_hgvs_at_assay_level` | Post-mapped HGVS at the assay reference level (transcript or protein) | +| `mavedb.post_mapped_vrs_digest` | GA4GH VRS digest identifier for the post-mapped allele | +| `vep.vep_functional_consequence` | VEP functional consequence term (e.g. `missense_variant`) | +| `gnomad.gnomad_af` | gnomAD v4.1 allele frequency | +| `clingen.clingen_allele_id` | ClinGen Allele Registry CA identifier (e.g. `CA12345`) | + +Variants that could not be mapped, or for which a specific annotation is unavailable, will have +`NA` in the corresponding column. For multi-allelic variants (haplotypes), `mavedb.*` HGVS columns +will be `NA` because a single combined HGVS string cannot currently be derived. This may be updated in +a future release. + +### `mapped/{urn}.mapped-variants.json` + +A JSON array of mapped variant records. 
Each record corresponds to a single variant and contains +the same fields returned by `GET /api/v1/score-sets/{urn}/mapped-variants`: + +| Field | Description | +|-------|-------------| +| `variantUrn` | URN of the source variant — use this to join with `accession` in the CSV files | +| `preMapped` | VRS allele or haplotype using coordinates on the assay's reference sequence (transcript or protein accession) | +| `postMapped` | VRS allele or haplotype lifted over to GRCh38 genomic coordinates | +| `vrsVersion` | VRS schema version used to encode these objects (e.g., `"1.3"`, `"2.0"`) | +| `mappingApiVersion` | Version of the dcd_mapping service that produced this result | +| `mappedDate` | Date the mapping was produced | +| `modificationDate` | Date this mapping record was last modified | +| `current` | `true` if this is the active mapping for the variant; `false` for superseded mappings | +| `errorMessage` | Diagnostic message if mapping failed; `null` on success | +| `clingenAlleleId` | ClinGen Allele Registry identifier, if the variant has been registered | + +`preMapped` and `postMapped` are raw GA4GH VRS objects (JSON). The `type` field within them may be +`"Allele"`, `"Haplotype"`, or `"CisPhasedBlock"` depending on the variant. Records where mapping +failed will have `preMapped: null`, `postMapped: null`, and a non-null `errorMessage`. **Only +present for score sets that have completed the MaveDB mapping pipeline.** + +--- + +## Working with this data + +### Joining files for a single score set + +All files for a given score set share the same variant identifier: + +- In CSV files: the `accession` column (e.g. `urn:mavedb:00000001-a-1#42`) +- In `mapped-variants.json`: the `variantUrn` field + +To combine scores with annotations or with VRS data, join on `accession` = `variantUrn`. + +### Linking files back to metadata + +A filename like `urn-mavedb-00000001-a-1.scores.csv` corresponds to the score set with +`"urn": "urn:mavedb:00000001-a-1"` in `main.json`. 
The filename prefix is the score set URN with +every colon (`:`) replaced by a hyphen (`-`). + +### Reconstructing score set metadata from `main.json` + +`main.json` contains the full metadata hierarchy. Score sets are nested inside experiments, which +are nested inside experiment sets. To find the metadata for a specific score set: + +```python +import json + +with open("main.json") as f: + data = json.load(f) + +target_urn = "urn:mavedb:00000001-a-1" +score_set = next( + ss + for es in data["experimentSets"] + for exp in es["experiments"] + for ss in exp["scoreSets"] + if ss["urn"] == target_urn +) +``` + +--- + +## Caveats + +- Only **published**, **CC0-licensed** data is included. Datasets with other licenses are not + present in this dump even if they are publicly visible on MaveDB. +- Annotation files (`.annotations.csv`) and mapped variant files (`.mapped-variants.json`) are + **only present for score sets that have been processed by the MaveDB variant mapping pipeline**. + Score sets that have not yet been mapped, or for which mapping failed entirely, will not have + these files. +- Mapping is applied per variant within a score set. A score set that has completed the mapping + pipeline may still contain individual variants with failed mappings. Those variants have `NA` in + all `mavedb.*`, `vep.*`, `gnomad.*`, and `clingen.*` columns in the annotations CSV, and + `preMapped: null` / `postMapped: null` in the JSON. +- The `mapped/` JSON files include **all** mapping records, not only the most recent ones. When a + score set is remapped, the previous records are retained with `current: false`. For most use + cases, filter to records where `current` is `true`. Annotations are always reported with respect + to the current mapping object. +- gnomAD allele frequencies in `annotations.csv` are sourced from **gnomAD v4.1** specifically. +- `preMapped` VRS objects reference the assay's input sequence (a transcript or protein accession). 
+ `postMapped` VRS objects are remapped to the **GRCh38** reference genome. Do not compare + coordinates between `preMapped` and `postMapped` directly. +- Assay-level HGVS strings (`hgvs_nt`, `hgvs_pro`, `hgvs_splice`) are in **MAVE-HGVS format**, a + constrained community convention that may not parse with a standard HGVS library. +- Score values are **not normalized** across score sets. Each score set defines its own scale, + range, and interpretation. A score of `1.0` in one score set has no defined relationship to a + score of `1.0` in another. +- The data in this dump reflects the state of MaveDB at the time of export, as recorded in the + `asOf` UTC timestamp in `main.json`. It may not reflect changes made after that time. + +--- + +## License + +All data in this archive is released under the +[Creative Commons CC0 1.0 Universal (CC0 1.0) Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/). + +See `LICENSE.txt` for the full license text. From 8fb8bf95b7ddf6c174fddbefe02dd3cb5e8fc669 Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Mon, 15 Jun 2026 16:48:20 -0700 Subject: [PATCH 3/4] feat(export): add VA-Spec annotations to public data dump Emit a va/{urn}.va.ndjson file per mapped score set in the public data export, one record per current mapped variant carrying its highest materialized VA-Spec layer. 
- add variant_highest_level_annotation to resolve the highest available layer (pathogenicity > functional statement > study result), returning None for variants without a post-mapped allele - extract get_current_mapped_variants_for_annotation as the shared eager-load source of truth for the annotated-variant endpoints and the export, and route the three streaming endpoints through it - document the va/ output, layer ladder, and the functional-evidence-only caveat in the dump README - cover the resolver with unit tests across the uncalibrated, functional, pathogenicity, and unmapped cases --- src/mavedb/lib/annotation/annotate.py | 22 ++++++++- src/mavedb/lib/score_sets.py | 47 +++++++++++++++---- src/mavedb/routers/score_sets.py | 58 ++--------------------- src/mavedb/scripts/export_public_data.py | 25 +++++++++- src/mavedb/scripts/resources/README.md | 59 +++++++++++++++++++++--- tests/lib/annotation/test_annotate.py | 36 +++++++++++++++ 6 files changed, 177 insertions(+), 70 deletions(-) diff --git a/src/mavedb/lib/annotation/annotate.py b/src/mavedb/lib/annotation/annotate.py index 0453fc40..e5e289a3 100644 --- a/src/mavedb/lib/annotation/annotate.py +++ b/src/mavedb/lib/annotation/annotate.py @@ -8,12 +8,13 @@ See: https://va-spec.ga4gh.org/en/latest/va-standard-profiles/community-profiles/acmg-2015-profiles.html#variant-pathogenicity-statement-acmg-2015 """ -from typing import Optional +from typing import Optional, Union from ga4gh.va_spec.acmg_2015 import VariantPathogenicityStatement from ga4gh.va_spec.base.core import ExperimentalVariantFunctionalImpactStudyResult, Statement from mavedb.lib.annotation.classification import functional_classification_of_variant +from mavedb.lib.annotation.exceptions import MappingDataDoesntExistException from mavedb.lib.annotation.evidence_line import acmg_evidence_line, functional_evidence_line from mavedb.lib.annotation.proposition import ( mapped_variant_to_experimental_variant_clinical_impact_proposition, @@ -132,3 +133,22 
@@ def variant_pathogenicity_statement( return mapped_variant_to_pathogenicity_statement( mapped_variant, clinical_proposition, clinical_evidence, strongest_calibration, strongest_range ) + + +def variant_highest_level_annotation( + mapped_variant: MappedVariant, +) -> Optional[Union[ExperimentalVariantFunctionalImpactStudyResult, Statement, VariantPathogenicityStatement]]: + """ + Build the single highest-materialized VA-Spec layer for a mapped variant. + + Layer ladder (highest to lowest): pathogenicity statement -> functional impact statement -> study result. + Returns None when the variant has no post-mapped allele and therefore cannot be annotated. + """ + try: + if can_annotate_variant_for_pathogenicity_evidence(mapped_variant): + return variant_pathogenicity_statement(mapped_variant) + if can_annotate_variant_for_functional_statement(mapped_variant): + return variant_functional_impact_statement(mapped_variant) + return variant_study_result(mapped_variant) + except MappingDataDoesntExistException: + return None diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 5c33b3c5..8e3c8deb 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -410,9 +410,7 @@ def fetch_score_set_search_filter_options( controlled_keywords_counter_list = [] for key, label_counter in controlled_keywords_counter.items(): for label, count in label_counter.items(): - controlled_keywords_counter_list.append( - ControlledKeywordFilterOption(key=key, value=label, count=count) - ) + controlled_keywords_counter_list.append(ControlledKeywordFilterOption(key=key, value=label, count=count)) logger.debug(msg="Score set search filter options were fetched.", extra=logging_context()) @@ -556,6 +554,39 @@ def find_publish_or_private_superseded_score_set_tail( return score_set +def get_current_mapped_variants_for_annotation(db: Session, score_set: ScoreSet) -> Sequence[MappedVariant]: + """ + Load the current mapped variants for a score set with the 
relationships required to build VA-Spec + annotations eagerly loaded. + + This is the single source of truth for the eager-load shape shared by the annotated-variant + streaming endpoints and the public data export. The annotation builders reach through + ``MappedVariant.variant.score_set`` for publications, contributors, license, experiment, and score + calibrations, so each of those is loaded up front to avoid per-variant lazy loads. + """ + return ( + db.query(MappedVariant) + .join(MappedVariant.variant) + .join(Variant.score_set) + .filter(Variant.score_set_id == score_set.id) + .filter(MappedVariant.current.is_(True)) + .options( + contains_eager(MappedVariant.variant).contains_eager(Variant.score_set), + contains_eager(MappedVariant.variant) + .contains_eager(Variant.score_set) + .selectinload(ScoreSet.publication_identifier_associations), + contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by), + contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by), + contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license), + contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment), + contains_eager(MappedVariant.variant) + .contains_eager(Variant.score_set) + .selectinload(ScoreSet.score_calibrations), + ) + .all() + ) + + def get_score_set_variants_as_csv( db: Session, score_set: ScoreSet, @@ -643,11 +674,11 @@ def get_score_set_variants_as_csv( namespaced_score_set_columns[ns] = ["clinical_significance", "clinical_review_status"] need_mappings = ( - include_post_mapped_hgvs - or "clingen" in namespaces - or "vep" in namespaces - or "gnomad" in namespaces - or bool(clinvar_namespaces) + include_post_mapped_hgvs + or "clingen" in namespaces + or "vep" in namespaces + or "gnomad" in namespaces + or bool(clinvar_namespaces) ) need_gnomad = "gnomad" in namespaces diff --git 
a/src/mavedb/routers/score_sets.py b/src/mavedb/routers/score_sets.py index c8981252..371862d1 100644 --- a/src/mavedb/routers/score_sets.py +++ b/src/mavedb/routers/score_sets.py @@ -54,6 +54,7 @@ csv_data_to_df, fetch_score_set_search_filter_options, find_meta_analyses_for_experiment_sets, + get_current_mapped_variants_for_annotation, get_score_set_variants_as_csv, is_replaces_id_unique_violation, refresh_variant_urns, @@ -1288,24 +1289,7 @@ def get_score_set_annotated_variants( assert_permission(user_data, score_set, Action.READ) - mapped_variants = ( - db.query(MappedVariant) - .join(MappedVariant.variant) - .join(Variant.score_set) - .filter(ScoreSet.urn == urn) - .filter(MappedVariant.current.is_(True)) - .options( - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set), - contains_eager(MappedVariant.variant) - .contains_eager(Variant.score_set) - .selectinload(ScoreSet.publication_identifier_associations), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment), - ) - .all() - ) + mapped_variants = get_current_mapped_variants_for_annotation(db, score_set) if not mapped_variants: logger.info(msg="No mapped variants are associated with the requested score set.", extra=logging_context()) @@ -1397,24 +1381,7 @@ def get_score_set_annotated_variants_functional_statement( assert_permission(user_data, score_set, Action.READ) - mapped_variants = ( - db.query(MappedVariant) - .join(MappedVariant.variant) - .join(Variant.score_set) - .filter(ScoreSet.urn == urn) - .filter(MappedVariant.current.is_(True)) - .options( - 
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set), - contains_eager(MappedVariant.variant) - .contains_eager(Variant.score_set) - .selectinload(ScoreSet.publication_identifier_associations), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment), - ) - .all() - ) + mapped_variants = get_current_mapped_variants_for_annotation(db, score_set) if not mapped_variants: logger.info(msg="No mapped variants are associated with the requested score set.", extra=logging_context()) @@ -1510,24 +1477,7 @@ def get_score_set_annotated_variants_functional_study_result( assert_permission(user_data, score_set, Action.READ) - mapped_variants = ( - db.query(MappedVariant) - .join(MappedVariant.variant) - .join(Variant.score_set) - .filter(ScoreSet.urn == urn) - .filter(MappedVariant.current.is_(True)) - .options( - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set), - contains_eager(MappedVariant.variant) - .contains_eager(Variant.score_set) - .selectinload(ScoreSet.publication_identifier_associations), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license), - contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment), - ) - .all() - ) + mapped_variants = get_current_mapped_variants_for_annotation(db, score_set) if not mapped_variants: logger.info(msg="No mapped 
variants are associated with the requested score set.", extra=logging_context()) diff --git a/src/mavedb/scripts/export_public_data.py b/src/mavedb/scripts/export_public_data.py index 77d9ac4a..9a1d0a5b 100644 --- a/src/mavedb/scripts/export_public_data.py +++ b/src/mavedb/scripts/export_public_data.py @@ -25,7 +25,8 @@ from sqlalchemy import select from sqlalchemy.orm import Session, joinedload, lazyload -from mavedb.lib.score_sets import get_score_set_variants_as_csv +from mavedb.lib.annotation.annotate import variant_highest_level_annotation +from mavedb.lib.score_sets import get_current_mapped_variants_for_annotation, get_score_set_variants_as_csv from mavedb.models.experiment import Experiment from mavedb.models.experiment_set import ExperimentSet from mavedb.models.license import License @@ -175,6 +176,28 @@ def export_public_data(db: Session): f"[{i + 1}/{num_score_sets}] Wrote annotations + {len(mapped_variants)} mapped variants" ) + # Write VA-Spec annotations NDJSON — mirrors the GET /api/v1/score-sets/{urn}/annotated-variants/* + # streams, emitting one record per current mapped variant at its highest materialized VA level. + annotated_variants = get_current_mapped_variants_for_annotation(db, score_set) + + va_lines = [] + num_annotations = 0 + for mv in annotated_variants: + annotation = variant_highest_level_annotation(mv) + if annotation is not None: + num_annotations += 1 + record = { + "variant_urn": mv.variant.urn, + "annotation": annotation.model_dump(exclude_none=True) if annotation else None, + } + va_lines.append(json.dumps(record, default=str)) + + zipfile.writestr(f"va/{csv_filename_base}.va.ndjson", "\n".join(va_lines)) + logger.info( + f"[{i + 1}/{num_score_sets}] Wrote {len(va_lines)} VA-Spec records " + f"({num_annotations} non-null annotations)" + ) + # Only generate the counts CSV if count columns are present. 
count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None if count_columns and len(count_columns) > 0: diff --git a/src/mavedb/scripts/resources/README.md b/src/mavedb/scripts/resources/README.md index 3dcd0247..31ec4e26 100644 --- a/src/mavedb/scripts/resources/README.md +++ b/src/mavedb/scripts/resources/README.md @@ -37,8 +37,11 @@ mavedb-dump.YYYYMMDDHHMMSS.zip │ ├── {urn}.counts.csv # Variant counts (score sets with count data only) │ └── {urn}.annotations.csv # Variant annotations from VEP, gnomAD, and ClinGen │ # (score sets that have completed mapping only) -└── mapped/ - └── {urn}.mapped-variants.json # Mapped variant data including VRS alleles and HGVS +├── mapped/ +│ └── {urn}.mapped-variants.json # Mapped variant data including VRS alleles and HGVS +│ # (score sets that have completed mapping only) +└── va/ + └── {urn}.va.ndjson # GA4GH VA-Spec annotations, one record per current mapped variant # (score sets that have completed mapping only) ``` @@ -158,6 +161,47 @@ present for score sets that have completed the MaveDB mapping pipeline.** --- +### `va/{urn}.va.ndjson` + +[Newline-delimited JSON](https://ndjson.org/): one line per current mapped variant. Each line is an +envelope mirroring the `GET /api/v1/score-sets/{urn}/annotated-variants/*` streaming endpoints: + +```json +{"variant_urn": "urn:mavedb:00000001-a-1#1", "annotation": { ... }} +``` + +| Field | Description | +|-------|-------------| +| `variant_urn` | URN of the source variant — use this to join with `accession` in the CSV files | +| `annotation` | A single GA4GH VA-Spec object, or `null` | + +Rather than re-emitting every nested layer, each variant carries only its **highest materialized** +VA-Spec layer. The lower layers are not dropped — they are nested inside the higher one (the study +result sits inside the functional statement, which sits inside the pathogenicity statement). 
Both +statement layers serialize with `type: "Statement"`, so `annotation.type` alone does not distinguish +them — use `annotation.proposition.type`: + +| Layer | `annotation.type` | `annotation.proposition.type` | GA4GH class | Emitted when | +|-------|-------------------|-------------------------------|-------------|--------------| +| Pathogenicity statement | `Statement` | `VariantPathogenicityProposition` | `VariantPathogenicityStatement` | A non-research-use calibration with ACMG classifications exists | +| Functional impact statement | `Statement` | `ExperimentalVariantFunctionalImpactProposition` | `Statement` | A non-research-use calibration with functional ranges exists | +| Study result | `ExperimentalVariantFunctionalImpactStudyResult` | — | `ExperimentalVariantFunctionalImpactStudyResult` | Any variant that can be mapped (lowest layer) | + +**Note on the pathogenicity layer:** its `classification` (e.g. `Pathogenic` / `Uncertain +Significance` / `Benign`) integrates **only MaveDB functional evidence** — every eligible calibration +for the variant, with the strongest determining the statement-level classification — and not the +non-functional ACMG criteria (population frequency, segregation, computational predictions) that a +full clinical determination requires. Treat it as the functional contribution to a classification, to +be combined with other evidence downstream, not as a standalone clinical verdict. Research-use-only +calibrations are excluded. + +`annotation` is `null` for current mapped variants that have no post-mapped allele (and therefore +cannot be annotated); the `variant_urn` is still present on those lines. Every current mapped variant +produces exactly one line, so the line count equals the current mapped-variant count. 
**Only present +for score sets that have completed the MaveDB mapping pipeline.** + +--- + ## Working with this data ### Joining files for a single score set @@ -202,10 +246,13 @@ score_set = next( - Only **published**, **CC0-licensed** data is included. Datasets with other licenses are not present in this dump even if they are publicly visible on MaveDB. -- Annotation files (`.annotations.csv`) and mapped variant files (`.mapped-variants.json`) are - **only present for score sets that have been processed by the MaveDB variant mapping pipeline**. - Score sets that have not yet been mapped, or for which mapping failed entirely, will not have - these files. +- Annotation files (`.annotations.csv`), mapped variant files (`.mapped-variants.json`), and + VA-Spec files (`.va.ndjson`) are **only present for score sets that have been processed by the + MaveDB variant mapping pipeline**. Score sets that have not yet been mapped, or for which mapping + failed entirely, will not have these files. +- The `va/` files carry only each variant's highest materialized VA-Spec layer (see + [`va/{urn}.va.ndjson`](#vaurnvandjson)). The pathogenicity layer's classification reflects MaveDB + functional evidence only, not a full clinical ACMG determination. - Mapping is applied per variant within a score set. A score set that has completed the mapping pipeline may still contain individual variants with failed mappings. 
Those variants have `NA` in all `mavedb.*`, `vep.*`, `gnomad.*`, and `clingen.*` columns in the annotations CSV, and diff --git a/tests/lib/annotation/test_annotate.py b/tests/lib/annotation/test_annotate.py index f9118fc2..b05c2c18 100644 --- a/tests/lib/annotation/test_annotate.py +++ b/tests/lib/annotation/test_annotate.py @@ -15,6 +15,7 @@ from mavedb.lib.annotation.annotate import ( variant_functional_impact_statement, + variant_highest_level_annotation, variant_pathogenicity_statement, variant_study_result, ) @@ -294,3 +295,38 @@ def test_pathogenicity_evidence_line_has_evidence_items_are_statement_instances( evidence_item, dict ), "hasEvidenceItems contained a raw dict instead of a model instance" assert evidence_item.type == "Statement" + + +@pytest.mark.unit +class TestVariantHighestLevelAnnotation: + """Unit tests for the highest-materialized-layer resolver used by the public data dump.""" + + def test_study_result_when_uncalibrated(self, mock_mapped_variant): + result = variant_highest_level_annotation(mock_mapped_variant) + + assert result is not None + assert result.type == "ExperimentalVariantFunctionalImpactStudyResult" + + def test_functional_statement_when_functional_only(self, mock_mapped_variant_with_functional_calibration_score_set): + # The functional calibration fixture has no ACMG classifications, so the variant qualifies for the + # functional layer but not pathogenicity. 
+ result = variant_highest_level_annotation(mock_mapped_variant_with_functional_calibration_score_set) + + assert result is not None + assert result.type == "Statement" + assert result.proposition.type == "ExperimentalVariantFunctionalImpactProposition" + + def test_pathogenicity_statement_when_calibrated( + self, mock_mapped_variant_with_pathogenicity_calibration_score_set + ): + result = variant_highest_level_annotation(mock_mapped_variant_with_pathogenicity_calibration_score_set) + + assert result is not None + assert result.type == "Statement" + assert result.proposition.type == "VariantPathogenicityProposition" + + def test_none_when_unmapped(self, mock_mapped_variant): + mock_mapped_variant.post_mapped = None + + result = variant_highest_level_annotation(mock_mapped_variant) + assert result is None From 5c155f4dae9b27929046fb8b5817bc04c53c3a45 Mon Sep 17 00:00:00 2001 From: Benjamin Capodanno Date: Tue, 16 Jun 2026 11:33:26 -0700 Subject: [PATCH 4/4] fix(annotation): make VA-Spec dump output valid and round-trippable - Omit the score-calibration "Baseline score" extension when no baseline score exists. Extension.value is required, so a null value was stripped by model_dump(exclude_none=True) and the object no longer re-parsed through the VA-Spec models. This also corrects the API's VA-Spec streaming endpoints, which share the builder. - Gate dump annotation files on the presence of current mapped variants, so score sets whose mappings are all superseded no longer emit empty or stale annotation files. - Newline-terminate every NDJSON record to match the API streams and keep line-based consumers happy. - Add regression tests covering the baseline-score extension round-trip. 
--- src/mavedb/lib/annotation/document.py | 19 ++++++++++++----- src/mavedb/scripts/export_public_data.py | 15 ++++++++++--- tests/lib/annotation/test_document.py | 27 ++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/src/mavedb/lib/annotation/document.py b/src/mavedb/lib/annotation/document.py index aba2e4ce..03914203 100644 --- a/src/mavedb/lib/annotation/document.py +++ b/src/mavedb/lib/annotation/document.py @@ -84,11 +84,20 @@ def score_calibration_as_document(score_calibration: ScoreCalibration) -> Docume name="MaveDB Score Calibration", title=score_calibration.title, extensions=[ - Extension( - name="Baseline score", - value=score_calibration.baseline_score, - description=score_calibration.baseline_score_description - or "No description for this baseline score provided.", + # Omit the baseline-score extension when no baseline score exists: Extension.value is required, + # so model_dump(exclude_none=True) would strip the null value and leave an extension that does + # not round trip when served by the API. + *( + [ + Extension( + name="Baseline score", + value=score_calibration.baseline_score, + description=score_calibration.baseline_score_description + or "No description for this baseline score provided.", + ) + ] + if score_calibration.baseline_score is not None + else [] ), Extension( name="Research use only", diff --git a/src/mavedb/scripts/export_public_data.py b/src/mavedb/scripts/export_public_data.py index 9a1d0a5b..4ec1b3fb 100644 --- a/src/mavedb/scripts/export_public_data.py +++ b/src/mavedb/scripts/export_public_data.py @@ -145,10 +145,17 @@ def export_public_data(db: Session): csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"], namespaced=True) zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str) - # Only generate annotation files if mapped variants exist in the score set. + # Only generate annotation files if the score set has at least one current mapped variant.
+ # A score set whose mappings are all superseded (no current mapping) yields no annotations, + # so we skip emitting empty/superseded-only annotation files for it entirely. has_annotations = ( db.scalars( - select(ScoreSet).where(ScoreSet.id == score_set_id).join(Variant).join(MappedVariant).limit(1) + select(ScoreSet) + .where(ScoreSet.id == score_set_id) + .join(Variant) + .join(MappedVariant) + .where(MappedVariant.current.is_(True)) + .limit(1) ).one_or_none() is not None ) @@ -192,7 +199,9 @@ def export_public_data(db: Session): } va_lines.append(json.dumps(record, default=str)) - zipfile.writestr(f"va/{csv_filename_base}.va.ndjson", "\n".join(va_lines)) + # Newline-terminate every record (including the last) to match the API NDJSON streams + # and keep line-based consumers happy. + zipfile.writestr(f"va/{csv_filename_base}.va.ndjson", "".join(line + "\n" for line in va_lines)) logger.info( f"[{i + 1}/{num_score_sets}] Wrote {len(va_lines)} VA-Spec records " f"({num_annotations} non-null annotations)" diff --git a/tests/lib/annotation/test_document.py b/tests/lib/annotation/test_document.py index eb7558ee..0a9b0ac8 100644 --- a/tests/lib/annotation/test_document.py +++ b/tests/lib/annotation/test_document.py @@ -14,16 +14,20 @@ pytest.importorskip("psycopg2") +from ga4gh.va_spec.base.core import Document + from mavedb.lib.annotation.document import ( experiment_as_iri, experiment_to_document, mapped_variant_as_iri, mapped_variant_to_document, + score_calibration_as_document, score_set_as_iri, score_set_to_document, variant_as_iri, variant_to_document, ) +from tests.helpers.mocks.factories import create_mock_score_calibration BASE_URL = "https://mavedb.org" @@ -134,3 +138,26 @@ def test_variant_to_document(self, mock_variant): assert document.documentType == "genomic variant description" assert len(document.urls) > 0 assert variant_as_iri(mock_variant).root in document.urls + + +@pytest.mark.unit +class TestScoreCalibrationDocumentFunctions: + """Unit tests 
for score calibration document creation.""" + + def test_includes_baseline_score_extension_when_present(self, mock_score_set): + calibration = create_mock_score_calibration(score_set=mock_score_set, baseline_score=1.5) + document = score_calibration_as_document(calibration) + + baseline = [ext for ext in document.extensions if ext.name == "Baseline score"] + assert len(baseline) == 1 + assert baseline[0].value == 1.5 + + def test_omits_baseline_score_extension_when_none_and_round_trips(self, mock_score_set): + # Extension.value is required, so exclude_none strips a None `value` from the serialized extension, + # which then no longer re-parses. The builder must omit the extension entirely instead. + calibration = create_mock_score_calibration(score_set=mock_score_set, baseline_score=None) + document = score_calibration_as_document(calibration) + + assert all(ext.name != "Baseline score" for ext in document.extensions) + # Regression guard: the exclude_none serialization must round-trip back through the model. + Document(**document.model_dump(exclude_none=True))