VariantEffect · bencap · Jun 23, 2026 · Apr 14, 2026 · Apr 14, 2026 · Jun 15, 2026
diff --git a/src/mavedb/lib/annotation/annotate.py b/src/mavedb/lib/annotation/annotate.py
@@ -8,12 +8,13 @@
     See: https://va-spec.ga4gh.org/en/latest/va-standard-profiles/community-profiles/acmg-2015-profiles.html#variant-pathogenicity-statement-acmg-2015
 """
 
-from typing import Optional
+from typing import Optional, Union
 
 from ga4gh.va_spec.acmg_2015 import VariantPathogenicityStatement
 from ga4gh.va_spec.base.core import ExperimentalVariantFunctionalImpactStudyResult, Statement
 
 from mavedb.lib.annotation.classification import functional_classification_of_variant
+from mavedb.lib.annotation.exceptions import MappingDataDoesntExistException
 from mavedb.lib.annotation.evidence_line import acmg_evidence_line, functional_evidence_line
 from mavedb.lib.annotation.proposition import (
     mapped_variant_to_experimental_variant_clinical_impact_proposition,
@@ -132,3 +133,22 @@ def variant_pathogenicity_statement(
     return mapped_variant_to_pathogenicity_statement(
         mapped_variant, clinical_proposition, clinical_evidence, strongest_calibration, strongest_range
     )
+
+
+def variant_highest_level_annotation(
+    mapped_variant: MappedVariant,
+) -> Optional[Union[ExperimentalVariantFunctionalImpactStudyResult, Statement, VariantPathogenicityStatement]]:
+    """
+    Build the single highest-materialized VA-Spec layer for a mapped variant.
+
+    Layer ladder (highest to lowest): pathogenicity statement -> functional impact statement -> study result.
+    Returns None when a step raises MappingDataDoesntExistException (variant has no post-mapped allele).
+    """
+    try:
+        if can_annotate_variant_for_pathogenicity_evidence(mapped_variant):  # highest layer is tried first
+            return variant_pathogenicity_statement(mapped_variant)
+        if can_annotate_variant_for_functional_statement(mapped_variant):
+            return variant_functional_impact_statement(mapped_variant)
+        return variant_study_result(mapped_variant)  # lowest layer: fallback when neither statement applies
+    except MappingDataDoesntExistException:  # may come from any call in the try body above
+        return None
diff --git a/src/mavedb/lib/annotation/document.py b/src/mavedb/lib/annotation/document.py
@@ -84,11 +84,20 @@ def score_calibration_as_document(score_calibration: ScoreCalibration) -> Docume
         name="MaveDB Score Calibration",
         title=score_calibration.title,
         extensions=[
-            Extension(
-                name="Baseline score",
-                value=score_calibration.baseline_score,
-                description=score_calibration.baseline_score_description
-                or "No description for this baseline score provided.",
+            # Omit the baseline-score extension when no baseline score exists: Extension.value is required,
+            # but model_dump(exclude_none=True) drops a null `value` key, so the serialized extension would
+            # be missing its required field and would not round trip when served by the API.
+            *(
+                [
+                    Extension(
+                        name="Baseline score",
+                        value=score_calibration.baseline_score,
+                        description=score_calibration.baseline_score_description
+                        or "No description for this baseline score provided.",
+                    )
+                ]
+                if score_calibration.baseline_score is not None
+                else []
             ),
             Extension(
                 name="Research use only",

diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py
@@ -410,9 +410,7 @@ def fetch_score_set_search_filter_options(
     controlled_keywords_counter_list = []
     for key, label_counter in controlled_keywords_counter.items():
         for label, count in label_counter.items():
-            controlled_keywords_counter_list.append(
-                ControlledKeywordFilterOption(key=key, value=label, count=count)
-            )
+            controlled_keywords_counter_list.append(ControlledKeywordFilterOption(key=key, value=label, count=count))
 
     logger.debug(msg="Score set search filter options were fetched.", extra=logging_context())
 
@@ -556,6 +554,39 @@ def find_publish_or_private_superseded_score_set_tail(
     return score_set
 
 
+def get_current_mapped_variants_for_annotation(db: Session, score_set: ScoreSet) -> Sequence[MappedVariant]:
+    """
+    Load the current mapped variants for a score set with the relationships required to build VA-Spec
+    annotations eagerly loaded. Returns a list (``.all()``) restricted to ``MappedVariant.current`` rows.
+
+    This is the single source of truth for the eager-load shape shared by the annotated-variant
+    streaming endpoints and the public data export. The annotation builders reach through
+    ``MappedVariant.variant.score_set`` for publications, creating/modifying users, license, experiment,
+    and score calibrations, so each of those is loaded up front to avoid per-variant lazy loads.
+    """
+    return (
+        db.query(MappedVariant)
+        .join(MappedVariant.variant)
+        .join(Variant.score_set)
+        .filter(Variant.score_set_id == score_set.id)
+        .filter(MappedVariant.current.is_(True))
+        .options(
+            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set),  # reuse the joins above
+            contains_eager(MappedVariant.variant)
+            .contains_eager(Variant.score_set)
+            .selectinload(ScoreSet.publication_identifier_associations),
+            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by),
+            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by),
+            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license),
+            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment),
+            contains_eager(MappedVariant.variant)
+            .contains_eager(Variant.score_set)
+            .selectinload(ScoreSet.score_calibrations),
+        )
+        .all()
+    )
+
+
 def get_score_set_variants_as_csv(
     db: Session,
     score_set: ScoreSet,
@@ -643,11 +674,11 @@ def get_score_set_variants_as_csv(
             namespaced_score_set_columns[ns] = ["clinical_significance", "clinical_review_status"]
 
     need_mappings = (
-            include_post_mapped_hgvs
-            or "clingen" in namespaces
-            or "vep" in namespaces
-            or "gnomad" in namespaces
-            or bool(clinvar_namespaces)
+        include_post_mapped_hgvs
+        or "clingen" in namespaces
+        or "vep" in namespaces
+        or "gnomad" in namespaces
+        or bool(clinvar_namespaces)
     )
     need_gnomad = "gnomad" in namespaces
 

diff --git a/src/mavedb/routers/score_sets.py b/src/mavedb/routers/score_sets.py
@@ -54,6 +54,7 @@
     csv_data_to_df,
     fetch_score_set_search_filter_options,
     find_meta_analyses_for_experiment_sets,
+    get_current_mapped_variants_for_annotation,
     get_score_set_variants_as_csv,
     is_replaces_id_unique_violation,
     refresh_variant_urns,
@@ -1288,24 +1289,7 @@ def get_score_set_annotated_variants(
 
     assert_permission(user_data, score_set, Action.READ)
 
-    mapped_variants = (
-        db.query(MappedVariant)
-        .join(MappedVariant.variant)
-        .join(Variant.score_set)
-        .filter(ScoreSet.urn == urn)
-        .filter(MappedVariant.current.is_(True))
-        .options(
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set),
-            contains_eager(MappedVariant.variant)
-            .contains_eager(Variant.score_set)
-            .selectinload(ScoreSet.publication_identifier_associations),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment),
-        )
-        .all()
-    )
+    mapped_variants = get_current_mapped_variants_for_annotation(db, score_set)
 
     if not mapped_variants:
         logger.info(msg="No mapped variants are associated with the requested score set.", extra=logging_context())
@@ -1397,24 +1381,7 @@ def get_score_set_annotated_variants_functional_statement(
 
     assert_permission(user_data, score_set, Action.READ)
 
-    mapped_variants = (
-        db.query(MappedVariant)
-        .join(MappedVariant.variant)
-        .join(Variant.score_set)
-        .filter(ScoreSet.urn == urn)
-        .filter(MappedVariant.current.is_(True))
-        .options(
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set),
-            contains_eager(MappedVariant.variant)
-            .contains_eager(Variant.score_set)
-            .selectinload(ScoreSet.publication_identifier_associations),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment),
-        )
-        .all()
-    )
+    mapped_variants = get_current_mapped_variants_for_annotation(db, score_set)
 
     if not mapped_variants:
         logger.info(msg="No mapped variants are associated with the requested score set.", extra=logging_context())
@@ -1510,24 +1477,7 @@ def get_score_set_annotated_variants_functional_study_result(
 
     assert_permission(user_data, score_set, Action.READ)
 
-    mapped_variants = (
-        db.query(MappedVariant)
-        .join(MappedVariant.variant)
-        .join(Variant.score_set)
-        .filter(ScoreSet.urn == urn)
-        .filter(MappedVariant.current.is_(True))
-        .options(
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set),
-            contains_eager(MappedVariant.variant)
-            .contains_eager(Variant.score_set)
-            .selectinload(ScoreSet.publication_identifier_associations),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license),
-            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment),
-        )
-        .all()
-    )
+    mapped_variants = get_current_mapped_variants_for_annotation(db, score_set)
 
     if not mapped_variants:
         logger.info(msg="No mapped variants are associated with the requested score set.", extra=logging_context())

diff --git a/src/mavedb/scripts/export_public_data.py b/src/mavedb/scripts/export_public_data.py
@@ -6,22 +6,8 @@
 python3 -m mavedb.scripts.export_public_data
 ```
 
-This generates a ZIP archive named `mavedb-dump.zip` in the working directory. the ZIP file has the following contents:
-- main.json: A JSON file providing metadata for all of the published experiment sets, experiments, and score sets
-- LICENSE.txt: The text of the Creative Commons Zero license, which applies to all data included in the dump.
-- variants/
-  - [URN].counts.csv (for each variant URN): The score set's variant count columns,
-    sorted by variant number
-  - [URN].scores.csv (for each variant URN): The score set's variant count columns,
-    sorted by variant number
-  - [URN].annotations.csv (for each variant URN with mapped variants): The score set's variant annotations, sorted by
-    variant number. This file is only included for score sets with mapped variants, and includes VEP, gnomAD, and ClinGen annotations.
-
-In the exported JSON metadata, the root object's `experimentSets` property gives an array of experiment sets.
-Experiments are nested in their parent experiment sets, and score sets in their parent experiments.
-
-The variant URNs used in filenames do not include the `urn:mavedb:` scheme identifier, so they look like
-`00000001-a-1.counts.csv` and `00000001-a-1.scores.csv`, for instance.
+This generates a ZIP archive named `mavedb-dump.YYYYMMDDHHMMSS.zip` in the working directory.
+See `src/mavedb/scripts/resources/README.md` for a full description of the archive contents and file formats.
 
 Unpublished data and data sets licensed other than under the Creative Commons Zero license are not included in the dump,
 and user details are limited to ORCID IDs and names of contributors to published data sets.
@@ -37,16 +23,18 @@
 
 from fastapi.encoders import jsonable_encoder
 from sqlalchemy import select
-from sqlalchemy.orm import Session, lazyload
+from sqlalchemy.orm import Session, joinedload, lazyload
 
-from mavedb.lib.score_sets import get_score_set_variants_as_csv
+from mavedb.lib.annotation.annotate import variant_highest_level_annotation
+from mavedb.lib.score_sets import get_current_mapped_variants_for_annotation, get_score_set_variants_as_csv
 from mavedb.models.experiment import Experiment
 from mavedb.models.experiment_set import ExperimentSet
 from mavedb.models.license import License
 from mavedb.models.mapped_variant import MappedVariant
 from mavedb.models.score_set import ScoreSet
 from mavedb.models.variant import Variant
 from mavedb.scripts.environment import script_environment, with_database_session
+from mavedb.view_models import mapped_variant as mapped_variant_vm
 from mavedb.view_models.experiment_set import ExperimentSetPublicDump
 
 logger = logging.getLogger(__name__)
@@ -114,6 +102,7 @@ def export_public_data(db: Session):
     # Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score
     # sets.
     experiment_sets = list(filter_experiment_sets(experiment_sets_query.all()))
+    logger.info(f"Found {len(experiment_sets)} published experiment sets with CC0-licensed score sets.")
 
     # TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator.
     # Issue: https://github.com/VariantEffect/mavedb-api/issues/192
@@ -129,7 +118,7 @@ def export_public_data(db: Session):
     timestamp_format = "%Y%m%d%H%M%S"
     zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip"
 
-    logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json")
+    logger.info(f"Writing {zip_file_name} with {len(score_set_ids)} score sets.")
     json_data = {
         "title": "MaveDB public data",
         "asOf": datetime.now(timezone.utc).isoformat(),
@@ -140,24 +129,33 @@ def export_public_data(db: Session):
         # Write metadata for all data sets to a single JSON file.
         zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data)))
 
-        # Copy the CC0 license.
-        zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt")
+        # Copy the CC0 license and README.
+        resources_dir = os.path.join(os.path.dirname(__file__), "resources")
+        zipfile.write(os.path.join(resources_dir, "CC0_license.txt"), "LICENSE.txt")
+        zipfile.write(os.path.join(resources_dir, "README.md"), "README.md")
 
         # Write score and count files for each score set.
         num_score_sets = len(score_set_ids)
         for i, score_set_id in enumerate(score_set_ids):
             score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none()
             if score_set is not None and score_set.urn is not None:
-                logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
+                logger.info(f"[{i + 1}/{num_score_sets}] Exporting score set {score_set.urn}")
                 csv_filename_base = score_set.urn.replace(":", "-")
 
                 csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"], namespaced=True)
                 zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)
 
-                # Only generate the annotations CSV if mapped variants exist in the score set.
+                # Only generate annotation files if the score set has at least one current mapped variant.
+                # A score set whose mappings are all superseded (no current mapping) yields no annotations,
+                # so we skip emitting empty/superseded-only annotation files for it entirely.
                 has_annotations = (
                     db.scalars(
-                        select(ScoreSet).where(ScoreSet.id == score_set_id).join(Variant).join(MappedVariant).limit(1)
+                        select(ScoreSet)
+                        .where(ScoreSet.id == score_set_id)
+                        .join(Variant)
+                        .join(MappedVariant)
+                        .where(MappedVariant.current.is_(True))
+                        .limit(1)
                     ).one_or_none()
                     is not None
                 )
@@ -167,12 +165,56 @@ def export_public_data(db: Session):
                     )
                     zipfile.writestr(f"csv/{csv_filename_base}.annotations.csv", csv_str)
 
+                    # Write mapped variants JSON — mirrors GET /api/v1/score-sets/{urn}/mapped-variants.
+                    mapped_variants = db.scalars(
+                        select(MappedVariant)
+                        .join(Variant, Variant.id == MappedVariant.variant_id)
+                        .options(joinedload(MappedVariant.variant))
+                        .where(Variant.score_set_id == score_set_id)
+                    ).all()
+                    mapped_variant_views = [
+                        mapped_variant_vm.MappedVariant.model_validate(mv) for mv in mapped_variants
+                    ]
+                    zipfile.writestr(
+                        f"mapped/{csv_filename_base}.mapped-variants.json",
+                        json.dumps(jsonable_encoder(mapped_variant_views)),
+                    )
+                    logger.info(
+                        f"[{i + 1}/{num_score_sets}]   Wrote annotations + {len(mapped_variants)} mapped variants"
+                    )
+
+                    # Write VA-Spec annotations NDJSON — mirrors the GET /api/v1/score-sets/{urn}/annotated-variants/*
+                    # streams, emitting one record per current mapped variant at its highest materialized VA level.
+                    annotated_variants = get_current_mapped_variants_for_annotation(db, score_set)
+
+                    va_lines = []
+                    num_annotations = 0
+                    for mv in annotated_variants:
+                        annotation = variant_highest_level_annotation(mv)
+                        if annotation is not None:
+                            num_annotations += 1
+                        record = {
+                            "variant_urn": mv.variant.urn,
+                            "annotation": annotation.model_dump(exclude_none=True) if annotation else None,
+                        }
+                        va_lines.append(json.dumps(record, default=str))
+
+                    # Newline-terminate every record (including the last) to match the API NDJSON streams
+                    # and keep line-based consumers happy.
+                    zipfile.writestr(f"va/{csv_filename_base}.va.ndjson", "".join(line + "\n" for line in va_lines))
+                    logger.info(
+                        f"[{i + 1}/{num_score_sets}]   Wrote {len(va_lines)} VA-Spec records "
+                        f"({num_annotations} non-null annotations)"
+                    )
+
                 # Only generate the counts CSV if count columns are present.
                 count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
                 if count_columns and len(count_columns) > 0:
                     csv_str = get_score_set_variants_as_csv(db, score_set, ["counts"], namespaced=True)
                     zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)
 
+    logger.info(f"Export complete: {zip_file_name}")
+
 
 if __name__ == "__main__":
     export_public_data()