Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion src/mavedb/lib/annotation/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
See: https://va-spec.ga4gh.org/en/latest/va-standard-profiles/community-profiles/acmg-2015-profiles.html#variant-pathogenicity-statement-acmg-2015
"""

from typing import Optional
from typing import Optional, Union

from ga4gh.va_spec.acmg_2015 import VariantPathogenicityStatement
from ga4gh.va_spec.base.core import ExperimentalVariantFunctionalImpactStudyResult, Statement

from mavedb.lib.annotation.classification import functional_classification_of_variant
from mavedb.lib.annotation.exceptions import MappingDataDoesntExistException
from mavedb.lib.annotation.evidence_line import acmg_evidence_line, functional_evidence_line
from mavedb.lib.annotation.proposition import (
mapped_variant_to_experimental_variant_clinical_impact_proposition,
Expand Down Expand Up @@ -132,3 +133,22 @@ def variant_pathogenicity_statement(
return mapped_variant_to_pathogenicity_statement(
mapped_variant, clinical_proposition, clinical_evidence, strongest_calibration, strongest_range
)


def variant_highest_level_annotation(
    mapped_variant: MappedVariant,
) -> Optional[Union[ExperimentalVariantFunctionalImpactStudyResult, Statement, VariantPathogenicityStatement]]:
    """
    Return the annotation for the highest VA-Spec layer that this mapped variant supports.

    Layers are tried from highest to lowest: pathogenicity statement, then functional impact
    statement, then study result. The first eligibility check that passes selects the builder; the
    study result is the fallback when neither check passes.

    Returns None when MappingDataDoesntExistException is raised, i.e. the variant has no post-mapped
    allele and therefore cannot be annotated.
    """
    try:
        # Choose the builder first, then invoke it once. The eligibility checks stay inside the
        # ``try`` so a missing-mapping error raised by either of them is handled the same way as
        # one raised by the builder itself.
        if can_annotate_variant_for_pathogenicity_evidence(mapped_variant):
            build_annotation = variant_pathogenicity_statement
        elif can_annotate_variant_for_functional_statement(mapped_variant):
            build_annotation = variant_functional_impact_statement
        else:
            build_annotation = variant_study_result

        return build_annotation(mapped_variant)
    except MappingDataDoesntExistException:
        return None
19 changes: 14 additions & 5 deletions src/mavedb/lib/annotation/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,20 @@ def score_calibration_as_document(score_calibration: ScoreCalibration) -> Docume
name="MaveDB Score Calibration",
title=score_calibration.title,
extensions=[
Extension(
name="Baseline score",
value=score_calibration.baseline_score,
description=score_calibration.baseline_score_description
or "No description for this baseline score provided.",
# Omit the baseline-score extension when no baseline score exists: Extension.value is required,
# so an extension with a null value will be dropped by model_dump(exclude_none=True) and will
# not round trip when served by the API.
*(
[
Extension(
name="Baseline score",
value=score_calibration.baseline_score,
description=score_calibration.baseline_score_description
or "No description for this baseline score provided.",
)
]
if score_calibration.baseline_score is not None
else []
),
Extension(
name="Research use only",
Expand Down
47 changes: 39 additions & 8 deletions src/mavedb/lib/score_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,9 +410,7 @@ def fetch_score_set_search_filter_options(
controlled_keywords_counter_list = []
for key, label_counter in controlled_keywords_counter.items():
for label, count in label_counter.items():
controlled_keywords_counter_list.append(
ControlledKeywordFilterOption(key=key, value=label, count=count)
)
controlled_keywords_counter_list.append(ControlledKeywordFilterOption(key=key, value=label, count=count))

logger.debug(msg="Score set search filter options were fetched.", extra=logging_context())

Expand Down Expand Up @@ -556,6 +554,39 @@ def find_publish_or_private_superseded_score_set_tail(
return score_set


def get_current_mapped_variants_for_annotation(db: Session, score_set: ScoreSet) -> Sequence[MappedVariant]:
    """
    Fetch the current mapped variants of a score set, with everything the VA-Spec annotation
    builders need already loaded.

    This is the single source of truth for the eager-load shape shared by the annotated-variant
    streaming endpoints and the public data export. The annotation builders reach through
    ``MappedVariant.variant.score_set`` for publications, contributors, license, experiment, and score
    calibrations, so each of those is loaded up front to avoid per-variant lazy loads.
    """
    # Score set relationships that are select-in loaded beneath the joined
    # MappedVariant -> Variant -> ScoreSet path. Order matches the option order passed to the query.
    score_set_relationships = (
        ScoreSet.publication_identifier_associations,
        ScoreSet.created_by,
        ScoreSet.modified_by,
        ScoreSet.license,
        ScoreSet.experiment,
        ScoreSet.score_calibrations,
    )

    # The joined path itself is populated from the query's own joins; every additional option
    # rebuilds that path from scratch before attaching its select-in load.
    loader_options = [contains_eager(MappedVariant.variant).contains_eager(Variant.score_set)]
    for relationship in score_set_relationships:
        loader_options.append(
            contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(relationship)
        )

    current_mapped_variants = (
        db.query(MappedVariant)
        .join(MappedVariant.variant)
        .join(Variant.score_set)
        .filter(Variant.score_set_id == score_set.id)
        .filter(MappedVariant.current.is_(True))
    )
    return current_mapped_variants.options(*loader_options).all()


def get_score_set_variants_as_csv(
db: Session,
score_set: ScoreSet,
Expand Down Expand Up @@ -643,11 +674,11 @@ def get_score_set_variants_as_csv(
namespaced_score_set_columns[ns] = ["clinical_significance", "clinical_review_status"]

need_mappings = (
include_post_mapped_hgvs
or "clingen" in namespaces
or "vep" in namespaces
or "gnomad" in namespaces
or bool(clinvar_namespaces)
include_post_mapped_hgvs
or "clingen" in namespaces
or "vep" in namespaces
or "gnomad" in namespaces
or bool(clinvar_namespaces)
)
need_gnomad = "gnomad" in namespaces

Expand Down
58 changes: 4 additions & 54 deletions src/mavedb/routers/score_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
csv_data_to_df,
fetch_score_set_search_filter_options,
find_meta_analyses_for_experiment_sets,
get_current_mapped_variants_for_annotation,
get_score_set_variants_as_csv,
is_replaces_id_unique_violation,
refresh_variant_urns,
Expand Down Expand Up @@ -1288,24 +1289,7 @@ def get_score_set_annotated_variants(

assert_permission(user_data, score_set, Action.READ)

mapped_variants = (
db.query(MappedVariant)
.join(MappedVariant.variant)
.join(Variant.score_set)
.filter(ScoreSet.urn == urn)
.filter(MappedVariant.current.is_(True))
.options(
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set),
contains_eager(MappedVariant.variant)
.contains_eager(Variant.score_set)
.selectinload(ScoreSet.publication_identifier_associations),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment),
)
.all()
)
mapped_variants = get_current_mapped_variants_for_annotation(db, score_set)

if not mapped_variants:
logger.info(msg="No mapped variants are associated with the requested score set.", extra=logging_context())
Expand Down Expand Up @@ -1397,24 +1381,7 @@ def get_score_set_annotated_variants_functional_statement(

assert_permission(user_data, score_set, Action.READ)

mapped_variants = (
db.query(MappedVariant)
.join(MappedVariant.variant)
.join(Variant.score_set)
.filter(ScoreSet.urn == urn)
.filter(MappedVariant.current.is_(True))
.options(
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set),
contains_eager(MappedVariant.variant)
.contains_eager(Variant.score_set)
.selectinload(ScoreSet.publication_identifier_associations),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment),
)
.all()
)
mapped_variants = get_current_mapped_variants_for_annotation(db, score_set)

if not mapped_variants:
logger.info(msg="No mapped variants are associated with the requested score set.", extra=logging_context())
Expand Down Expand Up @@ -1510,24 +1477,7 @@ def get_score_set_annotated_variants_functional_study_result(

assert_permission(user_data, score_set, Action.READ)

mapped_variants = (
db.query(MappedVariant)
.join(MappedVariant.variant)
.join(Variant.score_set)
.filter(ScoreSet.urn == urn)
.filter(MappedVariant.current.is_(True))
.options(
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set),
contains_eager(MappedVariant.variant)
.contains_eager(Variant.score_set)
.selectinload(ScoreSet.publication_identifier_associations),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.created_by),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.modified_by),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.license),
contains_eager(MappedVariant.variant).contains_eager(Variant.score_set).selectinload(ScoreSet.experiment),
)
.all()
)
mapped_variants = get_current_mapped_variants_for_annotation(db, score_set)

if not mapped_variants:
logger.info(msg="No mapped variants are associated with the requested score set.", extra=logging_context())
Expand Down
90 changes: 66 additions & 24 deletions src/mavedb/scripts/export_public_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,8 @@
python3 -m mavedb.scripts.export_public_data
```

This generates a ZIP archive named `mavedb-dump.zip` in the working directory. The ZIP file has the following contents:
- main.json: A JSON file providing metadata for all of the published experiment sets, experiments, and score sets
- LICENSE.txt: The text of the Creative Commons Zero license, which applies to all data included in the dump.
- variants/
- [URN].counts.csv (for each variant URN): The score set's variant count columns,
sorted by variant number
- [URN].scores.csv (for each variant URN): The score set's variant score columns,
sorted by variant number
- [URN].annotations.csv (for each variant URN with mapped variants): The score set's variant annotations, sorted by
variant number. This file is only included for score sets with mapped variants, and includes VEP, gnomAD, and ClinGen annotations.

In the exported JSON metadata, the root object's `experimentSets` property gives an array of experiment sets.
Experiments are nested in their parent experiment sets, and score sets in their parent experiments.

The variant URNs used in filenames do not include the `urn:mavedb:` scheme identifier, so they look like
`00000001-a-1.counts.csv` and `00000001-a-1.scores.csv`, for instance.
This generates a ZIP archive named `mavedb-dump.YYYYMMDDHHMMSS.zip` in the working directory.
See `src/mavedb/scripts/resources/README.md` for a full description of the archive contents and file formats.

Unpublished data and data sets licensed other than under the Creative Commons Zero license are not included in the dump,
and user details are limited to ORCID IDs and names of contributors to published data sets.
Expand All @@ -37,16 +23,18 @@

from fastapi.encoders import jsonable_encoder
from sqlalchemy import select
from sqlalchemy.orm import Session, lazyload
from sqlalchemy.orm import Session, joinedload, lazyload

from mavedb.lib.score_sets import get_score_set_variants_as_csv
from mavedb.lib.annotation.annotate import variant_highest_level_annotation
from mavedb.lib.score_sets import get_current_mapped_variants_for_annotation, get_score_set_variants_as_csv
from mavedb.models.experiment import Experiment
from mavedb.models.experiment_set import ExperimentSet
from mavedb.models.license import License
from mavedb.models.mapped_variant import MappedVariant
from mavedb.models.score_set import ScoreSet
from mavedb.models.variant import Variant
from mavedb.scripts.environment import script_environment, with_database_session
from mavedb.view_models import mapped_variant as mapped_variant_vm
from mavedb.view_models.experiment_set import ExperimentSetPublicDump

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -114,6 +102,7 @@ def export_public_data(db: Session):
# Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score
# sets.
experiment_sets = list(filter_experiment_sets(experiment_sets_query.all()))
logger.info(f"Found {len(experiment_sets)} published experiment sets with CC0-licensed score sets.")

# TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator.
# Issue: https://github.com/VariantEffect/mavedb-api/issues/192
Expand All @@ -129,7 +118,7 @@ def export_public_data(db: Session):
timestamp_format = "%Y%m%d%H%M%S"
zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip"

logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json")
logger.info(f"Writing {zip_file_name} with {len(score_set_ids)} score sets.")
json_data = {
"title": "MaveDB public data",
"asOf": datetime.now(timezone.utc).isoformat(),
Expand All @@ -140,24 +129,33 @@ def export_public_data(db: Session):
# Write metadata for all data sets to a single JSON file.
zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data)))

# Copy the CC0 license.
zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt")
# Copy the CC0 license and README.
resources_dir = os.path.join(os.path.dirname(__file__), "resources")
zipfile.write(os.path.join(resources_dir, "CC0_license.txt"), "LICENSE.txt")
zipfile.write(os.path.join(resources_dir, "README.md"), "README.md")

# Write score and count files for each score set.
num_score_sets = len(score_set_ids)
for i, score_set_id in enumerate(score_set_ids):
score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none()
if score_set is not None and score_set.urn is not None:
logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
logger.info(f"[{i + 1}/{num_score_sets}] Exporting score set {score_set.urn}")
csv_filename_base = score_set.urn.replace(":", "-")

csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"], namespaced=True)
zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)

# Only generate the annotations CSV if mapped variants exist in the score set.
# Only generate annotation files if the score set has at least one current mapped variant.
# A score set whose mappings are all superseded (no current mapping) yields no annotations,
# so we skip emitting empty/superseded-only annotation files for it entirely.
has_annotations = (
db.scalars(
select(ScoreSet).where(ScoreSet.id == score_set_id).join(Variant).join(MappedVariant).limit(1)
select(ScoreSet)
.where(ScoreSet.id == score_set_id)
.join(Variant)
.join(MappedVariant)
.where(MappedVariant.current.is_(True))
.limit(1)
).one_or_none()
is not None
)
Expand All @@ -167,12 +165,56 @@ def export_public_data(db: Session):
)
zipfile.writestr(f"csv/{csv_filename_base}.annotations.csv", csv_str)

# Write mapped variants JSON — mirrors GET /api/v1/score-sets/{urn}/mapped-variants.
mapped_variants = db.scalars(
select(MappedVariant)
.join(Variant, Variant.id == MappedVariant.variant_id)
.options(joinedload(MappedVariant.variant))
.where(Variant.score_set_id == score_set_id)
).all()
mapped_variant_views = [
mapped_variant_vm.MappedVariant.model_validate(mv) for mv in mapped_variants
]
zipfile.writestr(
f"mapped/{csv_filename_base}.mapped-variants.json",
json.dumps(jsonable_encoder(mapped_variant_views)),
)
logger.info(
f"[{i + 1}/{num_score_sets}] Wrote annotations + {len(mapped_variants)} mapped variants"
)

# Write VA-Spec annotations NDJSON — mirrors the GET /api/v1/score-sets/{urn}/annotated-variants/*
# streams, emitting one record per current mapped variant at its highest materialized VA level.
annotated_variants = get_current_mapped_variants_for_annotation(db, score_set)

va_lines = []
num_annotations = 0
for mv in annotated_variants:
annotation = variant_highest_level_annotation(mv)
if annotation is not None:
num_annotations += 1
record = {
"variant_urn": mv.variant.urn,
"annotation": annotation.model_dump(exclude_none=True) if annotation else None,
}
va_lines.append(json.dumps(record, default=str))

# Newline-terminate every record (including the last) to match the API NDJSON streams
# and keep line-based consumers happy.
zipfile.writestr(f"va/{csv_filename_base}.va.ndjson", "".join(line + "\n" for line in va_lines))
logger.info(
f"[{i + 1}/{num_score_sets}] Wrote {len(va_lines)} VA-Spec records "
f"({num_annotations} non-null annotations)"
)

# Only generate the counts CSV if count columns are present.
count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
if count_columns and len(count_columns) > 0:
csv_str = get_score_set_variants_as_csv(db, score_set, ["counts"], namespaced=True)
zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)

logger.info(f"Export complete: {zip_file_name}")


if __name__ == "__main__":
export_public_data()
Loading
Loading