diff --git a/Dockerfile b/Dockerfile index 422e2bf83..bc5a52c45 100644 --- a/Dockerfile +++ b/Dockerfile @@ -80,7 +80,7 @@ RUN curl -sSL https://install.python-poetry.org | python3 - COPY poetry.lock pyproject.toml ./ # installs runtime dependencies to $VIRTUAL_ENV -RUN poetry install --no-root --extras server +RUN poetry install --no-root --extras server --no-directory COPY alembic /code/alembic COPY alembic.ini /code/alembic.ini COPY src /code/src diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..16f8319bc --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +VENV := .venv/bin + +.DEFAULT_GOAL := help + +.PHONY: help +help: + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-12s %s\n", $$1, $$2}' + +.PHONY: dev +dev: ## Install deps including local editable variant-annotation + poetry install --extras server + +.PHONY: test +test: ## Run the test suite + $(VENV)/pytest tests/ + +.PHONY: lint +lint: ## Check code with ruff + $(VENV)/ruff check src/ tests/ + +.PHONY: format +format: ## Format code with ruff + $(VENV)/ruff format src/ tests/ diff --git a/alembic/versions/a1b2c3d4e5f6_add_vrs_allele_closure_tables.py b/alembic/versions/a1b2c3d4e5f6_add_vrs_allele_closure_tables.py new file mode 100644 index 000000000..49c502d05 --- /dev/null +++ b/alembic/versions/a1b2c3d4e5f6_add_vrs_allele_closure_tables.py @@ -0,0 +1,151 @@ +"""Add mapping_records, alleles, and mapping_record_alleles tables + +Revision ID: a1b2c3d4e5f6 +Revises: 398067c53257 +Create Date: 2026-05-29 + +New parallel tables for the Better Reverse Translation epic (#746). +The existing mapped_variants table is left untouched (frozen serving). +""" + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "a1b2c3d4e5f6" +down_revision = "398067c53257" +branch_labels = None +depends_on = None + +VALID_ASSAY_LEVELS = "('genomic', 'cdna', 'protein')" +VALID_ALIGNMENT_LEVELS = "('protein', 'cdna', 'genomic')" + + +def upgrade() -> None: + op.create_table( + "alleles", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("vrs_digest", sa.String(), nullable=False), + sa.Column("level", sa.String(length=16), nullable=False), + sa.Column("transcript", sa.String(), nullable=False), + sa.Column("hgvs_g", sa.String(), nullable=True), + sa.Column("hgvs_c", sa.String(), nullable=True), + sa.Column("hgvs_p", sa.String(), nullable=True), + sa.Column("clingen_allele_id", sa.String(), nullable=True), + sa.Column("post_mapped", postgresql.JSONB(), nullable=True), + sa.Column("created_at", sa.Date(), nullable=False, server_default=sa.text("CURRENT_DATE")), + sa.Column( + "updated_at", + sa.Date(), + nullable=False, + server_default=sa.text("CURRENT_DATE"), + onupdate=sa.text("CURRENT_DATE"), + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("vrs_digest", name="uq_alleles_vrs_digest"), + ) + op.create_index("ix_alleles_vrs_digest", "alleles", ["vrs_digest"]) + op.create_index("ix_alleles_level", "alleles", ["level"]) + op.create_index("ix_alleles_clingen_allele_id", "alleles", ["clingen_allele_id"]) + + op.create_table( + "mapping_records", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("variant_id", sa.Integer(), nullable=False), + sa.Column("vrs_digest", sa.String(), nullable=True), + sa.Column("pre_mapped", postgresql.JSONB(), nullable=True), + sa.Column("assay_level", sa.String(length=16), nullable=False), + sa.Column("hgvs_assay_level", sa.String(), nullable=True), + sa.Column("mapping_api_version", sa.String(), nullable=False), + sa.Column("mapped_date", sa.Date(), nullable=False), + sa.Column("vrs_version", sa.String(), nullable=True), + sa.Column("current", sa.Boolean(), nullable=False), + sa.Column("alignment_level", sa.String(length=16), nullable=True), + sa.Column("at_mismatched_locus", sa.Boolean(), nullable=True), + sa.Column("near_gap", sa.Boolean(), nullable=True), + sa.Column("target_gene_mapping_id", sa.Integer(), nullable=True), + sa.Column("created_at", sa.Date(), nullable=False, server_default=sa.text("CURRENT_DATE")), + sa.Column( + "updated_at", + sa.Date(), + nullable=False, + server_default=sa.text("CURRENT_DATE"), + onupdate=sa.text("CURRENT_DATE"), + ), + sa.ForeignKeyConstraint( + ["variant_id"], + ["variants.id"], + name="fk_mapping_records_variant_id", + ), + sa.ForeignKeyConstraint( + ["target_gene_mapping_id"], + ["target_gene_mappings.id"], + name="fk_mapping_records_target_gene_mapping_id", + ), + sa.PrimaryKeyConstraint("id"), + sa.CheckConstraint( + f"assay_level IN {VALID_ASSAY_LEVELS}", + name="ck_mapping_records_assay_level_valid", + ), + ) + op.create_index("ix_mapping_records_variant_id", "mapping_records", ["variant_id"]) + op.create_index("ix_mapping_records_vrs_digest", "mapping_records", ["vrs_digest"]) + op.create_index( + "ix_mapping_records_target_gene_mapping_id", + "mapping_records", + ["target_gene_mapping_id"], + ) + + op.create_table( + "mapping_record_alleles", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("mapping_record_id", sa.Integer(), nullable=False), + sa.Column("allele_id", sa.Integer(), nullable=False), + sa.Column( + "is_authoritative", + sa.Boolean(), + nullable=False, + server_default=sa.text("false"), + ), + sa.ForeignKeyConstraint( + ["mapping_record_id"], + ["mapping_records.id"], + name="fk_mapping_record_alleles_mapping_record_id", + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["allele_id"], + ["alleles.id"], + name="fk_mapping_record_alleles_allele_id", + ondelete="RESTRICT", + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + "ix_mapping_record_alleles_mapping_record_id", + "mapping_record_alleles", + ["mapping_record_id"], + ) + op.create_index( + "ix_mapping_record_alleles_allele_id", + "mapping_record_alleles", + ["allele_id"], + ) + + +def downgrade() -> None: + op.drop_index("ix_mapping_record_alleles_allele_id", table_name="mapping_record_alleles") + op.drop_index("ix_mapping_record_alleles_mapping_record_id", table_name="mapping_record_alleles") + op.drop_table("mapping_record_alleles") + + op.drop_index("ix_mapping_records_target_gene_mapping_id", table_name="mapping_records") + op.drop_index("ix_mapping_records_vrs_digest", table_name="mapping_records") + op.drop_index("ix_mapping_records_variant_id", table_name="mapping_records") + op.drop_table("mapping_records") + + op.drop_index("ix_alleles_clingen_allele_id", table_name="alleles") + op.drop_index("ix_alleles_level", table_name="alleles") + op.drop_index("ix_alleles_vrs_digest", table_name="alleles") + op.drop_table("alleles") diff --git a/alembic/versions/b8e1f0a2c4d7_drop_alleles_transcript_column.py b/alembic/versions/b8e1f0a2c4d7_drop_alleles_transcript_column.py new file mode 100644 index 000000000..2705d9a3c --- /dev/null +++ b/alembic/versions/b8e1f0a2c4d7_drop_alleles_transcript_column.py @@ -0,0 +1,32 @@ +"""drop alleles.transcript column + +Revision ID: b8e1f0a2c4d7 +Revises: f4d2a9c1b7e3 +Create Date: 2026-06-05 + +The `transcript` column duplicated data already present in the HGVS columns — it was +always extract_accession(hgvs_g/hgvs_c/hgvs_p). It is now a derived hybrid_property on +the Allele model (split_part(coalesce(hgvs_g, hgvs_c, hgvs_p), ':', 1)), so the stored +column is removed to keep a single source of truth and avoid drift. +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "b8e1f0a2c4d7" +down_revision = "f4d2a9c1b7e3" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.drop_column("alleles", "transcript") + + +def downgrade() -> None: + # Re-add the column and backfill it from the HGVS columns (the same derivation the + # hybrid_property uses) so the restored NOT NULL column is consistent. + op.add_column("alleles", sa.Column("transcript", sa.String(), nullable=True)) + op.execute("UPDATE alleles SET transcript = split_part(coalesce(hgvs_g, hgvs_c, hgvs_p), ':', 1)") + op.alter_column("alleles", "transcript", nullable=False) diff --git a/alembic/versions/c3d5e7f9a1b2_temporal_mapping_record_alleles.py b/alembic/versions/c3d5e7f9a1b2_temporal_mapping_record_alleles.py new file mode 100644 index 000000000..26d3d6dbf --- /dev/null +++ b/alembic/versions/c3d5e7f9a1b2_temporal_mapping_record_alleles.py @@ -0,0 +1,48 @@ +"""add valid-time versioning to mapping_record_alleles + +Revision ID: c3d5e7f9a1b2 +Revises: b8e1f0a2c4d7 +Create Date: 2026-06-05 + +Make the link table valid-time versioned (TemporalLink): a link is live while valid_to is +NULL, and superseding it closes valid_to instead of deleting, so reverse translation can be +re-run independently while prior derivations remain queryable point-in-time. The partial +unique index enforces a single live link per (mapping_record, allele). + +Assumes no pre-existing duplicate live links — true for these parallel tables, which are +new-only writes and not yet serving. If this ever runs against data with duplicates, the +unique index creation will fail and the duplicates must be retired first. +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "c3d5e7f9a1b2" +down_revision = "b8e1f0a2c4d7" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "mapping_record_alleles", + sa.Column("valid_from", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + ) + op.add_column( + "mapping_record_alleles", + sa.Column("valid_to", sa.DateTime(timezone=True), nullable=True), + ) + op.create_index( + "uq_mapping_record_alleles_live", + "mapping_record_alleles", + ["mapping_record_id", "allele_id"], + unique=True, + postgresql_where=sa.text("valid_to IS NULL"), + ) + + +def downgrade() -> None: + op.drop_index("uq_mapping_record_alleles_live", table_name="mapping_record_alleles") + op.drop_column("mapping_record_alleles", "valid_to") + op.drop_column("mapping_record_alleles", "valid_from") diff --git a/alembic/versions/d4e6f8a0b2c3_temporal_mapping_records.py b/alembic/versions/d4e6f8a0b2c3_temporal_mapping_records.py new file mode 100644 index 000000000..603c171a5 --- /dev/null +++ b/alembic/versions/d4e6f8a0b2c3_temporal_mapping_records.py @@ -0,0 +1,76 @@ +"""move mapping_records onto valid-time versioning + +Revision ID: d4e6f8a0b2c3 +Revises: c3d5e7f9a1b2 +Create Date: 2026-06-05 + +Replace the stored `current` flag and the `created_at`/`updated_at` audit dates with valid-time +columns (ValidTime mixin): a mapping record is live while valid_to is NULL, and a re-map retires +the prior version (closing valid_to) instead of flipping a boolean. `current` becomes derived +(valid_to IS NULL). `mapped_date` (the date the mapping was performed) is domain data and stays. + +The partial unique index promotes to the database the "one live mapping record per variant" +invariant the mapping job previously enforced only in app code. + +Backfills from the columns being dropped, so existing rows keep their validity. Assumes no +duplicate live records per variant (true for these pre-cutover parallel tables; otherwise the +unique index creation fails and the duplicates must be retired first). +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "d4e6f8a0b2c3" +down_revision = "c3d5e7f9a1b2" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "mapping_records", + sa.Column("valid_from", sa.DateTime(timezone=True), nullable=True, server_default=sa.func.now()), + ) + op.add_column( + "mapping_records", + sa.Column("valid_to", sa.DateTime(timezone=True), nullable=True), + ) + + # Backfill validity from the columns being replaced: a record's life began at created_at, and + # a non-current record was retired at updated_at (the only in-place update it ever took). + op.execute("UPDATE mapping_records SET valid_from = created_at::timestamptz") + op.execute("UPDATE mapping_records SET valid_to = updated_at::timestamptz WHERE current = false") + + op.alter_column("mapping_records", "valid_from", nullable=False) + + op.drop_column("mapping_records", "current") + op.drop_column("mapping_records", "created_at") + op.drop_column("mapping_records", "updated_at") + + op.create_index( + "uq_mapping_records_current", + "mapping_records", + ["variant_id"], + unique=True, + postgresql_where=sa.text("valid_to IS NULL"), + ) + + +def downgrade() -> None: + op.drop_index("uq_mapping_records_current", table_name="mapping_records") + + op.add_column("mapping_records", sa.Column("current", sa.Boolean(), nullable=True)) + op.add_column("mapping_records", sa.Column("created_at", sa.Date(), nullable=True)) + op.add_column("mapping_records", sa.Column("updated_at", sa.Date(), nullable=True)) + + op.execute("UPDATE mapping_records SET current = (valid_to IS NULL)") + op.execute("UPDATE mapping_records SET created_at = valid_from::date") + op.execute("UPDATE mapping_records SET updated_at = coalesce(valid_to, valid_from)::date") + + op.alter_column("mapping_records", "current", nullable=False) + op.alter_column("mapping_records", "created_at", nullable=False) + op.alter_column("mapping_records", "updated_at", nullable=False) + + op.drop_column("mapping_records", "valid_to") + op.drop_column("mapping_records", "valid_from") diff --git a/alembic/versions/f4d2a9c1b7e3_add_cross_level_translation_annotation_type.py b/alembic/versions/f4d2a9c1b7e3_add_cross_level_translation_annotation_type.py new file mode 100644 index 000000000..790659d08 --- /dev/null +++ b/alembic/versions/f4d2a9c1b7e3_add_cross_level_translation_annotation_type.py @@ -0,0 +1,49 @@ +"""add cross_level_translation annotation type + +Revision ID: f4d2a9c1b7e3 +Revises: a1b2c3d4e5f6 +Create Date: 2026-06-02 + +Extends ck_variant_annotation_type_valid to allow the 'cross_level_translation' +annotation type. The VRS mapping worker writes one such row per variant to record +whether cross-level translation (filling the levels the assay did not map) +succeeded, was skipped (multivariant / no transcript), or failed. +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "f4d2a9c1b7e3" +down_revision = "a1b2c3d4e5f6" +branch_labels = None +depends_on = None + +_TYPES_OLD = ( + "'vrs_mapping', 'clingen_allele_id', 'mapped_hgvs', 'variant_translation', " + "'gnomad_allele_frequency', 'clinvar_control', 'vep_functional_consequence', " + "'ldh_submission'" +) +_TYPES_NEW = "'vrs_mapping', 'cross_level_translation', " + ( + "'clingen_allele_id', 'mapped_hgvs', 'variant_translation', " + "'gnomad_allele_frequency', 'clinvar_control', 'vep_functional_consequence', " + "'ldh_submission'" +) + + +def upgrade() -> None: + op.drop_constraint("ck_variant_annotation_type_valid", "variant_annotation_status", type_="check") + op.create_check_constraint( + "ck_variant_annotation_type_valid", + "variant_annotation_status", + f"annotation_type IN ({_TYPES_NEW})", + ) + + +def downgrade() -> None: + op.execute("DELETE FROM variant_annotation_status WHERE annotation_type = 'cross_level_translation'") + op.drop_constraint("ck_variant_annotation_type_valid", "variant_annotation_status", type_="check") + op.create_check_constraint( + "ck_variant_annotation_type_valid", + "variant_annotation_status", + f"annotation_type IN ({_TYPES_OLD})", + ) diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index 9835cff4e..ac4bcff58 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -19,6 +19,7 @@ services: REDIS_PORT: 6379 REDIS_SSL: "false" LOG_CONFIG: dev + PYTHONPATH: "/variant-annotation:/code/src" ports: - "8002:8000" ulimits: @@ -27,6 +28,7 @@ services: hard: 65536 volumes: - .:/code + - ../variant-annotation:/variant-annotation - mavedb-seqrepo-dev:/usr/local/share/seqrepo worker: @@ -44,12 +46,14 @@ services: REDIS_PORT: 6379 REDIS_SSL: "false" LOG_CONFIG: dev + PYTHONPATH: "/variant-annotation:/code/src" ulimits: nofile: soft: 65536 hard: 65536 volumes: - .:/code + - ../variant-annotation:/variant-annotation - mavedb-seqrepo-dev:/usr/local/share/seqrepo depends_on: - db diff --git a/mypy_stubs/ga4gh/core/__init__.pyi b/mypy_stubs/ga4gh/core/__init__.pyi index f5814d7cf..8044c175f 100644 --- a/mypy_stubs/ga4gh/core/__init__.pyi +++ b/mypy_stubs/ga4gh/core/__init__.pyi @@ -1,3 +1,10 @@ from . import models as models +from .identifiers import PrevVrsVersion as PrevVrsVersion -__all__ = ["models"] +def ga4gh_identify( + vro: object, + in_place: str = ..., + as_version: PrevVrsVersion | None = ..., +) -> str | None: ... + +__all__ = ["PrevVrsVersion", "ga4gh_identify", "models"] diff --git a/mypy_stubs/ga4gh/vrs/dataproxy.pyi b/mypy_stubs/ga4gh/vrs/dataproxy.pyi new file mode 100644 index 000000000..25cabfda9 --- /dev/null +++ b/mypy_stubs/ga4gh/vrs/dataproxy.pyi @@ -0,0 +1,8 @@ +from abc import ABC + +class _DataProxy(ABC): + def get_metadata(self, identifier: str) -> dict: ... + def get_sequence(self, identifier: str, start: int | None = ..., end: int | None = ...) -> str: ... + +class SeqRepoDataProxy(_DataProxy): + def __init__(self, sr: object) -> None: ... diff --git a/mypy_stubs/ga4gh/vrs/extras/__init__.pyi b/mypy_stubs/ga4gh/vrs/extras/__init__.pyi new file mode 100644 index 000000000..e69de29bb diff --git a/mypy_stubs/ga4gh/vrs/extras/translator.pyi b/mypy_stubs/ga4gh/vrs/extras/translator.pyi new file mode 100644 index 000000000..d3bb6aeb4 --- /dev/null +++ b/mypy_stubs/ga4gh/vrs/extras/translator.pyi @@ -0,0 +1,19 @@ +from typing import Any + +from ga4gh.vrs.dataproxy import _DataProxy + +class _Translator: + default_assembly_name: str + data_proxy: _DataProxy + identify: bool + def __init__( + self, + data_proxy: _DataProxy, + default_assembly_name: str = ..., + identify: bool = ..., + ) -> None: ... + # Returns a VRS variation (Allele/CisPhasedBlock/...); typed loosely so callers + # can annotate the concrete subtype they expect without a redundant cast. + def translate_from(self, var: str, fmt: str | None = ..., **kwargs: Any) -> Any: ... + +class AlleleTranslator(_Translator): ... diff --git a/mypy_stubs/ga4gh/vrs/normalize.pyi b/mypy_stubs/ga4gh/vrs/normalize.pyi new file mode 100644 index 000000000..1aea15b23 --- /dev/null +++ b/mypy_stubs/ga4gh/vrs/normalize.pyi @@ -0,0 +1,5 @@ +from typing import Any + +from ga4gh.vrs.dataproxy import _DataProxy + +def normalize(vo: Any, data_proxy: _DataProxy | None = ..., **kwargs: Any) -> Any: ... diff --git a/mypy_stubs/hgvs/assemblymapper.pyi b/mypy_stubs/hgvs/assemblymapper.pyi new file mode 100644 index 000000000..798878312 --- /dev/null +++ b/mypy_stubs/hgvs/assemblymapper.pyi @@ -0,0 +1,20 @@ +from typing import Any + +import hgvs.sequencevariant + +class AssemblyMapper: + def __init__( + self, + hdp: Any, + assembly_name: str = ..., + alt_aln_method: str = ..., + normalize: bool = ..., + prevalidation_level: str | None = ..., + in_par_assume: str = ..., + replace_reference: bool = ..., + *args: Any, + **kwargs: Any, + ) -> None: ... + def g_to_t(self, var_g: Any, tx_ac: str) -> hgvs.sequencevariant.SequenceVariant: ... + def c_to_g(self, var_c: Any) -> hgvs.sequencevariant.SequenceVariant: ... + def c_to_p(self, var_c: Any, translation_table: Any = ...) -> hgvs.sequencevariant.SequenceVariant: ... diff --git a/poetry.lock b/poetry.lock index 8eb9bec66..70b2698ee 100644 --- a/poetry.lock +++ b/poetry.lock @@ -269,6 +269,51 @@ dev = ["bandit (>=1.7,<2.0)", "build (>=0.8,<1.0)", "flake8 (>=4.0,<5.0)", "ipyt docs = ["mkdocs"] tests = ["pytest (>=7.1,<8.0)", "pytest-cov (>=4.1,<5.0)", "pytest-optional-tests", "tox (>=3.25,<4.0)", "vcrpy"] +[[package]] +name = "biopython" +version = "1.87" +description = "Freely available tools for computational molecular biology." +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"server\"" +files = [ + {file = "biopython-1.87-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:35f13796188412f135acbed196bd6cfedc1257199d50b4883e289bbd96320efc"}, + {file = "biopython-1.87-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63978e1b3ae040c52369bc1bd3f4c668171f5f1f0808798eccd74b575cce455d"}, + {file = "biopython-1.87-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1e951f4862ffc1dccc28e1c25245059fa653d86028a88f5dfe1b7875875f3a4f"}, + {file = "biopython-1.87-cp310-cp310-win32.whl", hash = "sha256:86596b05f39afbc5984ee6239c93fd75f6a97fffb9a630d74b45652c24aab964"}, + {file = "biopython-1.87-cp310-cp310-win_amd64.whl", hash = "sha256:efc16dd8a9312eb655fa590821495b0d8ca25d19f7b4ef4fa4da9e71d59d33e5"}, + {file = "biopython-1.87-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:58e36efa7eaa8813cffe440af4824afa20cc3c21b1b179a95cc0fb15c4b83c01"}, + {file = "biopython-1.87-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ccf00d15e698656796ab14ff3eb175e282da7a08eedd36a29b6cacec5a33e97f"}, + {file = "biopython-1.87-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bab39ec4108fb2542a9f1012db8269426fd6e86ed84ebc1b0bd11134ef443dd3"}, + {file = "biopython-1.87-cp311-cp311-win32.whl", hash = "sha256:126e18ad44e959a1984560562fa4f295c075ce2e610e8ddbc9ec6f52e09f70d8"}, + {file = "biopython-1.87-cp311-cp311-win_amd64.whl", hash = "sha256:d4ff5369ffb7a966bcf921cae6c8a6eab5070b5df1c9f2ae8c18ad40fdcb7ec5"}, + {file = "biopython-1.87-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:77ccc634621904d4a8fa0a43b5e0f093fa9df8c9577ed3858af648bb3528f51e"}, + {file = "biopython-1.87-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a3428155c3e0abbed7aad5ff08e034d435b84dfe560c8ec58e7d43abda4b6a43"}, + {file = "biopython-1.87-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:65ba69ef0273e983a9036c2a228142bc34266179a5f03660fc281d332d718630"}, + {file = "biopython-1.87-cp312-cp312-win32.whl", hash = "sha256:b077777fd2c555434bdcee58743f6f860aa80e1e005d9671913aa73823c6a773"}, + {file = "biopython-1.87-cp312-cp312-win_amd64.whl", hash = "sha256:856e3d64f1f27db493474ff84916ed8572731a525e001c7d0d8f41a0fd187000"}, + {file = "biopython-1.87-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e05ef5d632c319ab3ef77705c74061190d0792b07e1f2b9eee867401b2758e7e"}, + {file = "biopython-1.87-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:772539297fa16a78f38651c793f53f8c11bd18317b111982e72cf30a6e57512a"}, + {file = "biopython-1.87-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6d221b2e08e7e89713fdbfb15c8ea6744e908d59f672cd2b6fcf9ed47910d05e"}, + {file = "biopython-1.87-cp313-cp313-win32.whl", hash = "sha256:fab1b12f6bc4646b7f56b4c390ecff685f02b5b29e3a0c10477195bb49fe62f8"}, + {file = "biopython-1.87-cp313-cp313-win_amd64.whl", hash = "sha256:01ee30203bd4b2145cdfe2878499e549a7087f897a6f4d1ebd9de30790123140"}, + {file = "biopython-1.87-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:db73fe16aa2b20677ac86d1997612acb0aa1a3720bc899f65d2bce5583208e1b"}, + {file = "biopython-1.87-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:89ffe272517478691439a59cccd3cc2929fc8f6bfb8cbc8cc5acc103660395a7"}, + {file = "biopython-1.87-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ff28a6f31630b3c9f52903478a2ed9dd894b07c1998e40eaeefbeefac20f2d0f"}, + {file = "biopython-1.87-cp314-cp314-win32.whl", hash = "sha256:d740c75d4bc94f9dff51719a0deda37e5e885f06ee6dfbb5e9a21bbe9de35a9c"}, + {file = "biopython-1.87-cp314-cp314-win_amd64.whl", hash = "sha256:98e397096336a49804b6aaaeac8c47ad82e3e4430862f0cde37be73037f1017e"}, + {file = "biopython-1.87-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e4878a9b56775480154c686f81e98f6d907b44d87605bdc2f53538ccdfde9624"}, + {file = "biopython-1.87-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a6fcec2e3602ed52ced701f8f7851952383f84dbc4caeb4d202d088170e86b6d"}, + {file = "biopython-1.87-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c8da2d44a4b912c7550a051a5ff4bb72a61decc9c4b19ea92cba4c02fffb143c"}, + {file = "biopython-1.87-cp314-cp314t-win32.whl", hash = "sha256:3670d76759c6cb53ba617f9823d3a438c1aa5415abef6addd29cb81d61d7b312"}, + {file = "biopython-1.87-cp314-cp314t-win_amd64.whl", hash = "sha256:331c4151608a1d8406eff0d3c52a0ff1fa3e82604fc85f11c696c562919fb161"}, + {file = "biopython-1.87.tar.gz", hash = "sha256:8456c803459b679a9712422e5a7fd9809f2f089bf69bb085f3b077946ac9bdbf"}, +] + +[package.dependencies] +numpy = "*" + [[package]] name = "bioutils" version = "0.6.1" @@ -1432,6 +1477,19 @@ files = [ dnspython = ">=2.0.0" idna = ">=2.0.0" +[[package]] +name = "et-xmlfile" +version = "2.0.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"server\"" +files = [ + {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, + {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, +] + [[package]] name = "eutils" version = "0.6.0" @@ -2893,6 +2951,22 @@ files = [ {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"server\"" +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "orcid" version = "1.0.3" @@ -4796,6 +4870,61 @@ dev = ["Cython (>=3.0,<4.0)", "setuptools (>=60)"] docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx_rtd_theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] test = ["aiohttp (>=3.10.5)", "flake8 (>=6.1,<7.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=25.3.0,<25.4.0)", "pycodestyle (>=2.11.0,<2.12.0)"] +[[package]] +name = "variant-annotation" +version = "0.1.0" +description = "Genetic variant mapping and annotation pipeline" +optional = true +python-versions = ">=3.11" +groups = ["main"] +markers = "extra == \"server\"" +files = [] +develop = true + +[package.dependencies] +biopython = "*" +boto3 = "*" +click = "*" +openpyxl = "*" +python-dotenv = "*" +redis = "*" +requests = "*" +variant-translation = "1.1.0b1" + +[package.extras] +dev = ["ruff"] +gnomad = ["hail"] +mapping = ["dcd-mapping @ git+https://github.com/VariantEffect/dcd_mapping2.git@v2026.1.2"] +publish = ["build", "twine"] +tests = ["pytest"] + +[package.source] +type = "directory" +url = "../variant-annotation" + +[[package]] +name = "variant-translation" +version = "1.1.0b1" +description = "Reverse-translate protein HGVS variants." +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"server\"" +files = [ + {file = "variant_translation-1.1.0b1-py3-none-any.whl", hash = "sha256:cdf693ab564b767c89e7b463b1345e801d2ab4a5f312631584bf79d178ba6122"}, + {file = "variant_translation-1.1.0b1.tar.gz", hash = "sha256:5151d02ccb5c651c7ba47398d75355f2bd19a36717faa39dc2a390cfb4c7968f"}, +] + +[package.dependencies] +click = "*" +hgvs = "*" +python-dotenv = "*" +requests = "*" + +[package.extras] +dev = ["pytest", "ruff"] +publish = ["build", "twine"] + [[package]] name = "virtualenv" version = "21.3.1" @@ -5098,9 +5227,9 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_it type = ["pytest-mypy"] [extras] -server = ["aiocache", "alembic", "alembic-utils", "arq", "authlib", "biocommons", "boto3", "cdot", "cryptography", "fastapi", "hgvs", "orcid", "psycopg2", "pyathena", "python-jose", "python-multipart", "requests", "slack-sdk", "starlette", "starlette-context", "uvicorn", "watchtower"] +server = ["aiocache", "alembic", "alembic-utils", "arq", "authlib", "biocommons", "boto3", "cdot", "cryptography", "fastapi", "hgvs", "orcid", "psycopg2", "pyathena", "python-jose", "python-multipart", "requests", "slack-sdk", "starlette", "starlette-context", "uvicorn", "variant-annotation", "watchtower"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "64ca4755b1a5c8930882a7ad3b25feee434f42ffb02ba2841f38c86de56cc948" +content-hash = "25445affad8eb09fb30e96f152dd0f066eb6f6a4d13e61023fba5357002f62ba" diff --git a/pyproject.toml b/pyproject.toml index caa3f0fd6..da80ff7af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ starlette = { version = "~0.49.0", optional = true } starlette-context = { version = "^0.3.6", optional = true } slack-sdk = { version = "~3.21.3", optional = true } uvicorn = { extras = ["standard"], version = "*", optional = true } +variant-annotation = { path = "../variant-annotation", develop = true, optional = true } watchtower = { version = "~3.2.0", optional = true } asyncclick = "^8.3.0.7" filelock = "^3.29.0" @@ -93,7 +94,7 @@ SQLAlchemy = { extras = ["mypy"], version = "~2.0.0" } [tool.poetry.extras] -server = ["aiocache", "alembic", "alembic-utils", "arq", "authlib", "biocommons", "boto3", "cdot", "cryptography", "fastapi", "hgvs", "orcid", "psycopg2", "python-jose", "python-multipart", "pyathena", "requests", "starlette", "starlette-context", "slack-sdk", "uvicorn", "watchtower"] +server = ["aiocache", "alembic", "alembic-utils", "arq", "authlib", "biocommons", "boto3", "cdot", "cryptography", "fastapi", "hgvs", "orcid", "psycopg2", "python-jose", "python-multipart", "pyathena", "requests", "starlette", "starlette-context", "slack-sdk", "uvicorn", "variant-annotation", "watchtower"] [tool.mypy] @@ -104,6 +105,13 @@ plugins = [ ] mypy_path = "mypy_stubs" +# variant-annotation is installed as a PEP 660 editable sibling package whose import +# hook mypy cannot follow, and it ships no py.typed marker. Treat it as untyped rather +# than maintaining drift-prone local stubs for an actively-developed internal library. +[[tool.mypy.overrides]] +module = "variant_annotation.*" +ignore_missing_imports = true + [tool.pytest.ini_options] addopts = "-v --import-mode=importlib" asyncio_mode = 'strict' diff --git a/src/mavedb/data_providers/services.py b/src/mavedb/data_providers/services.py index a94c16d6e..45b91b756 100644 --- a/src/mavedb/data_providers/services.py +++ b/src/mavedb/data_providers/services.py @@ -2,7 +2,9 @@ from typing import TYPE_CHECKING, Optional import boto3 +from biocommons.seqrepo import SeqRepo from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider, SeqFetcher +from ga4gh.vrs.dataproxy import SeqRepoDataProxy from mavedb.lib.mapping import VRSMap @@ -16,6 +18,7 @@ DCD_MAP_URL = os.environ.get("DCD_MAPPING_URL", "http://dcd-mapping:8000") CDOT_URL = os.environ.get("CDOT_URL", "http://cdot-rest:8000") +SEQREPO_DIR = os.environ.get("HGVS_SEQREPO_DIR", "/seqrepo") CSV_UPLOAD_S3_BUCKET_NAME = os.getenv("UPLOAD_S3_BUCKET_NAME", "score-set-csv-uploads-dev") @@ -27,6 +30,21 @@ def cdot_rest() -> RESTDataProvider: return RESTDataProvider(url=CDOT_URL, seqfetcher=seqfetcher()) +def seqrepo() -> SeqRepo: + return SeqRepo(SEQREPO_DIR) + + +def seqrepo_data_proxy() -> SeqRepoDataProxy: + """VRS sequence/refget data proxy backed by local SeqRepo. + + Distinct from cdot_rest(): cdot is an hgvs *coordinate* data provider, while + AlleleTranslator needs a VRS *sequence* proxy that can derive_refget_accession. + Mirrors dcd_mapping's SeqRepo-backed translator. Uses the same HGVS_SEQREPO_DIR + location as deps.get_seqrepo and the refget router. + """ + return SeqRepoDataProxy(seqrepo()) + + def vrs_mapper(url: Optional[str] = None) -> VRSMap: return VRSMap(DCD_MAP_URL) if not url else VRSMap(url) diff --git a/src/mavedb/db/mixins.py b/src/mavedb/db/mixins.py new file mode 100644 index 000000000..4bc5c1ca9 --- /dev/null +++ b/src/mavedb/db/mixins.py @@ -0,0 +1,201 @@ +"""Reusable declarative mixins.""" + +from datetime import datetime +from typing import ClassVar, Optional, Sequence, TypeVar + +from sqlalchemy import ColumnElement, DateTime, Update, func, inspect as sa_inspect, select, update +from sqlalchemy.ext.hybrid import hybrid_property +from sqlalchemy.orm import Mapped, Session, mapped_column + +T = TypeVar("T", bound="ValidTime") + + +class ValidTime: + """Valid-time versioning for rows that change over time (transaction-time SCD Type 2). + + Applies to either a versioned entity (a row that is replaced by a newer version of the same + logical thing — e.g. a mapping record) or a link/association row (a relationship that comes + and goes). Immutable, content-addressed rows (deduplicated entities like an allele or a + source-versioned annotation record) do not use this — their "what applies now" is answered by + the links that reference them, not by versioning the row itself. + + A row is live while ``valid_to`` is NULL. Superseding a row sets its ``valid_to`` to the + successor's ``valid_from`` instead of deleting it, so the full history is retained and a + point-in-time query is a single half-open ``[valid_from, valid_to)`` predicate. ``current`` + and ``as_of`` express that predicate so call sites never hand-roll it. + + Convention: replacing a live row with a successor goes through :meth:`supersede_with` (single + row) or :meth:`supersede_live_where` (bulk), which stamp the retired ``valid_to`` and the new + ``valid_from`` with one timestamp so the handoff is gap-free regardless of transaction + boundaries. ``retire`` (single) / ``retire_live_where`` (bulk) are *withdrawal* primitives — + retiring with no successor; do not pair a bare retire with a separate insert, or a slow job + between the two opens a window where the row is neither live-old nor live-new. The method pairs + mirror: ``retire``/``supersede_with`` act on one row, ``*_live_where`` act on a predicate. + + ``current`` is derived (``valid_to IS NULL``), not stored — one source of truth, no + dual-write to keep consistent. Each table using this mixin should add a partial unique index + over its natural key ``WHERE valid_to IS NULL`` to enforce a single live row per key (the link + key for an association, the logical-entity key for an entity). + + Transaction time only: ``valid_from``/``valid_to`` record when *we* held the row, not when an + external source considered it effective. Tables fed by externally-versioned sources (e.g. a + ClinVar release, a gnomAD version) should add their own source-version column for that axis. + + Consumer contract — a model mixing this in, and any code writing it, MUST honor: + + 1. **Add the partial unique index.** Declare a unique index over the natural key + ``WHERE valid_to IS NULL`` (one live row per key). It is the loud backstop: forgetting to + retire a predecessor before inserting its successor raises ``IntegrityError`` instead of + silently leaving two live rows. + 2. **Replace through supersede, never retire-then-insert.** Use :meth:`supersede_with` (single + row) or :meth:`supersede_live_where` (bulk) to swap a live row for a successor. A bare + :meth:`retire` followed by a separate insert is the gap footgun — if a slow job or a commit + falls between them, the retired ``valid_to`` and the new ``valid_from`` come from different + transaction clocks and a point-in-time query lands in a hole where the row is neither + old-live nor new-live. Reserve ``retire``/``retire_live_where`` for genuine withdrawal (no + successor). Gaps only arise on a *same-key* handoff; the cascade in (3) only ever retires + children under a superseded parent's old key, so it never needs a supersede. + 3. **Declare ``__retire_cascade__`` for parent rows with ``ValidTime`` children.** A live link to + a retired parent is stale; retiring a parent must retire its live child links. The ORM + relationship cascade does not do this (it fires only on hard DELETE). Bulk + :meth:`supersede_live_where` refuses to run on a class that declares ``__retire_cascade__`` + precisely because it cannot cascade — supersede such rows one at a time with + :meth:`supersede_with`. + 4. **Filter reads with ``current``/``as_of``.** Never read live rows by assuming a query returns + only current state; constrain with ``current`` (or ``as_of(ts)``) explicitly. Because of (3), + a live link implies a live parent, so allele/parent-side ``current`` queries do not surface + links dangling off retired parents. + """ + + valid_from: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) + valid_to: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True) + + # Names of relationships to child ``ValidTime`` rows that should be retired when this row is. + # A link to a retired row is itself stale, so closing a parent's window must close its live + # links' windows too — the relationship cascade ("all, delete-orphan") only fires on a hard + # DELETE, not on this soft ``valid_to`` close. Each named relationship must target a + # ``ValidTime`` model; ``retire`` issues one bulk UPDATE per relationship. + # + # This covers only the transaction-time axis: "the parent was superseded, so its links are too." + # Links retired on a source-version axis (e.g. a new gnomAD release superseding an + # allele→gnomAD-variant link while the allele itself stays live) are a separate trigger and are + # not driven from here. Cascade is one level deep (bulk UPDATE, not per-row ``retire``); a deeper + # chain would need its own handling. + __retire_cascade__: ClassVar[tuple[str, ...]] = () + + @hybrid_property + def current(self) -> bool: + return self.valid_to is None + + @current.inplace.expression + @classmethod + def _current_expression(cls) -> ColumnElement[bool]: + return cls.valid_to.is_(None) + + @classmethod + def as_of(cls, ts: datetime) -> ColumnElement[bool]: + """Filter clause selecting the rows live at ``ts`` (half-open ``[valid_from, valid_to)``).""" + return (cls.valid_from <= ts) & (cls.valid_to.is_(None) | (cls.valid_to > ts)) + + def retire(self, session: Optional[Session] = None, at: Optional[datetime] = None) -> None: + """Close this row's validity window, cascading to the live child links named in + ``__retire_cascade__``. Idempotent — an already-closed row keeps its original valid_to. + + Pass ``at`` to stamp an explicit ``valid_to`` shared with a successor's ``valid_from``; this + is how :meth:`supersede_with` keeps the handoff gap-free. Without ``at`` the close uses + ``func.now()`` (Postgres ``transaction_timestamp``). This is a *withdrawal* primitive on its + own — to replace a row with a successor, use :meth:`supersede_with` so the handoff is closed. + + ``session`` is required only when ``__retire_cascade__`` is non-empty (the cascade issues a + bulk UPDATE per child relationship). A leaf row with no declared cascades retires without one. + The cascade runs even when this row was already closed, so a half-finished prior retire (row + closed, links not) still converges. + """ + stamp = at if at is not None else func.now() + if self.valid_to is None: + self.valid_to = stamp + + if not self.__retire_cascade__: + return + if session is None: + raise ValueError( + f"{type(self).__name__}.retire() needs a session to cascade-retire {self.__retire_cascade__}." + ) + + mapper = sa_inspect(type(self)) + assert mapper is not None # a mapped class always inspects to a Mapper + for name in self.__retire_cascade__: + rel = mapper.relationships[name] + child_cls = rel.mapper.class_ + conditions = [remote == getattr(self, local.key) for local, remote in rel.local_remote_pairs] + session.execute(child_cls.retire_live_where(*conditions, at=at)) + + @classmethod + def retire_live_where(cls, *conditions: ColumnElement[bool], at: Optional[datetime] = None) -> Update: + """An UPDATE that retires the live rows matching ``conditions`` (closes their valid_to). + Only currently-live rows are touched, so already-retired rows keep their original valid_to. + Pass ``at`` to stamp an explicit ``valid_to`` (used by :meth:`supersede`); defaults to + ``func.now()``. + + This is a *withdrawal* primitive — retiring live rows with no successor. To replace live rows + with new ones, use :meth:`supersede_live_where` so retire and insert share one timestamp. + + Usage: ``session.execute(Model.retire_live_where(Model.foo == bar))``. + """ + return ( + update(cls).where(cls.valid_to.is_(None), *conditions).values(valid_to=at if at is not None else func.now()) + ) + + def supersede_with(self: T, session: Session, replacement: T, at: Optional[datetime] = None) -> T: + """Retire this row (cascading to its child links) and insert ``replacement``, stamping the + retired row's ``valid_to`` and the replacement's ``valid_from`` with one timestamp so the + handoff has no gap — independent of how many transactions or how much wall-clock separate + them. The single-row counterpart to :meth:`supersede_live_where`, for superseding one known + row that may carry ``__retire_cascade__`` children. + + Flushes the retire before the insert: the partial unique index (one live row per natural key) + is checked per statement and the unit of work emits INSERTs before UPDATEs, so without the + flush the replacement and the not-yet-closed original would both be live and trip the index. + """ + if at is None: + at = session.scalar(select(func.now())) + assert at is not None # SELECT now() always returns a timestamp + self.retire(session, at=at) + session.flush() + replacement.valid_from = at + session.add(replacement) + return replacement + + @classmethod + def supersede_live_where( + cls, + session: Session, + new_rows: Sequence[T], + *conditions: ColumnElement[bool], + at: Optional[datetime] = None, + ) -> Sequence[T]: + """Retire the live rows matching ``conditions`` and insert ``new_rows``, stamping both sides + with one timestamp so every retired row's ``valid_to`` equals every new row's ``valid_from`` + — a gap-free handoff independent of transaction boundaries. The bulk counterpart to + :meth:`supersede_with` (and the supersede form of :meth:`retire_live_where`), for replacing a + *set* of live rows in one scope (e.g. a score set's derived links) with a freshly computed + set. + + The retire executes immediately (before the inserts flush), so a reused natural key does not + trip the partial unique index. Only for leaf ``ValidTime`` rows: a bulk retire cannot fire + ``__retire_cascade__``, so this refuses to run on a class that declares one — use + :meth:`supersede_with` per row there. + """ + if cls.__retire_cascade__: + raise ValueError( + f"{cls.__name__} declares __retire_cascade__; bulk supersede cannot cascade to children. " + "Use supersede_with per row so child links retire too." + ) + if at is None: + at = session.scalar(select(func.now())) + assert at is not None # SELECT now() always returns a timestamp + session.execute(cls.retire_live_where(*conditions, at=at)) + for row in new_rows: + row.valid_from = at + session.add_all(new_rows) + return new_rows diff --git a/src/mavedb/deps.py b/src/mavedb/deps.py index 8e4ce5106..712a6e5a3 100644 --- a/src/mavedb/deps.py +++ b/src/mavedb/deps.py @@ -1,4 +1,3 @@ -import os from typing import Any, AsyncGenerator, Generator from arq import ArqRedis, create_pool @@ -6,7 +5,7 @@ from cdot.hgvs.dataproviders import RESTDataProvider from sqlalchemy.orm import Session -from mavedb.data_providers.services import cdot_rest +from mavedb.data_providers.services import cdot_rest, seqrepo from mavedb.db.session import SessionLocal from mavedb.worker.settings import RedisWorkerSettings @@ -33,5 +32,4 @@ def hgvs_data_provider() -> RESTDataProvider: def get_seqrepo() -> SeqRepo: - seqrepo_dir = os.environ.get("HGVS_SEQREPO_DIR", "/seqrepo") - return SeqRepo(seqrepo_dir) + return seqrepo() diff --git a/src/mavedb/lib/hgvs.py b/src/mavedb/lib/hgvs.py new file mode 100644 index 000000000..5826e67c3 --- /dev/null +++ b/src/mavedb/lib/hgvs.py @@ -0,0 +1,94 @@ +import re +from typing import Optional + +# Coordinate prefix of an HGVS variant description: a single type letter plus a dot +# (g. c. n. m. r. p. ...), capturing the prefix and the remaining description separately. +_HGVS_COORD_PREFIX = re.compile(r"^([a-z]\.)(.+)$") + + +def extract_accession(hgvs_string: str) -> str: + """Extract the reference accession from an HGVS string, or return empty string if it cannot + be parsed. + + This function makes no assumptions about the structure of the accession (e.g. whether it starts with "NM_" + or "NC_") and is robust to extra whitespace. It simply returns the substring before the first colon, which + is the standard HGVS separator between the accession and the variant description. Callers must validate + the accession format separately if needed. + """ + token = (hgvs_string or "").strip() + if ":" not in token: + return "" + + return token.split(":", 1)[0].strip() + + +_HGVS_P_PREDICTION = re.compile(r":p\.\((.+)\)\s*$") + + +def strip_protein_prediction_parens(hgvs_p: str) -> str: + """Unwrap the prediction parentheses from a protein HGVS: ``p.(Ala222Val)`` -> ``p.Ala222Val``. + + Forward translation (``c_to_p``) emits a *predicted* consequence in parentheses. ga4gh + translates either form fine, but the parens are noise we don't want in the stored HGVS -- + they denote inference, not a different variant -- so we normalize to the bare form for + consistency. A string with no prediction parens is returned unchanged. + """ + return _HGVS_P_PREDICTION.sub(r":p.\1", hgvs_p) + + +def split_cis_phased_hgvs(hgvs_string: str) -> list[str]: + """Split a cis-phased multivariant HGVS expression into fully-qualified component strings. + + The reverse-translate-variants tool emits the non-adjacent component substitutions of a + single codon change as one bracketed genomic expression (``NC_000001.11:g.[123A>G;125T>C]``) + whenever they are too far apart on the genome to collapse into a single delins. ga4gh's + AlleleTranslator only ever produces a single Allele, so each component must be translated + independently and recombined into a VRS CisPhasedBlock. + + Each returned component carries the original accession and coordinate prefix + (``NC_000001.11:g.123A>G``). A non-bracketed expression is returned unchanged as a + single-element list, so callers can treat both cases uniformly. + + Unlike a bare mavehgvs split, the accession is preserved: the components feed straight + into VRS translation, which requires a reference accession to resolve positions. + """ + if "[" not in hgvs_string: + return [hgvs_string] + + accession, _, remainder = hgvs_string.partition(":") + prefix = remainder[: remainder.index("[")] # e.g. "g." / "c." + inner = remainder[remainder.index("[") + 1 : remainder.rindex("]")] + return [f"{accession}:{prefix}{component}" for component in inner.split(";") if component] + + +def join_cis_phased_hgvs(components: list[str]) -> Optional[str]: + """Join cis-phased component HGVS strings into one bracketed expression. + + Inverse of :func:`split_cis_phased_hgvs`: ``["NC_…:g.123A>G", "NC_…:g.125T>C"]`` becomes + ``"NC_…:g.[123A>G;125T>C]"``. A single component is returned unchanged. + + Returns ``None`` when the components do not share a single accession and coordinate prefix — + they are then not expressible as one cis-phased block (e.g. members on different sequences). + """ + if not components: + return None + if len(components) == 1: + return components[0] + + accessions: set[str] = set() + prefixes: set[str] = set() + descriptions: list[str] = [] + for component in components: + accession, separator, remainder = component.partition(":") + match = _HGVS_COORD_PREFIX.match(remainder) + if not separator or not match: + return None + + accessions.add(accession) + prefixes.add(match.group(1)) + descriptions.append(match.group(2)) + + if len(accessions) != 1 or len(prefixes) != 1: + return None + + return f"{accessions.pop()}:{prefixes.pop()}[{';'.join(descriptions)}]" diff --git a/src/mavedb/lib/mapping/schema.py b/src/mavedb/lib/mapping/schema.py index 939b60401..02d377780 100644 --- a/src/mavedb/lib/mapping/schema.py +++ b/src/mavedb/lib/mapping/schema.py @@ -7,9 +7,31 @@ """ from datetime import datetime +from enum import Enum from typing import Any, Optional, TypedDict +class MappingOutcome(str, Enum): + """Per-record outcome stamped by dcd-mapping on every emitted score annotation. + + Mirrors ``dcd_mapping.schemas.MappingOutcome``. Lets a benign absence of a VRS allele + be told apart from a genuine failure -- a distinction ``error_message`` alone cannot + make (benign outcomes leave it ``None``). ``MAPPED`` is a success; ``INTRONIC`` and + ``NO_PROTEIN_CONSEQUENCE`` are benign skips (no allele, not failures); ``FAILED`` is the + only genuine failure. + """ + + MAPPED = "mapped" + INTRONIC = "intronic" + NO_PROTEIN_CONSEQUENCE = "no_protein_consequence" + FAILED = "failed" + + @property + def is_benign_absence(self) -> bool: + """True for outcomes that legitimately produce no allele yet are not failures.""" + return self in (MappingOutcome.INTRONIC, MappingOutcome.NO_PROTEIN_CONSEQUENCE) + + class GeneInfo(TypedDict, total=False): hgnc_symbol: Optional[str] selection_method: Optional[str] diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 7bb574d91..93b7c67e9 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -409,9 +409,7 @@ def fetch_score_set_search_filter_options( controlled_keywords_counter_list = [] for key, label_counter in controlled_keywords_counter.items(): for label, count in label_counter.items(): - controlled_keywords_counter_list.append( - ControlledKeywordFilterOption(key=key, value=label, count=count) - ) + controlled_keywords_counter_list.append(ControlledKeywordFilterOption(key=key, value=label, count=count)) logger.debug(msg="Score set search filter options were fetched.", extra=logging_context()) @@ -642,11 +640,11 @@ def get_score_set_variants_as_csv( namespaced_score_set_columns[ns] = ["clinical_significance", "clinical_review_status"] need_mappings = ( - include_post_mapped_hgvs - or "clingen" in namespaces - or "vep" in namespaces - or "gnomad" in namespaces - or bool(clinvar_namespaces) + include_post_mapped_hgvs + or "clingen" in namespaces + or "vep" in namespaces + or "gnomad" in namespaces + or bool(clinvar_namespaces) ) need_gnomad = "gnomad" in namespaces @@ -861,7 +859,9 @@ def variant_to_csv_row( value = str(mapping.hgvs_g) if mapping and mapping.hgvs_g else na_rep if value == na_rep: fallback_hgvs = ( - get_hgvs_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None + get_hgvs_from_post_mapped(mapping.post_mapped, combine_cis=True) + if mapping and mapping.post_mapped + else None ) if fallback_hgvs is not None and is_hgvs_g(fallback_hgvs): value = fallback_hgvs @@ -872,7 +872,9 @@ def variant_to_csv_row( value = str(mapping.hgvs_p) if mapping and mapping.hgvs_p else na_rep if value == na_rep: fallback_hgvs = ( - get_hgvs_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None + get_hgvs_from_post_mapped(mapping.post_mapped, combine_cis=True) + if mapping and mapping.post_mapped + else None ) if fallback_hgvs is not None and is_hgvs_p(fallback_hgvs): value = fallback_hgvs diff --git a/src/mavedb/lib/variant_translations.py b/src/mavedb/lib/variant_translations.py index c24c5ed95..701cc17d1 100644 --- a/src/mavedb/lib/variant_translations.py +++ b/src/mavedb/lib/variant_translations.py @@ -7,10 +7,12 @@ from typing import cast +from sqlalchemy import select from sqlalchemy.dialects.postgresql import insert from sqlalchemy.engine import CursorResult from sqlalchemy.orm import Session +from mavedb.models.allele import Allele from mavedb.models.variant_translation import VariantTranslation @@ -39,3 +41,21 @@ def upsert_variant_translations(db: Session, translations: list[tuple[str, str]] created = result.rowcount existing = len(unique) - created return created, existing + + +def get_or_create_allele(db: Session, allele_draft: Allele) -> Allele: + """Return the existing Allele matching ``allele_draft``'s vrs_digest, else add the draft. + + This is a get-or-create, not an upsert: a matching row is returned untouched, and on + a miss the draft is added to the session (not flushed). The draft is never used to + update an existing row. + + NOTE: This function does not persist the returned Allele to the database; the caller + is responsible for committing the session. + """ + existing = db.scalars(select(Allele).where(Allele.vrs_digest == allele_draft.vrs_digest)).one_or_none() + if existing is not None: + return existing + + db.add(allele_draft) + return allele_draft diff --git a/src/mavedb/lib/variants.py b/src/mavedb/lib/variants.py index bef9725c0..a941006cf 100644 --- a/src/mavedb/lib/variants.py +++ b/src/mavedb/lib/variants.py @@ -1,6 +1,7 @@ import re from typing import Any, Optional +from mavedb.lib.hgvs import join_cis_phased_hgvs from mavedb.lib.validation.constants.general import hgvs_columns from mavedb.models.target_gene import TargetGene from mavedb.models.variant import Variant @@ -25,7 +26,15 @@ def hgvs_from_vrs_allele(allele: dict) -> str: raise KeyError("Invalid VRS allele structure. Expected 'expressions'.") -def get_hgvs_from_post_mapped(post_mapped_vrs: Optional[Any]) -> Optional[str]: +def get_hgvs_from_post_mapped(post_mapped_vrs: Optional[Any], *, combine_cis: bool = False) -> Optional[str]: + """Extract a single HGVS string from a post-mapped VRS object. + + Multi-variant blocks (Haplotype/CisPhasedBlock) are cis-phased, so their members combine + into one bracketed expression (``NC_…:g.[a;b]``) when ``combine_cis`` is set. It defaults + off because some consumers cannot yet handle a bracketed expression — notably ClinGen + submission, which has no single CAID for a multi-variant cis block (see + https://github.com/VariantEffect/mavedb-api/issues/764). + """ if not post_mapped_vrs: return None @@ -40,13 +49,9 @@ def get_hgvs_from_post_mapped(post_mapped_vrs: Optional[Any]) -> Optional[str]: if len(variations_hgvs) == 0: return None - # raise ValueError(f"No variations found in variant {variant_urn}.") - # TODO (https://github.com/VariantEffect/mavedb-api/issues/468) In a future version, we will be able to generate - # a combined HGVS string for haplotypes and cis phased blocks directly from mapper output. if len(variations_hgvs) > 1: - return None - # raise ValueError(f"Multiple variations found in variant {variant_urn}.") + return join_cis_phased_hgvs(variations_hgvs) if combine_cis else None return variations_hgvs[0] diff --git a/src/mavedb/lib/vrs_utils.py b/src/mavedb/lib/vrs_utils.py new file mode 100644 index 000000000..052c4b702 --- /dev/null +++ b/src/mavedb/lib/vrs_utils.py @@ -0,0 +1,186 @@ +"""VRS allele identification helpers. + +Centralizes the digest-correctness invariant for GA4GH VRS alleles: the +``ga4gh_identify`` Merkle tree caches sub-object digests on the object after +first identification, so any subsequent mutation (refgetAccession swap, +normalization, state coercion) leaves a stale id unless the cached digests +are cleared first. All allele identification in dcd_mapping must route +through :func:`identify_allele` so the digest is always recomputed from +current content. +""" + +from itertools import cycle +from typing import Any + +from ga4gh.core import ga4gh_identify +from ga4gh.vrs.extras.translator import AlleleTranslator +from ga4gh.vrs.models import ( + Allele, + CisPhasedBlock, + LiteralSequenceExpression, + ReferenceLengthExpression, + SequenceLocation, +) +from ga4gh.vrs.normalize import normalize + +from mavedb.lib.hgvs import split_cis_phased_hgvs + + +def translate_hgvs_to_vrs(hgvs: str, translator: AlleleTranslator) -> Allele: + """Convert HGVS variation description to VRS object. + + The AlleleTranslator is supplied by the caller and reused across calls. ga4gh's + AlleleTranslator opens a UTA connection lazily on first translate (via HgvsTools) + and holds it for its lifetime, so constructing one per call opens — and leaks — a + UTA connection per variant, exhausting the server's slot budget. Build one per + job/worker and pass it in. + + :param hgvs: MAVE-HGVS variation string + :param translator: caller-owned AlleleTranslator backed by a sequence/refget proxy + :return: Corresponding VRS allele as a Pydantic class + """ + # coerce tmp HGVS string into formally correct term + if hgvs.startswith("NC_") and ":c." in hgvs: + hgvs = hgvs.replace(":c.", ":g.") + + allele: Allele = translator.translate_from(hgvs, "hgvs", do_normalize=False) + + if ( + not isinstance(allele.location, SequenceLocation) + or not isinstance(allele.location.start, int) + or not isinstance(allele.location.end, int) + or not isinstance(allele.state, LiteralSequenceExpression) + ): + raise ValueError + + return allele + + +def translate_hgvs_to_variation(hgvs: str, translator: AlleleTranslator) -> Allele | CisPhasedBlock: + """Translate an HGVS expression — possibly a cis-phased multivariant — into a VRS object. + + Mirrors dcd_mapping's ``vrs_map._construct_vrs_allele``: each component HGVS is translated + to an Allele independently; a single component returns a bare Allele, while two or more are + wrapped in a CisPhasedBlock. The reverse-translation job emits bracketed genomic forms + (``g.[a;b]``) for non-adjacent codon components that ga4gh's AlleleTranslator cannot + translate directly, so splitting and recombining is the only way to represent them. + + The block's GA4GH digest is order-independent, so the same biological cis-phased set always + identifies to one ``ga4gh:CPB.`` digest and dedups to a single row regardless of component + ordering. + + Every component is normalized and re-identified through :func:`normalize_and_identify` + before use. ``translate_from`` is called with ``do_normalize=False`` and stamps ``id`` via + plain ``ga4gh_identify`` on the reused translator, so without this step a component carries a + non-canonical digest computed from the unnormalized object and from the Merkle tree's cached + sub-object digests — distinct biological variants can then collide onto one ``vrs_digest`` and + be merged by the digest-keyed ``get_or_create_allele``. Normalizing here also keeps RT digests + consistent with the mapper's, which is what lets the same allele dedup across sources. + + :param hgvs: a single- or cis-phased-multivariant HGVS string + :param translator: caller-owned AlleleTranslator reused across calls + :return: an Allele for a single variant, or a CisPhasedBlock for a cis-phased set + """ + members = [ + normalize_and_identify(translate_hgvs_to_vrs(component, translator), translator.data_proxy) + for component in split_cis_phased_hgvs(hgvs) + ] + if len(members) == 1: + return members[0] + + block = CisPhasedBlock(members=members) # type: ignore[call-arg] + block.id = identify_variation(block) + return block + + +def identify_allele(allele: Allele) -> str: + """Clear cached digests and return a fresh GA4GH identifier for *allele*. + + ``ga4gh_identify`` is a Merkle-tree: it calls ``get_or_create_digest`` on + sub-objects, returning any cached value without recomputing. Clearing both + the location digest and the allele digest first ensures the id is always + derived from the current object content — not from a value set before a + refgetAccession mutation or normalization. + + ``id`` is recomputed as well: ``ga4gh_identify`` returns a non-empty ``id`` as-is + (its default ``in_place`` only fills an *empty* id), so an allele that already + carries one — e.g. stamped by an ``identify=True`` ``AlleleTranslator`` — would + otherwise keep its stale id even with the digests cleared. Use in_place="always" + to force it to be recomputed from the content-derived digest. + """ + if isinstance(allele.location, SequenceLocation): + allele.location.digest = None + + allele.digest = None + digest = ga4gh_identify(allele, in_place="always") + if digest is None: + raise ValueError("Failed to compute GA4GH identifier for allele") # noqa: EM101 + + return digest + + +def identify_variation(variation: Allele | CisPhasedBlock) -> str: + """Clear cached digests and return a fresh GA4GH id for an Allele or CisPhasedBlock. + + Generalizes :func:`identify_allele` to cis-phased blocks. A block's Merkle digest is + derived from its members' digests, so a stale member digest would silently propagate into + the block id. Clear every member (and its location) plus the block itself before + identifying so the id always reflects current content. + """ + if isinstance(variation, Allele): + return identify_allele(variation) + + for member in variation.members: + if isinstance(member, Allele): + if isinstance(member.location, SequenceLocation): + member.location.digest = None + member.digest = None + + variation.digest = None + digest = ga4gh_identify(variation, in_place="always") + if digest is None: + raise ValueError("Failed to compute GA4GH identifier for variation") # noqa: EM101 + + return digest + + +def normalize_and_identify(allele: Allele, data_proxy: Any) -> Allele: + """Normalize *allele* and stamp it with a freshly computed GA4GH digest. + + Pairs the finalize steps every VRS allele construction path needs. + Routing identification through :func:`identify_allele` (rather than + ``ga4gh_identify`` directly) is the invariant that protects against the + Merkle-tree's stale-digest behavior after mutation -- so any allele + construction site that bypasses this helper risks reintroducing the + stale-digest bug. + + Normalization can leave an indel as a ``ReferenceLengthExpression``; this coerces + it to a ``LiteralSequenceExpression`` so the result matches dcd_mapping's + authoritative alleles (``vrs_map._rle_to_lse``), which always store LSE. Without + the coercion the same biological indel hashes to two digests -- RLE here, LSE from + the mapper -- so reverse translation's regenerated genomic form fails to dedup + against the authoritative row and a duplicate allele is linked. + """ + allele = normalize(allele, data_proxy=data_proxy) + if isinstance(allele.state, ReferenceLengthExpression): + allele.state = _rle_to_lse(allele.state, allele.location, data_proxy) + allele.id = identify_allele(allele) + return allele + + +def _rle_to_lse( + rle: ReferenceLengthExpression, location: SequenceLocation, data_proxy: Any +) -> LiteralSequenceExpression: + """Coerce a ReferenceLengthExpression to an equivalent LiteralSequenceExpression. + + Mirrors ``dcd_mapping:vrs_map.py::_rle_to_lse`` byte-for-byte so an allele built here + hashes identically to the mapper's authoritative allele for the same variant. Derives + the literal sequence by tiling the repeat subunit out to ``rle.length``. + """ + sequence_id = location.sequenceReference.refgetAccession + start: int = location.start + end = start + rle.repeatSubunitLength + subsequence = data_proxy.get_sequence(f"ga4gh:{sequence_id}", start, end) + c = cycle(subsequence) + derived_sequence = "".join(next(c) for _ in range(rle.length)) + return LiteralSequenceExpression(sequence=derived_sequence) diff --git a/src/mavedb/lib/workflow/definitions.py b/src/mavedb/lib/workflow/definitions.py index 056ff5309..3d3b70511 100644 --- a/src/mavedb/lib/workflow/definitions.py +++ b/src/mavedb/lib/workflow/definitions.py @@ -20,6 +20,16 @@ def annotation_pipeline_job_definitions( [(upstream_mapping_key, DependencyType.SUCCESS_REQUIRED)] if upstream_mapping_key else [] ) return [ + { + "key": "reverse_translate_variants_for_score_set", + "function": "reverse_translate_variants_for_score_set", + "type": JobType.MAPPED_VARIANT_ANNOTATION, + "params": { + "correlation_id": None, # Required param to be filled in at runtime + "score_set_id": None, # Required param to be filled in at runtime + }, + "dependencies": mapping_dep, + }, { "key": "submit_score_set_mappings_to_car", "function": "submit_score_set_mappings_to_car", @@ -29,7 +39,7 @@ def annotation_pipeline_job_definitions( "score_set_id": None, # Required param to be filled in at runtime "updater_id": None, # Required param to be filled in at runtime }, - "dependencies": mapping_dep, + "dependencies": [("reverse_translate_variants_for_score_set", DependencyType.SUCCESS_REQUIRED)], }, { "key": "warm_clingen_cache", diff --git a/src/mavedb/models/__init__.py b/src/mavedb/models/__init__.py index 2f0d65b48..35c1ba0a5 100644 --- a/src/mavedb/models/__init__.py +++ b/src/mavedb/models/__init__.py @@ -1,6 +1,7 @@ __all__ = [ "access_key", "acmg_classification", + "allele", "collection", "clinical_control", "controlled_keyword", @@ -16,6 +17,8 @@ "legacy_keyword", "license", "mapped_variant", + "mapping_record", + "mapping_record_allele", "pipeline", "publication_identifier", "published_variant", diff --git a/src/mavedb/models/allele.py b/src/mavedb/models/allele.py new file mode 100644 index 000000000..54ce07e50 --- /dev/null +++ b/src/mavedb/models/allele.py @@ -0,0 +1,59 @@ +from datetime import date +from typing import TYPE_CHECKING, Any, Optional + +from sqlalchemy import Column, Date, Index, Integer, String, UniqueConstraint, func +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.ext.hybrid import hybrid_property +from sqlalchemy.orm import Mapped, relationship + +from mavedb.db.base import Base +from mavedb.lib.hgvs import extract_accession + +if TYPE_CHECKING: + from .mapping_record_allele import MappingRecordAllele + + +class Allele(Base): + __tablename__ = "alleles" + + id: Mapped[int] = Column(Integer, primary_key=True) + + vrs_digest = Column(String, nullable=False) + level = Column(String(length=16), nullable=False) + + hgvs_g = Column(String, nullable=True) + hgvs_c = Column(String, nullable=True) + hgvs_p = Column(String, nullable=True) + + clingen_allele_id = Column(String, nullable=True) + post_mapped: Optional[Any] = Column(JSONB(none_as_null=True), nullable=True) + + created_at = Column(Date, nullable=False, default=date.today) + updated_at = Column(Date, nullable=False, default=date.today, onupdate=date.today) + + @hybrid_property + def transcript(self) -> str: + """Reference accession of the populated HGVS column (derived, not stored). + + Exactly one of hgvs_g/hgvs_c/hgvs_p is populated per allele, and the transcript is that + string's accession. Derived rather than stored so it cannot drift from the HGVS column + it duplicates. + """ + return extract_accession(self.hgvs_g or self.hgvs_c or self.hgvs_p or "") + + @transcript.inplace.expression + @classmethod + def _transcript_expression(cls): + return func.split_part(func.coalesce(cls.hgvs_g, cls.hgvs_c, cls.hgvs_p), ":", 1) + + mapping_record_links: Mapped[list["MappingRecordAllele"]] = relationship( + "MappingRecordAllele", + back_populates="allele", + ) + + __table_args__ = ( + UniqueConstraint("vrs_digest", name="uq_alleles_vrs_digest"), + Index("ix_alleles_vrs_digest", "vrs_digest"), + Index("ix_alleles_level", "level"), + Index("ix_alleles_clingen_allele_id", "clingen_allele_id"), + ) diff --git a/src/mavedb/models/enums/annotation_type.py b/src/mavedb/models/enums/annotation_type.py index b1595347b..a3bbd1693 100644 --- a/src/mavedb/models/enums/annotation_type.py +++ b/src/mavedb/models/enums/annotation_type.py @@ -3,6 +3,7 @@ class AnnotationType(str, Enum): VRS_MAPPING = "vrs_mapping" + CROSS_LEVEL_TRANSLATION = "cross_level_translation" CLINGEN_ALLELE_ID = "clingen_allele_id" MAPPED_HGVS = "mapped_hgvs" VARIANT_TRANSLATION = "variant_translation" diff --git a/src/mavedb/models/mapping_record.py b/src/mavedb/models/mapping_record.py new file mode 100644 index 000000000..132a69b8b --- /dev/null +++ b/src/mavedb/models/mapping_record.py @@ -0,0 +1,96 @@ +from datetime import date +from typing import TYPE_CHECKING, Any, Optional + +from sqlalchemy import Boolean, Column, Date, Enum, ForeignKey, Index, Integer, String, func, text +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.ext.hybrid import hybrid_property +from sqlalchemy.orm import Mapped, relationship + +from mavedb.db.base import Base +from mavedb.db.mixins import ValidTime +from mavedb.lib.hgvs import extract_accession +from mavedb.models.enums.annotation_layer import AnnotationLayer + +if TYPE_CHECKING: + from .mapping_record_allele import MappingRecordAllele + from .target_gene_mapping import TargetGeneMapping + from .variant import Variant + + +class MappingRecord(ValidTime, Base): + __tablename__ = "mapping_records" + + id: Mapped[int] = Column(Integer, primary_key=True) + + variant_id = Column(Integer, ForeignKey("variants.id"), nullable=False) + variant: Mapped["Variant"] = relationship("Variant", back_populates="mapping_records") + + # Digest of the pre-mapped (assayed-level) VRS representation, indexed for + # cross-score-set dedup lookups. + vrs_digest = Column(String, nullable=True) + pre_mapped: Optional[Any] = Column(JSONB(none_as_null=True), nullable=True) + + # Level at which the variant was *assayed* — distinct from alignment_level (QC). + assay_level = Column(String(length=16), nullable=False) + hgvs_assay_level = Column(String, nullable=True) + + @hybrid_property + def transcript(self) -> str: + """Reference accession of the assay-level HGVS (derived, not stored). + + Derived rather than stored so it cannot drift from hgvs_assay_level, which already + carries the accession. + """ + return extract_accession(self.hgvs_assay_level or "") + + @transcript.inplace.expression + @classmethod + def _transcript_expression(cls): + return func.split_part(cls.hgvs_assay_level, ":", 1) + + mapping_api_version = Column(String, nullable=False) + # Domain data: the date the mapping was performed (from the dcd-mapping run), surfaced to + # users — distinct from valid_from, which is when this version became the live record. + mapped_date = Column(Date, nullable=False, default=date.today) + vrs_version = Column(String, nullable=True) + + # valid_from / valid_to and the derived `current` come from ValidTime: a re-map retires the + # prior live record (closes its valid_to) and inserts a new live one, so mapping history is + # retained and point-in-time queries are a single predicate. + + # Per-mapping QC fields from dcd-mapping. + alignment_level = Column( + Enum(AnnotationLayer, create_constraint=True, length=16, native_enum=False, validate_strings=True), + nullable=True, + ) + at_mismatched_locus = Column(Boolean, nullable=True) + near_gap = Column(Boolean, nullable=True) + + target_gene_mapping_id = Column(Integer, ForeignKey("target_gene_mappings.id"), index=True, nullable=True) + target_gene_mapping: Mapped[Optional["TargetGeneMapping"]] = relationship( + "TargetGeneMapping", back_populates="mapping_records" + ) + + allele_links: Mapped[list["MappingRecordAllele"]] = relationship( + "MappingRecordAllele", + back_populates="mapping_record", + cascade="all, delete-orphan", + ) + + # Retiring a record retires its live allele links too (both the authoritative link and any + # derived links a reverse-translation run attached). See ValidTime.__retire_cascade__. + __retire_cascade__ = ("allele_links",) + + __table_args__ = ( + Index("ix_mapping_records_vrs_digest", "vrs_digest"), + Index("ix_mapping_records_variant_id", "variant_id"), + # At most one live mapping record per variant — promotes to the database the invariant the + # mapping job enforces in app code (it retires the prior live record before inserting a new + # one). Superseded versions remain with a closed valid_to for point-in-time queries. + Index( + "uq_mapping_records_current", + "variant_id", + unique=True, + postgresql_where=text("valid_to IS NULL"), + ), + ) diff --git a/src/mavedb/models/mapping_record_allele.py b/src/mavedb/models/mapping_record_allele.py new file mode 100644 index 000000000..cda8bf0f3 --- /dev/null +++ b/src/mavedb/models/mapping_record_allele.py @@ -0,0 +1,60 @@ +from typing import TYPE_CHECKING + +from sqlalchemy import Boolean, Column, ForeignKey, Index, Integer, text +from sqlalchemy.orm import Mapped, relationship + +from mavedb.db.base import Base +from mavedb.db.mixins import ValidTime + +if TYPE_CHECKING: + from .allele import Allele + from .mapping_record import MappingRecord + + +class MappingRecordAllele(ValidTime, Base): + __tablename__ = "mapping_record_alleles" + + id: Mapped[int] = Column(Integer, primary_key=True) + mapping_record_id: Mapped[int] = Column( + Integer, + ForeignKey("mapping_records.id", ondelete="CASCADE"), + nullable=False, + ) + allele_id: Mapped[int] = Column( + Integer, + ForeignKey("alleles.id", ondelete="RESTRICT"), + nullable=False, + ) + # Provenance: ``True`` when this allele was the assay's actual measurement + # for this mapping record (sourced from vrs_map, with real alignment QC), + # ``False`` when it was derived by the cross-level or same-level + # synonymous translator. The flag lives on the link rather than on + # ``Allele`` because the same VRS allele can be authoritative for one + # mapping record and translator-derived for another. + is_authoritative = Column(Boolean, nullable=False, default=False) + + mapping_record: Mapped["MappingRecord"] = relationship("MappingRecord", back_populates="allele_links") + allele: Mapped["Allele"] = relationship("Allele", back_populates="mapping_record_links") + + __table_args__ = ( + Index( + "ix_mapping_record_alleles_mapping_record_id", + "mapping_record_id", + ), + Index( + "ix_mapping_record_alleles_allele_id", + "allele_id", + ), + # At most one live link per (mapping_record, allele). The job retires (closes valid_to) + # rather than deletes, so superseded links remain for point-in-time queries; only the + # live row participates in this constraint. A derived link is never written when an + # authoritative one already exists for the pair, so is_authoritative is intentionally + # absent from the key — a record links a given allele once. + Index( + "uq_mapping_record_alleles_live", + "mapping_record_id", + "allele_id", + unique=True, + postgresql_where=text("valid_to IS NULL"), + ), + ) diff --git a/src/mavedb/models/target_gene.py b/src/mavedb/models/target_gene.py index 671a4380f..5419c3478 100644 --- a/src/mavedb/models/target_gene.py +++ b/src/mavedb/models/target_gene.py @@ -23,10 +23,10 @@ class TargetGene(Base): __tablename__ = "target_genes" - id = Column(Integer, primary_key=True) + id: Mapped[int] = Column(Integer, primary_key=True) name = Column(String, nullable=False) - category = Column( + category: Mapped[TargetCategory] = Column( Enum(TargetCategory, create_constraint=True, length=32, native_enum=False, validate_strings=True), nullable=False, ) diff --git a/src/mavedb/models/target_gene_mapping.py b/src/mavedb/models/target_gene_mapping.py index 417f249d4..52761e34b 100644 --- a/src/mavedb/models/target_gene_mapping.py +++ b/src/mavedb/models/target_gene_mapping.py @@ -23,15 +23,16 @@ if TYPE_CHECKING: from mavedb.models.mapped_variant import MappedVariant + from mavedb.models.mapping_record import MappingRecord from mavedb.models.target_gene import TargetGene class TargetGeneMapping(Base): __tablename__ = "target_gene_mappings" - id = Column(Integer, primary_key=True) + id: Mapped[int] = Column(Integer, primary_key=True) - target_gene_id = Column(Integer, ForeignKey("target_genes.id"), nullable=False, index=True) + target_gene_id: Mapped[int] = Column(Integer, ForeignKey("target_genes.id"), nullable=False, index=True) target_gene: Mapped["TargetGene"] = relationship("TargetGene", back_populates="target_gene_mappings") alignment_level = Column( @@ -80,4 +81,9 @@ class TargetGeneMapping(Base): back_populates="target_gene_mapping", ) + mapping_records: Mapped[list["MappingRecord"]] = relationship( + "MappingRecord", + back_populates="target_gene_mapping", + ) + __table_args__ = (Index("ix_target_gene_mappings_target_alignment", "target_gene_id", "alignment_level"),) diff --git a/src/mavedb/models/variant.py b/src/mavedb/models/variant.py index 59b6e729c..dd46c430c 100644 --- a/src/mavedb/models/variant.py +++ b/src/mavedb/models/variant.py @@ -9,13 +9,14 @@ if TYPE_CHECKING: from .mapped_variant import MappedVariant + from .mapping_record import MappingRecord from .score_set import ScoreSet class Variant(Base): __tablename__ = "variants" - id = Column(Integer, primary_key=True) + id: Mapped[int] = Column(Integer, primary_key=True) urn = Column(String(64), index=True, nullable=True, unique=True) data = Column(JSONB, nullable=False) @@ -35,5 +36,9 @@ class Variant(Base): back_populates="variant", cascade="all, delete-orphan" ) + mapping_records: Mapped[List["MappingRecord"]] = relationship( + back_populates="variant", cascade="all, delete-orphan" + ) + # Bidirectional relationship with ScoreCalibrationFunctionalClassification is left # purposefully undefined for performance reasons. diff --git a/src/mavedb/models/variant_annotation_status.py b/src/mavedb/models/variant_annotation_status.py index f39c47a64..357d6a6e6 100644 --- a/src/mavedb/models/variant_annotation_status.py +++ b/src/mavedb/models/variant_annotation_status.py @@ -93,7 +93,7 @@ class VariantAnnotationStatus(Base): # FK index for job_run_id — needed for CASCADE deletes on job_runs Index("ix_variant_annotation_status_job_run_id", "job_run_id"), CheckConstraint( - "annotation_type IN ('vrs_mapping', 'clingen_allele_id', 'mapped_hgvs', 'variant_translation', 'gnomad_allele_frequency', 'clinvar_control', 'vep_functional_consequence', 'ldh_submission')", + "annotation_type IN ('vrs_mapping', 'cross_level_translation', 'clingen_allele_id', 'mapped_hgvs', 'variant_translation', 'gnomad_allele_frequency', 'clinvar_control', 'vep_functional_consequence', 'ldh_submission')", name="ck_variant_annotation_type_valid", ), CheckConstraint( diff --git a/src/mavedb/worker/jobs/__init__.py b/src/mavedb/worker/jobs/__init__.py index e421bbad2..488398117 100644 --- a/src/mavedb/worker/jobs/__init__.py +++ b/src/mavedb/worker/jobs/__init__.py @@ -33,11 +33,13 @@ from mavedb.worker.jobs.variant_processing.mapping import ( map_variants_for_score_set, ) +from mavedb.worker.jobs.variant_processing.reverse_translation import reverse_translate_variants_for_score_set __all__ = [ # Variant processing jobs "create_variants_for_score_set", "map_variants_for_score_set", + "reverse_translate_variants_for_score_set", # External service integration jobs "submit_score_set_mappings_to_car", "submit_score_set_mappings_to_ldh", diff --git a/src/mavedb/worker/jobs/external_services/clingen.py b/src/mavedb/worker/jobs/external_services/clingen.py index 639160ed5..5a65cc728 100644 --- a/src/mavedb/worker/jobs/external_services/clingen.py +++ b/src/mavedb/worker/jobs/external_services/clingen.py @@ -137,6 +137,9 @@ async def submit_score_set_mappings_to_car(ctx: dict, job_id: int, job_manager: variant_post_mapped_hgvs: dict[str, list[int]] = {} no_hgvs_count = 0 for mapped_variant_id, post_mapped in variant_post_mapped_objects: + # Intentionally not combine_cis=True: multi-variant cis-phased blocks have no single + # CAID, so they are skipped here pending ClinGen guidance on how to register them + # (https://github.com/VariantEffect/mavedb-api/issues/764). hgvs_for_post_mapped = get_hgvs_from_post_mapped(post_mapped) if not hgvs_for_post_mapped: @@ -357,6 +360,8 @@ async def submit_score_set_mappings_to_ldh(ctx: dict, job_id: int, job_manager: variant_content = [] variant_for_urn = {} for variant, mapped_variant in variant_objects: + # See the note above: cis-phased blocks are skipped here pending ClinGen guidance + # (https://github.com/VariantEffect/mavedb-api/issues/764). variation = get_hgvs_from_post_mapped(mapped_variant.post_mapped) if not variation: diff --git a/src/mavedb/worker/jobs/registry.py b/src/mavedb/worker/jobs/registry.py index 94cd76e32..c3fb8fd2f 100644 --- a/src/mavedb/worker/jobs/registry.py +++ b/src/mavedb/worker/jobs/registry.py @@ -32,6 +32,7 @@ from mavedb.worker.jobs.variant_processing import ( create_variants_for_score_set, map_variants_for_score_set, + reverse_translate_variants_for_score_set, ) # All job functions for ARQ worker @@ -39,6 +40,7 @@ # Variant processing jobs create_variants_for_score_set, map_variants_for_score_set, + reverse_translate_variants_for_score_set, # External service jobs submit_score_set_mappings_to_car, submit_score_set_mappings_to_ldh, @@ -100,6 +102,13 @@ "key": "map_variants_for_score_set", "type": JobType.VARIANT_MAPPING, }, + reverse_translate_variants_for_score_set: { + "dependencies": [], + "params": {"score_set_id": None, "correlation_id": None}, + "function": "reverse_translate_variants_for_score_set", + "key": "reverse_translate_variants_for_score_set", + "type": JobType.VARIANT_MAPPING, + }, submit_score_set_mappings_to_car: { "dependencies": [], "params": {"score_set_id": None, "correlation_id": None}, diff --git a/src/mavedb/worker/jobs/variant_processing/__init__.py b/src/mavedb/worker/jobs/variant_processing/__init__.py index a6df09753..240de5e28 100644 --- a/src/mavedb/worker/jobs/variant_processing/__init__.py +++ b/src/mavedb/worker/jobs/variant_processing/__init__.py @@ -10,8 +10,10 @@ from .mapping import ( map_variants_for_score_set, ) +from .reverse_translation import reverse_translate_variants_for_score_set __all__ = [ "create_variants_for_score_set", "map_variants_for_score_set", + "reverse_translate_variants_for_score_set", ] diff --git a/src/mavedb/worker/jobs/variant_processing/mapping.py b/src/mavedb/worker/jobs/variant_processing/mapping.py index efc8226b3..9e6a8da74 100644 --- a/src/mavedb/worker/jobs/variant_processing/mapping.py +++ b/src/mavedb/worker/jobs/variant_processing/mapping.py @@ -8,6 +8,7 @@ import asyncio import functools import logging +from collections import Counter from datetime import date from typing import Any @@ -23,13 +24,17 @@ ) from mavedb.lib.logging.context import format_raised_exception_info_as_dict from mavedb.lib.mapping import EXCLUDED_PREMAPPED_ANNOTATION_KEYS +from mavedb.lib.mapping.schema import MappingOutcome from mavedb.lib.types.workflow import JobExecutionOutcome +from mavedb.lib.variant_translations import get_or_create_allele from mavedb.lib.variants import get_hgvs_from_post_mapped +from mavedb.models.allele import Allele as AlleleDbModel from mavedb.models.enums.annotation_layer import AnnotationLayer from mavedb.models.enums.annotation_type import AnnotationType from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus, FailureCategory from mavedb.models.enums.mapping_state import MappingState -from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.mapping_record import MappingRecord +from mavedb.models.mapping_record_allele import MappingRecordAllele from mavedb.models.score_set import ScoreSet from mavedb.models.target_gene_mapping import TargetGeneMapping from mavedb.models.user import User @@ -130,7 +135,7 @@ async def map_variants_for_score_set(ctx: dict, job_id: int, job_manager: JobMan # Per-(target, alignment_level) QC records produced by the dcd-mapping API. # All records share the same tool_version because they come from a single run; - # we use that as the global ``mapping_api_version`` carried on each MappedVariant. + # we use that as the global ``mapping_api_version`` carried on each MappingRecord. target_mappings_payload = mapping_results.get("target_mappings") or [] tool_version = next( (tm.get("tool_version") for tm in target_mappings_payload if tm.get("tool_version")), @@ -218,7 +223,7 @@ async def map_variants_for_score_set(ctx: dict, job_id: int, job_manager: JobMan continue target_gene_mapping = TargetGeneMapping( - target_gene_id=target_gene.id, + target_gene=target_gene, alignment_level=AnnotationLayer.from_wire(level_value), preferred=bool(tm.get("preferred", False)), reference_assembly=tm.get("reference_assembly"), @@ -255,7 +260,10 @@ async def map_variants_for_score_set(ctx: dict, job_id: int, job_manager: JobMan total_variants = len(mapped_scores) job_manager.save_to_context({"total_variants_to_process": total_variants}) - successful_mapped_variants = 0 + # Tally every record by its typed outcome; the mapped/failed/benign buckets are + # derived from this after the loop. Keeping all four keys preserves the + # intronic-vs-no-protein distinction in logs. + outcome_counts: Counter[MappingOutcome] = Counter() logger.info( f"Processing {total_variants} mapped variants for score set {score_set.urn}.", extra=job_manager.logging_context(), @@ -268,23 +276,28 @@ async def map_variants_for_score_set(ctx: dict, job_id: int, job_manager: JobMan job_manager.save_to_context({"processing_variant": variant.id}) logger.debug(f"Processing variant {variant.id}.", extra=job_manager.logging_context()) - # there should only be one current mapped variant per variant id, so update old mapped variant to current = false + # Only allow one live MappingRecord per variant. The prior live record (if any) is + # superseded by the new version below via supersede_with, which retires it (cascading to + # its allele links) and inserts the new record under one timestamp. existing_mapped_variant = ( - job_manager.db.query(MappedVariant) - .filter(MappedVariant.variant_id == variant.id, MappedVariant.current.is_(True)) + job_manager.db.query(MappingRecord) + .filter(MappingRecord.variant_id == variant.id, MappingRecord.current) .one_or_none() ) - if existing_mapped_variant: job_manager.save_to_context({"existing_mapped_variant": existing_mapped_variant.id}) - existing_mapped_variant.current = False - job_manager.db.add(existing_mapped_variant) - logger.debug(msg="Set existing mapped variant to current = false.", extra=job_manager.logging_context()) - annotation_was_successful = mapped_score.get("pre_mapped") and mapped_score.get("post_mapped") - if annotation_was_successful: - successful_mapped_variants += 1 - job_manager.save_to_context({"successful_mapped_variants": successful_mapped_variants}) + # The typed outcome -- not allele presence -- decides success/benign/failure. + # Absent outcome means an older or malformed payload; fail fast. + raw_outcome = mapped_score.get("outcome") + if not raw_outcome: + raise NonexistentMappingResultsError( + f"ScoreAnnotation for variant {variant_urn!r} is missing its outcome." + ) + outcome = MappingOutcome(raw_outcome) + + outcome_counts[outcome] += 1 + job_manager.save_to_context({"outcome_counts": {o.value: n for o, n in outcome_counts.items()}}) # dcd-mapping guarantees both fields are set on every ScoreAnnotation, # including failed variants (the annotate step re-attributes failures to @@ -294,61 +307,125 @@ async def map_variants_for_score_set(ctx: dict, job_id: int, job_manager: JobMan score_alignment_level = mapped_score.get("alignment_level") if not score_target or not score_alignment_level: raise NonexistentMappingResultsError( - f"ScoreAnnotation for variant {variant_urn!r} is missing " - f"target_gene_identifier or alignment_level." + f"ScoreAnnotation for variant {variant_urn!r} is missing target_gene_identifier or alignment_level." ) + pre_mapped_allele: dict = mapped_score.get("pre_mapped") or {} + post_mapped_allele: dict = mapped_score.get("post_mapped") or {} + annotation_layer = AnnotationLayer.from_wire(score_alignment_level) + assay_level_hgvs = get_hgvs_from_post_mapped(post_mapped_allele, combine_cis=True) + + # dcd-mapping guarantees every mapped score is attributable to a TargetGeneMapping + # via (target_gene_identifier, alignment_level) -- including failed variants, which + # it re-attributes to the target's preferred layer. A miss implies a malformed payload. target_gene_mapping_row = target_gene_mapping_by_key.get((score_target, score_alignment_level)) - mapped_variant = MappedVariant( - pre_mapped=mapped_score.get("pre_mapped", null()), - post_mapped=mapped_score.get("post_mapped", null()), - hgvs_assay_level=get_hgvs_from_post_mapped(mapped_score.get("post_mapped", {})), + if target_gene_mapping_row is None: + raise NonexistentMappingResultsError( + f"ScoreAnnotation for variant {variant_urn!r} has no TargetGeneMapping for " + f"(target={score_target!r}, alignment_level={score_alignment_level!r})." + ) + + mapping_record = MappingRecord( variant_id=variant.id, - modification_date=date.today(), + vrs_digest=pre_mapped_allele.get("id"), + pre_mapped=pre_mapped_allele or None, + assay_level=annotation_layer, + hgvs_assay_level=assay_level_hgvs, mapped_date=mapping_results["mapped_date"], - vrs_version=mapped_score.get("vrs_version", null()), + vrs_version=mapped_score.get("vrs_version", None), mapping_api_version=tool_version, - error_message=mapped_score.get("error_message", null()), - target_gene_mapping_id=target_gene_mapping_row.id if target_gene_mapping_row else None, - alignment_level=AnnotationLayer.from_wire(score_alignment_level), + target_gene_mapping_id=target_gene_mapping_row.id, + alignment_level=annotation_layer, at_mismatched_locus=mapped_score.get("at_mismatched_locus"), near_gap=mapped_score.get("near_gap"), - current=True, ) + if existing_mapped_variant: + # Retire the prior record (cascading to its allele links) and insert this one under a + # single timestamp, so the prior valid_to equals the new valid_from with no gap. + existing_mapped_variant.supersede_with(job_manager.db, mapping_record) + logger.debug( + msg="Superseded prior mapping record and its allele links.", extra=job_manager.logging_context() + ) + else: + job_manager.db.add(mapping_record) + + # MAPPED -> success; benign absences -> skipped; FAILED -> failed. The raw outcome + # is preserved in annotation_metadata so the benign distinction survives. + if outcome is MappingOutcome.MAPPED: + annotation_status = AnnotationStatus.SUCCESS + annotation_failure_category = None + elif outcome.is_benign_absence: + annotation_status = AnnotationStatus.SKIPPED + annotation_failure_category = None + else: + annotation_status = AnnotationStatus.FAILED + annotation_failure_category = AnnotationFailureCategory.EXTERNAL_SERVICE_REJECTED annotation_manager.add_annotation( variant_id=variant.id, # type: ignore annotation_type=AnnotationType.VRS_MAPPING, version=tool_version, - status=AnnotationStatus.SUCCESS if annotation_was_successful else AnnotationStatus.FAILED, - failure_category=None - if annotation_was_successful - else AnnotationFailureCategory.EXTERNAL_SERVICE_REJECTED, + status=annotation_status, + failure_category=annotation_failure_category, annotation_data={ "error_message": mapped_score.get("error_message", null()), "annotation_metadata": { - "mapped_assay_level_hgvs": get_hgvs_from_post_mapped(mapped_score.get("post_mapped", {})), + "outcome": outcome.value, + "mapped_assay_level_hgvs": assay_level_hgvs, }, }, current=True, ) - job_manager.db.add(mapped_variant) + # Only variants with a post-mapped representation yield an authoritative Allele; + # failed and benign-absent variants get a MappingRecord but no linked allele. + if post_mapped_allele: + allele_draft = AlleleDbModel( + vrs_digest=post_mapped_allele["id"], + level=annotation_layer, + hgvs_g=assay_level_hgvs if annotation_layer == AnnotationLayer.genomic else None, + hgvs_c=assay_level_hgvs if annotation_layer == AnnotationLayer.cdna else None, + hgvs_p=assay_level_hgvs if annotation_layer == AnnotationLayer.protein else None, + post_mapped=post_mapped_allele, + ) + authoritative_allele = get_or_create_allele(job_manager.db, allele_draft) + job_manager.db.flush() + + # TODO#765: Mapping is not idempotent, so we must always create a new link. + job_manager.db.add( + MappingRecordAllele( + mapping_record_id=mapping_record.id, + allele_id=authoritative_allele.id, + is_authoritative=True, + ) + ) + logger.debug(msg="Linked mapped variant to authoritative allele.", extra=job_manager.logging_context()) + logger.debug(msg="Added new mapped variant to session.", extra=job_manager.logging_context()) + job_manager.db.flush() annotation_manager.flush() - if successful_mapped_variants == 0: + # Collapse the per-outcome tally into the three buckets the rest of the job reasons about. + mapped_count = outcome_counts[MappingOutcome.MAPPED] + failed_count = outcome_counts[MappingOutcome.FAILED] + skipped_count = sum(n for o, n in outcome_counts.items() if o.is_benign_absence) + + # State keys off genuine failures only: no failures -> complete (benign absences + # don't count); failures with nothing mapped -> failed; otherwise incomplete. + if failed_count == 0: + score_set.mapping_state = MappingState.complete + elif mapped_count == 0: score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "All variants failed to map."} - elif successful_mapped_variants < total_variants: - score_set.mapping_state = MappingState.incomplete else: - score_set.mapping_state = MappingState.complete + score_set.mapping_state = MappingState.incomplete job_manager.save_to_context( { - "successful_mapped_variants": successful_mapped_variants, + "mapped_count": mapped_count, + "failed_count": failed_count, + "skipped_count": skipped_count, "mapping_state": score_set.mapping_state.name, "mapping_errors": score_set.mapping_errors, "inserted_mapped_variants": len(mapped_scores), @@ -397,7 +474,8 @@ async def map_variants_for_score_set(ctx: dict, job_id: int, job_manager: JobMan logger.info(msg="Inserted mapped variants into db.", extra=job_manager.logging_context()) - if successful_mapped_variants == 0: + # Fail the job only on genuine failure with nothing mapped; all-benign is a success. + if mapped_count == 0 and failed_count > 0: logger.error(msg="No variants were successfully mapped.", extra=job_manager.logging_context()) job_manager.db.flush() return JobExecutionOutcome.failed( @@ -405,19 +483,27 @@ async def map_variants_for_score_set(ctx: dict, job_id: int, job_manager: JobMan data={ "score_set_id": score_set.id, "mapped_count": 0, - "unmapped_count": total_variants, + "failed_count": failed_count, + "skipped_count": skipped_count, "total_count": total_variants, }, failure_category=FailureCategory.VRS_MAPPING_FAILED, ) - logger.info(msg="Variant mapping job completed successfully.", extra=job_manager.logging_context()) + logger.info( + msg=( + f"Variant mapping job completed successfully: {mapped_count} mapped, " + f"{failed_count} failed, {skipped_count} skipped (intronic / no protein consequence)." + ), + extra=job_manager.logging_context(), + ) job_manager.db.flush() return JobExecutionOutcome.succeeded( data={ "score_set_id": score_set.id, - "mapped_count": successful_mapped_variants, - "unmapped_count": total_variants - successful_mapped_variants, + "mapped_count": mapped_count, + "failed_count": failed_count, + "skipped_count": skipped_count, "total_count": total_variants, } ) diff --git a/src/mavedb/worker/jobs/variant_processing/reverse_translation.py b/src/mavedb/worker/jobs/variant_processing/reverse_translation.py new file mode 100644 index 000000000..1bf918b0a --- /dev/null +++ b/src/mavedb/worker/jobs/variant_processing/reverse_translation.py @@ -0,0 +1,522 @@ +"""Reverse translation worker job — builds cross-level HGVS equivalence classes. + +For each mapped variant in a score set, calls construct_equivalent_variants from +the variant-annotation library to produce all coding and genomic HGVS candidates +encoding the same protein consequence, plus that protein consequence itself. The +candidates (coding, genomic, and the protein) are written as non-authoritative +Allele rows linked to the existing MappingRecord via MappingRecordAllele. +""" + +import asyncio +import dataclasses +import functools +import logging +from datetime import date +from enum import Enum +from typing import Any, NamedTuple, Sequence + +from ga4gh.vrs.extras.translator import AlleleTranslator +from sqlalchemy import select +from variant_annotation.lib.accessions import looks_like_refseq_protein_accession +from variant_annotation.lib.translation import construct_equivalent_variants +from variant_annotation.lib.translation.types import TranslationConfig, VariantInput, WtCodonMode + +from mavedb.lib.annotation_status_manager import AnnotationStatusManager +from mavedb.lib.hgvs import extract_accession, strip_protein_prediction_parens +from mavedb.lib.types.workflow import JobExecutionOutcome +from mavedb.lib.variant_translations import get_or_create_allele +from mavedb.lib.vrs_utils import translate_hgvs_to_variation +from mavedb.models.allele import Allele as AlleleDbModel +from mavedb.models.enums.annotation_layer import AnnotationLayer +from mavedb.models.enums.annotation_type import AnnotationType +from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus, FailureCategory +from mavedb.models.enums.target_category import TargetCategory +from mavedb.models.mapping_record import MappingRecord +from mavedb.models.mapping_record_allele import MappingRecordAllele +from mavedb.models.score_set import ScoreSet +from mavedb.models.target_gene import TargetGene +from mavedb.models.target_gene_mapping import TargetGeneMapping +from mavedb.models.variant import Variant +from mavedb.worker.jobs.utils.setup import validate_job_params +from mavedb.worker.lib.decorators.pipeline_management import with_pipeline_management +from mavedb.worker.lib.managers.job_manager import JobManager +from mavedb.worker.lib.translation_ports import WorkerCoordinateTranslator, uta_transcript_source + +logger = logging.getLogger(__name__) + +# Job defaults: full coding equivalence class — every synonymous codon, indels included. +# wt_codon_mode "all" requires include_indels=True. +_DEFAULT_TRANSLATION_CONFIG: dict[str, Any] = { + "include_indels": True, + "wt_codon_mode": WtCodonMode.ALL, +} + + +class _TranscriptResolution(NamedTuple): + """Pairs a mapping record with its pre-UTA transcript info: gene_transcript if the mapper + supplied a cdna row, protein_accession if NP_→NM_ resolution is still needed. Both None + means the record will be skipped.""" + + rec: MappingRecord + variant: Variant + gene_transcript: str | None + protein_accession: str | None + target_gene_id: int | None + + +class _TranscriptResolutionSkipReason(Enum): + NO_ASSAY_LEVEL_HGVS = "no_assay_level_hgvs" + TRANSCRIPT_UNRESOLVED = "transcript_unresolved" + NO_CODING_TRANSCRIPT = "no_coding_transcript" + + @classmethod + def classify( + cls, resolution: _TranscriptResolution, category: TargetCategory | None + ) -> tuple["_TranscriptResolutionSkipReason", str]: + """Classify why a record was skipped. + + Protein-coding target with no transcript → recoverable (transcript_unresolved). + Non-coding/regulatory target → correct skip (no_coding_transcript). + """ + if not resolution.rec.hgvs_assay_level: + return ( + cls.NO_ASSAY_LEVEL_HGVS, + "No assay-level HGVS available to reverse-translate.", + ) + if category == TargetCategory.protein_coding: + return ( + cls.TRANSCRIPT_UNRESOLVED, + "Protein-coding target but no coding transcript could be resolved " + "(no cdna TargetGeneMapping and no NP_->NM_ association). Recoverable: " + "re-map or check transcript selection.", + ) + return ( + cls.NO_CODING_TRANSCRIPT, + "Non-coding/regulatory target has no protein consequence to reverse-translate.", + ) + + +def _coding_transcripts_for_proteins(protein_accessions: set[str]) -> dict[str, str]: + """Resolve RefSeq protein accessions (NP_/XP_) to their preferred coding transcripts via UTA. + + Protein-level mappings carry no cdna transcript in TargetGeneMapping, so reverse + translation falls back to the NP_→NM_ association in UTA. Connection is opened and + closed per-call to avoid leaking connections across long-lived worker jobs. + """ + if not protein_accessions: + return {} + + with uta_transcript_source() as client: + return { + pro_ac: transcript + for pro_ac in sorted(protein_accessions) + if (transcript := client.transcript_for_protein(pro_ac)) is not None + } + + +def _build_translation_config(overrides: dict[str, Any] | None) -> TranslationConfig: + """Build a TranslationConfig from optional job-param overrides merged with job defaults. + + Overrides win over _DEFAULT_TRANSLATION_CONFIG; wt_codon_mode accepts string values and + is coerced to the enum. Raises ValueError for unknown fields, invalid modes, or invalid + combinations (e.g. wt_codon_mode != "none" without include_indels). + """ + config_kwargs: dict[str, Any] = {**_DEFAULT_TRANSLATION_CONFIG, **(overrides or {})} + + allowed_fields = {field.name for field in dataclasses.fields(TranslationConfig)} + unknown_fields = set(config_kwargs) - allowed_fields + if unknown_fields: + raise ValueError( + f"Unknown translation_config option(s): {', '.join(sorted(unknown_fields))}. " + f"Valid options: {', '.join(sorted(allowed_fields))}." + ) + + raw_mode = config_kwargs.get("wt_codon_mode") + if raw_mode is not None: + try: + config_kwargs["wt_codon_mode"] = WtCodonMode(raw_mode) + except ValueError: + valid_modes = ", ".join(repr(mode.value) for mode in WtCodonMode) + raise ValueError( + f"Invalid translation_config wt_codon_mode {raw_mode!r}. Valid values: {valid_modes}." + ) from None + + try: + return TranslationConfig(**config_kwargs) + except ValueError as exc: + raise ValueError(f"Invalid translation_config: {exc}") from exc + + +@with_pipeline_management +async def reverse_translate_variants_for_score_set( + ctx: dict, job_id: int, job_manager: JobManager +) -> JobExecutionOutcome: + """Build the cross-level HGVS equivalence class for every mapped variant in the score set. + + For each current MappingRecord with an hgvs_assay_level, collapses to a ProteinConsequence + and expands to all coding/genomic HGVS candidates via the variant-annotation library. + Each candidate is written as a non-authoritative Allele linked to the MappingRecord. + + job_params: score_set_id (int), correlation_id (str), + translation_config (dict, optional) — TranslationConfig overrides. + """ + job = job_manager.get_job() + validate_job_params(["score_set_id", "correlation_id"], job) + + score_set_id: int = job.job_params["score_set_id"] # type: ignore[index] + correlation_id: str = job.job_params["correlation_id"] # type: ignore[index] + translation_config = _build_translation_config(job.job_params.get("translation_config")) # type: ignore[union-attr] + score_set = job_manager.db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one() + + job_manager.save_to_context( + { + "application": "mavedb-worker", + "function": "reverse_translate_variants_for_score_set", + "resource": score_set.urn, + "correlation_id": correlation_id, + } + ) + job_manager.update_progress(0, 100, "Starting reverse translation job.") + logger.info(msg="Started reverse translation job.", extra=job_manager.logging_context()) + + # Build {(target_gene_id, run date) -> NM_ transcript} from cdna TargetGeneMappings. + # TargetGeneMapping rows accumulate across re-maps (retired records still FK them), so we + # key by (target_gene_id, mapped_date) to bind each record to its own run's cdna row. + # All TargetGeneMappings in one job share a mapped_date; within a key, highest id wins. + # + # TODO#763: mapped_date is day-granular, so two re-maps on the same calendar day collide. + # A later same-day run that emits no cdna row can still bind the earlier run's stale + # transcript. Only a versioned run anchor closes this fully. + cdna_transcript_by_run: dict[tuple[int, date | None], str | None] = { + (target_gene_id, mapped_date): reference_accession + for target_gene_id, mapped_date, reference_accession in job_manager.db.execute( + select( + TargetGeneMapping.target_gene_id, + TargetGeneMapping.mapped_date, + TargetGeneMapping.reference_accession, + ) + .join(TargetGene, TargetGene.id == TargetGeneMapping.target_gene_id) + .where(TargetGene.score_set_id == score_set_id) + .where(TargetGeneMapping.alignment_level == AnnotationLayer.cdna) + .where(TargetGeneMapping.reference_accession.isnot(None)) + .order_by(TargetGeneMapping.id) + ) + .tuples() + .all() + } + + # Load current authoritative MappingRecords with their Variant and TargetGeneMapping. + # mapped_date from the joined (genomic) TargetGeneMapping anchors the cdna transcript lookup. + rows: Sequence[tuple[MappingRecord, Variant, int, date | None]] = ( + job_manager.db.execute( + select( + MappingRecord, + Variant, + TargetGeneMapping.target_gene_id, + TargetGeneMapping.mapped_date, + ) + .join(MappingRecordAllele, MappingRecord.id == MappingRecordAllele.mapping_record_id) + .join(Variant, MappingRecord.variant_id == Variant.id) + .outerjoin(TargetGeneMapping, MappingRecord.target_gene_mapping_id == TargetGeneMapping.id) + .where(Variant.score_set_id == score_set_id) + .where(MappingRecord.current) + .where(MappingRecordAllele.is_authoritative.is_(True)) + .where(MappingRecordAllele.current) + ) + .tuples() + .all() + ) + + if not rows: + logger.warning( + msg="No current and authoritative mapping records found for this score set.", + extra=job_manager.logging_context(), + ) + job_manager.db.flush() + return JobExecutionOutcome.succeeded(data={"translated": 0, "failed": 0, "skipped": 0, "alleles_created": 0}) + + # Genomic/cdna records resolve via the cdna TargetGeneMapping; protein records have no + # cdna reference_accession, so collect their NP_ accessions for a batched NP_→NM_ UTA lookup. + transcript_resolutions: list[_TranscriptResolution] = [] + protein_accessions: set[str] = set() + for rec, variant, target_gene_id, mapped_date in rows: + coding_accession = cdna_transcript_by_run.get((target_gene_id, mapped_date)) + protein_accession = None + if not coding_accession and rec.hgvs_assay_level is not None: + raw_accession = extract_accession(rec.hgvs_assay_level) + if looks_like_refseq_protein_accession(raw_accession): + protein_accession = raw_accession + protein_accessions.add(raw_accession) + + transcript_resolutions.append( + _TranscriptResolution(rec, variant, coding_accession, protein_accession, target_gene_id) + ) + + transcript_by_protein = _coding_transcripts_for_proteins(protein_accessions) + + # Target category per gene — used to classify skips as recoverable (coding target, + # transcript unresolved) vs. correct (non-coding/regulatory, no protein consequence). + target_category_by_gene: dict[int, TargetCategory] = dict( + job_manager.db.execute( + select(TargetGene.id, TargetGene.category).where(TargetGene.score_set_id == score_set_id) + ) + .tuples() + .all() + ) + + # Build VariantInputs with the resolved coding transcript (p./c./g. all collapse to a + # ProteinConsequence on that transcript). Object identity is preserved so TranslationResult + # can be correlated back to its MappingRecord. Records with no transcript are skipped. + variant_inputs: list[Any] = [] + variant_input_map: dict[int, tuple[MappingRecord, Variant]] = {} + skipped_variants: list[_TranscriptResolution] = [] + for p in transcript_resolutions: + transcript = p.gene_transcript or ( + transcript_by_protein.get(p.protein_accession) if p.protein_accession else None + ) + if not transcript or not p.rec.hgvs_assay_level: + skipped_variants.append(p) + continue + + inp = VariantInput(hgvs=p.rec.hgvs_assay_level, transcript=transcript) + variant_inputs.append(inp) + variant_input_map[id(inp)] = (p.rec, p.variant) + + total = len(variant_inputs) + job_manager.save_to_context({"total_variants": total, "skipped_variants": len(skipped_variants)}) + job_manager.update_progress( + 10, + 100, + f"Prepared {total} variants for reverse translation ({len(skipped_variants)} skipped, no coding transcript).", + ) + logger.info( + msg=f"Running reverse translation for {total} variants ({len(skipped_variants)} skipped).", + extra=job_manager.logging_context(), + ) + + # construct_equivalent_variants is I/O-bound (blocks on a subprocess); run in the + # default thread pool rather than the process pool to avoid pickling the port objects. + coordinates = WorkerCoordinateTranslator(ctx["hdp"]) + loop = asyncio.get_running_loop() + job_manager.update_progress(20, 100, "Running reverse translation subprocess.") + # WtCodonMode.ALL reads the reference codon via TranscriptSource.codon_at, so pass a + # live UTA-backed source. codon_at is only touched in the post-subprocess WT-codon step, + # but the connection must outlive the executor call, so scope it around the whole call. + # (transcript_for_protein is redundant here -- the job already resolved every transcript + # and supplies it via VariantInput.transcript.) + with uta_transcript_source() as transcripts: + results, errors = await loop.run_in_executor( + None, + functools.partial( + construct_equivalent_variants, + variant_inputs, + transcripts=transcripts, + coordinates=coordinates, + config=translation_config, + ), + ) + + job_manager.update_progress(70, 100, "Writing translated alleles to database.") + logger.info( + msg=f"Translation complete: {len(results)} succeeded, {len(errors)} failed.", + extra=job_manager.logging_context(), + ) + + translated = 0 + failed = 0 + alleles_created = 0 + annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job_manager.job_id) + allele_translator = AlleleTranslator(ctx["seqrepo"]) + + current_record_ids = ( + select(MappingRecord.id) + .join(Variant, MappingRecord.variant_id == Variant.id) + .where(Variant.score_set_id == score_set_id) + .where(MappingRecord.current.is_(True)) + ) + + # Defer linkage until all candidates are processed so the prior derived links can be + # superseded atomically — retire and insert share a timestamp with no gap. + new_links: list[MappingRecordAllele] = [] + + # The live authoritative (record, allele) pairs. A derived candidate that equals a record's + # authoritative allele is not linked again. + authoritative_pairs: set[tuple[int, int]] = { + (record_id, allele_id) + for record_id, allele_id in job_manager.db.execute( + select(MappingRecordAllele.mapping_record_id, MappingRecordAllele.allele_id) + .where(MappingRecordAllele.is_authoritative.is_(True)) + .where(MappingRecordAllele.current) + .where(MappingRecordAllele.mapping_record_id.in_(current_record_ids)) + ) + .tuples() + .all() + if record_id is not None and allele_id is not None + } + + for result in results: + rec, variant = variant_input_map[id(result.input)] + candidate_count = 0 + + # Equivalence generation may surface the same VRS object more than once. + # Dedup by vrs_digest per mapping record to avoid duplicate links. + seen_digests: set[str] = set() + failed_candidates: list[dict[str, str]] = [] + candidates: list[tuple[str, AnnotationLayer, str]] = [ + (hgvs_g, AnnotationLayer.genomic, "hgvs_g") for hgvs_g in result.hgvs_g_candidates + ] + [(hgvs_c, AnnotationLayer.cdna, "hgvs_c") for hgvs_c in result.hgvs_c_candidates] + + # Emit the protein consequence as the protein-level member of the equivalence set. + # Prediction parens (p.(Ala222Val)) are stripped before translation and storage. + # None for protein-assay inputs where the protein is already the authoritative allele. + if result.hgvs_p: + candidates.append((strip_protein_prediction_parens(result.hgvs_p), AnnotationLayer.protein, "hgvs_p")) + + for hgvs, level, hgvs_field in candidates: + # A candidate may not be translatable (intronic projection, malformed expression). + # Track per-candidate failures; a variant fails only if *all* candidates fail. + try: + variation = translate_hgvs_to_variation(hgvs, allele_translator) + except Exception as e: + logger.warning( + msg=f"Failed to translate candidate HGVS to VRS: {hgvs} ({e})", + extra=job_manager.logging_context(), + ) + failed_candidates.append({"hgvs": hgvs, "level": level.value, "error": str(e)}) + continue + + if variation.id in seen_digests: + continue + + seen_digests.add(variation.id) + draft_allele = AlleleDbModel( + vrs_digest=variation.id, + post_mapped=variation.model_dump(), + level=level, + **{hgvs_field: hgvs}, # type: ignore[arg-type] + ) + allele = get_or_create_allele(job_manager.db, draft_allele) + job_manager.db.flush() + + if (rec.id, allele.id) in authoritative_pairs: + continue # already linked + + new_links.append( + MappingRecordAllele( + mapping_record_id=rec.id, + allele_id=allele.id, + is_authoritative=False, + ) + ) + candidate_count += 1 + + alleles_created += candidate_count + annotation_metadata = { + "hgvs_input": result.input.hgvs, + "hgvs_c_candidates": result.hgvs_c_candidates, + "hgvs_g_candidates": result.hgvs_g_candidates, + "alleles_created": candidate_count, + "failed_candidates": failed_candidates, + } + + # No translatable candidates and failures mean the variant failed reverse translation. No + # failures and no candidates is a success with no alleles created. + if candidate_count == 0 and failed_candidates: + failed += 1 + annotation_manager.add_annotation( + variant_id=variant.id, + annotation_type=AnnotationType.CROSS_LEVEL_TRANSLATION, + status=AnnotationStatus.FAILED, + failure_category=AnnotationFailureCategory.UNKNOWN, + annotation_data={ + "error_message": "All candidate HGVS failed VRS translation.", + "annotation_metadata": annotation_metadata, + }, + ) + else: + translated += 1 + annotation_manager.add_annotation( + variant_id=variant.id, + annotation_type=AnnotationType.CROSS_LEVEL_TRANSLATION, + status=AnnotationStatus.SUCCESS, + annotation_data={"annotation_metadata": annotation_metadata}, + ) + + # Supersede prior live derived links atomically. + # TODO#765: re-runs retire and recreate the whole derived set because re-mapping re-mints + # records; idempotent records would allow unchanged links to stay live. + MappingRecordAllele.supersede_live_where( + job_manager.db, + new_links, + MappingRecordAllele.is_authoritative.is_(False), + MappingRecordAllele.mapping_record_id.in_(current_record_ids), + ) + + # TODO#767: non-substitution consequences (del/ins/delins/fs/ext) have no synonymous + # equivalence class, so they arrive here as TranslationErrors and are miscounted as + # FAILED. Classify by the protein consequence's edit type up front and map these to + # SKIPPED instead of pattern-matching engine error strings. + for error in errors: + _rec, variant = variant_input_map[id(error.input)] + failed += 1 + annotation_manager.add_annotation( + variant_id=variant.id, + annotation_type=AnnotationType.CROSS_LEVEL_TRANSLATION, + status=AnnotationStatus.FAILED, + failure_category=AnnotationFailureCategory.UNKNOWN, + annotation_data={ + "error_message": error.error, + "annotation_metadata": {"hgvs_input": error.input.hgvs}, + }, + ) + + skipped = len(skipped_variants) + for p in skipped_variants: + category = target_category_by_gene.get(p.target_gene_id) if p.target_gene_id is not None else None + skip_category, reason = _TranscriptResolutionSkipReason.classify(p, category) + annotation_manager.add_annotation( + variant_id=p.variant.id, + annotation_type=AnnotationType.CROSS_LEVEL_TRANSLATION, + status=AnnotationStatus.SKIPPED, + annotation_data={ + "annotation_metadata": { + "hgvs_input": p.rec.hgvs_assay_level, + "skip_category": skip_category.value, + "reason": reason, + } + }, + ) + + annotation_manager.flush() + + job_manager.save_to_context( + { + "translated": translated, + "failed": failed, + "skipped": skipped, + "alleles_created": alleles_created, + } + ) + logger.info( + msg=( + f"Reverse translation complete: {translated} translated, {failed} failed, " + f"{skipped} skipped, {alleles_created} alleles created." + ), + extra=job_manager.logging_context(), + ) + job_manager.db.flush() + + if translated == 0 and failed > 0: + logger.error( + msg="All variant reverse translations failed.", + extra=job_manager.logging_context(), + ) + return JobExecutionOutcome.failed( + reason="All variant reverse translations failed.", + data={"translated": 0, "failed": failed, "skipped": skipped, "alleles_created": 0}, + failure_category=FailureCategory.DATA_ERROR, + ) + + return JobExecutionOutcome.succeeded( + data={"translated": translated, "failed": failed, "skipped": skipped, "alleles_created": alleles_created} + ) diff --git a/src/mavedb/worker/lib/translation_ports.py b/src/mavedb/worker/lib/translation_ports.py new file mode 100644 index 000000000..b715aace4 --- /dev/null +++ b/src/mavedb/worker/lib/translation_ports.py @@ -0,0 +1,68 @@ +"""Worker-side adapters for the variant_annotation translation ports. + +construct_equivalent_variants depends on two protocols — CoordinateTranslator +and TranscriptSource. This module supplies the worker's CoordinateTranslator +(WorkerCoordinateTranslator) and a factory (uta_transcript_source) for the +TranscriptSource, which is variant_annotation's own UTA-backed UtaClient. + +WorkerCoordinateTranslator defers AssemblyMapper initialization until the first +call — the hgvs library makes network calls on mapper construction that must not +fire in unit-test contexts where construct_equivalent_variants is mocked. +""" + +from __future__ import annotations + +import contextlib +import os +from collections.abc import Generator +from typing import Any + +from variant_annotation.lib.clients.uta import UtaClient, connect_uta + + +@contextlib.contextmanager +def uta_transcript_source() -> Generator[UtaClient]: + """Yield a UTA-backed TranscriptSource over a connection scoped to the block. + + Backs both the NP_→NM_ association lookup and the WT-codon read used by + WtCodonMode.ALL (TranscriptSource.codon_at). The connection is closed on exit, + so callers must use the client within the ``with`` block. + """ + uta_db_url = (os.environ.get("UTA_DB_URL") or "").strip() + if not uta_db_url: + raise RuntimeError("UTA_DB_URL must be set to resolve transcript facts (NP_→NM_ associations and WT codons).") + with contextlib.closing(connect_uta(uta_db_url)) as conn: + yield UtaClient(conn) + + +class WorkerCoordinateTranslator: + """CoordinateTranslator backed by the worker's HGVS data provider.""" + + def __init__(self, hdp: Any) -> None: + self._hdp = hdp + self._parser: Any = None + self._mapper: Any = None + + def _ensure_initialized(self) -> None: + if self._mapper is None: + import hgvs.assemblymapper + import hgvs.parser + + self._parser = hgvs.parser.Parser() + self._mapper = hgvs.assemblymapper.AssemblyMapper( + self._hdp, + assembly_name="GRCh38", + alt_aln_method="splign", + ) + + def c_to_p(self, c_hgvs: str) -> str: + self._ensure_initialized() + return str(self._mapper.c_to_p(self._parser.parse(c_hgvs))) + + def g_to_c(self, g_hgvs: str, transcript: str) -> str: + self._ensure_initialized() + return str(self._mapper.g_to_t(self._parser.parse(g_hgvs), transcript)) + + def c_to_g(self, c_hgvs: str) -> str: + self._ensure_initialized() + return str(self._mapper.c_to_g(self._parser.parse(c_hgvs))) diff --git a/src/mavedb/worker/settings/lifecycle.py b/src/mavedb/worker/settings/lifecycle.py index 54a0b4c76..5be66bd2d 100644 --- a/src/mavedb/worker/settings/lifecycle.py +++ b/src/mavedb/worker/settings/lifecycle.py @@ -9,7 +9,7 @@ from concurrent import futures -from mavedb.data_providers.services import cdot_rest +from mavedb.data_providers.services import cdot_rest, seqrepo_data_proxy def standalone_ctx(): @@ -18,6 +18,7 @@ def standalone_ctx(): ctx["pool"] = futures.ProcessPoolExecutor() ctx["redis"] = None # Redis connection can be set up here if needed. ctx["hdp"] = cdot_rest() + ctx["seqrepo"] = seqrepo_data_proxy() ctx["state"] = {} # Additional context setup can be added here as needed. @@ -37,6 +38,7 @@ async def shutdown(ctx): async def on_job_start(ctx): ctx["hdp"] = cdot_rest() + ctx["seqrepo"] = seqrepo_data_proxy() ctx["state"] = {} diff --git a/tests/db/test_mixins.py b/tests/db/test_mixins.py new file mode 100644 index 000000000..ad0333a5d --- /dev/null +++ b/tests/db/test_mixins.py @@ -0,0 +1,289 @@ +"""Unit tests for the ValidTime mixin. + +Tested against minimal, purpose-built models rather than any real MaveDB models. ``_VtParent`` is +a cascade-bearing entity (natural key ``key``); ``_VtChild`` is a leaf link (natural key +``(parent_id, tag)``). Both carry a partial unique index over their natural key +``WHERE valid_to IS NULL``, mirroring the contract real consumers must honor. + +Many checks pass an explicit ``at`` that differs from the DB clock: a single-transaction test cannot +otherwise distinguish a deliberate shared timestamp from the coincidental same-transaction handoff +the gap bug relied on, so the explicit ``at`` is what actually proves the timestamp is threaded. +""" + +from datetime import datetime, timedelta, timezone + +import pytest +from sqlalchemy import Column, ForeignKey, Index, Integer, select, text +from sqlalchemy.exc import IntegrityError +from sqlalchemy.orm import declarative_base, relationship + +from mavedb.db.mixins import ValidTime + +VtBase = declarative_base() + +# Fixed, deterministic windows — far from the test's transaction clock so an accidental +# server_default (func.now()) stamp would be visibly wrong rather than coincidentally equal. +T0 = datetime(2020, 1, 1, tzinfo=timezone.utc) +T1 = datetime(2021, 1, 1, tzinfo=timezone.utc) +T2 = datetime(2022, 1, 1, tzinfo=timezone.utc) + + +class _VtParent(ValidTime, VtBase): + __tablename__ = "vt_parents" + + id = Column(Integer, primary_key=True) + key = Column(Integer, nullable=False) + children = relationship("_VtChild", back_populates="parent") + + __retire_cascade__ = ("children",) + __table_args__ = (Index("uq_vt_parents_live", "key", unique=True, postgresql_where=text("valid_to IS NULL")),) + + +class _VtChild(ValidTime, VtBase): + __tablename__ = "vt_children" + + id = Column(Integer, primary_key=True) + parent_id = Column(Integer, ForeignKey("vt_parents.id"), nullable=False) + tag = Column(Integer, nullable=False) + parent = relationship("_VtParent", back_populates="children") + + __table_args__ = ( + Index("uq_vt_children_live", "parent_id", "tag", unique=True, postgresql_where=text("valid_to IS NULL")), + ) + + +@pytest.fixture(autouse=True) +def vt_tables(session): + """Create the throwaway ValidTime tables on the test engine, drop them after. Autouse so every + test in the module gets the tables without naming the fixture in its signature.""" + engine = session.get_bind() + VtBase.metadata.create_all(bind=engine) + yield + # Release any locks the test's still-open transaction holds before dropping: a post-commit + # attribute access reopens a transaction holding ACCESS SHARE on these tables, and DROP TABLE + # would block on it forever (this fixture tears down while the session fixture is still open). + session.rollback() + VtBase.metadata.drop_all(bind=engine) + + +def _parent(session, key, *, valid_from=None): + p = _VtParent(key=key, valid_from=valid_from) if valid_from else _VtParent(key=key) + session.add(p) + session.commit() + return p + + +def _child(session, parent, tag, *, valid_from=None): + c = ( + _VtChild(parent_id=parent.id, tag=tag, valid_from=valid_from) + if valid_from + else _VtChild(parent_id=parent.id, tag=tag) + ) + session.add(c) + session.commit() + return c + + +class TestCurrentAndAsOf: + def test_fresh_row_is_current(self, session): + p = _parent(session, 1) + assert p.valid_to is None + assert p.current is True + + def test_retired_row_is_not_current(self, session): + p = _parent(session, 1) + p.retire(session, at=T1) + assert p.current is False + + def test_current_expression_filters_to_live_rows(self, session): + live = _parent(session, 1) + retired = _parent(session, 2) + retired.retire(session, at=T1) + session.commit() + + rows = session.scalars(select(_VtParent).where(_VtParent.current)).all() + assert [r.id for r in rows] == [live.id] + + def test_as_of_selects_rows_live_at_timestamp(self, session): + # Window [T0, T2): explicit valid_from and a retire at T2. + p = _parent(session, 1, valid_from=T0) + p.retire(session, at=T2) + session.commit() + + def live_ids(ts): + return [r.id for r in session.scalars(select(_VtParent).where(_VtParent.as_of(ts))).all()] + + assert live_ids(T0 - timedelta(days=1)) == [] # before the window opens + assert live_ids(T0) == [p.id] # at valid_from (inclusive) + assert live_ids(T1) == [p.id] # inside the window + assert live_ids(T2) == [] # at valid_to (exclusive) + assert live_ids(T2 + timedelta(days=1)) == [] # after the window closes + + +class TestRetire: + def test_retire_closes_valid_to(self, session): + c = _child(session, _parent(session, 1), 1) + c.retire(at=T1) + assert c.valid_to == T1 + + def test_retire_is_idempotent(self, session): + c = _child(session, _parent(session, 1), 1) + c.retire(at=T1) + c.retire(at=T2) # already closed — keeps the original valid_to + assert c.valid_to == T1 + + def test_leaf_retire_needs_no_session(self, session): + c = _child(session, _parent(session, 1), 1) + c.retire() # no cascade declared, so no session required + session.commit() + assert c.valid_to is not None + + def test_retire_cascades_to_live_child_links(self, session): + p = _parent(session, 1) + c1 = _child(session, p, 1) + c2 = _child(session, p, 2) + + p.retire(session, at=T1) + session.commit() + + assert p.valid_to == T1 + assert c1.valid_to == T1 # cascade closes the children at the same instant + assert c2.valid_to == T1 + + def test_cascade_retire_without_session_raises(self, session): + p = _parent(session, 1) + with pytest.raises(ValueError, match="cascade-retire"): + p.retire() # cascade declared but no session to issue the child UPDATE + + def test_cascade_runs_even_when_row_already_closed(self, session): + # A half-finished prior retire (parent closed, child not) must still converge. + p = _parent(session, 1) + c = _child(session, p, 1) + p.valid_to = T0 # simulate parent closed without its child + session.commit() + + p.retire(session, at=T1) # idempotent on the parent, but still cascades + session.commit() + assert p.valid_to == T0 # unchanged (idempotent) + assert c.valid_to == T1 # child still retired + + +class TestRetireLiveWhere: + def test_retires_matching_live_rows(self, session): + p = _parent(session, 1) + c1 = _child(session, p, 1) + c2 = _child(session, p, 2) + + session.execute(_VtChild.retire_live_where(_VtChild.parent_id == p.id, at=T1)) + session.commit() + + assert c1.valid_to == T1 + assert c2.valid_to == T1 + + def test_leaves_already_retired_rows_untouched(self, session): + p = _parent(session, 1) + already = _child(session, p, 1) + already.retire(at=T0) + session.commit() + live = _child(session, p, 2) + + session.execute(_VtChild.retire_live_where(_VtChild.parent_id == p.id, at=T1)) + session.commit() + + assert already.valid_to == T0 # retired row keeps its original valid_to + assert live.valid_to == T1 + + +class TestSupersedeWith: + def test_gap_free_handoff_with_explicit_at(self, session): + old = _parent(session, 1, valid_from=T0) + new = _VtParent(key=1) + + old.supersede_with(session, new, at=T1) + session.commit() + + # The retired valid_to equals the successor's valid_from exactly — no point-in-time hole. + assert old.valid_to == T1 + assert new.valid_from == T1 + + def test_default_at_handoff_is_gap_free(self, session): + old = _parent(session, 1) + new = _VtParent(key=1) + + old.supersede_with(session, new) # at defaults to a single captured func.now() + session.commit() + + assert old.valid_to is not None + assert old.valid_to == new.valid_from + + def test_cascades_child_links_at_the_handoff_timestamp(self, session): + old = _parent(session, 1) + child = _child(session, old, 1) + new = _VtParent(key=1) + + old.supersede_with(session, new, at=T1) + session.commit() + + assert old.valid_to == T1 + assert new.valid_from == T1 + assert child.valid_to == T1 # the old parent's child link retired under the same timestamp + + def test_respects_partial_unique_index_via_flush_ordering(self, session): + # old and new share natural key=1; supersede flushes the retire before inserting the + # successor, so the two are never simultaneously live and the unique index does not trip. + old = _parent(session, 1) + new = _VtParent(key=1) + + old.supersede_with(session, new, at=T1) + session.commit() # would raise IntegrityError if both were live at the INSERT + + live = session.scalars(select(_VtParent).where(_VtParent.key == 1, _VtParent.current)).all() + assert [r.id for r in live] == [new.id] + + +class TestSupersedeLiveWhere: + def test_stamps_explicit_valid_from_on_inserts(self, session): + # Without flushing, valid_from would be None under the server_default path; supersede sets it + # explicitly, which is what keeps the handoff gap-free across a later-transaction flush. + new = _VtChild(parent_id=1, tag=1) + _VtChild.supersede_live_where(session, [new], _VtChild.parent_id == 1, at=T1) + assert new.valid_from == T1 + + def test_gap_free_handoff_with_reused_natural_key(self, session): + p = _parent(session, 1) + old = _child(session, p, 1) + new = _VtChild(parent_id=p.id, tag=1) # same natural key (parent_id, tag) + + _VtChild.supersede_live_where(session, [new], _VtChild.parent_id == p.id, at=T1) + session.commit() # reused key: retire executes before the insert, so the index holds + + assert old.valid_to == T1 + assert new.valid_from == T1 + live = session.scalars(select(_VtChild).where(_VtChild.parent_id == p.id, _VtChild.current)).all() + assert [r.id for r in live] == [new.id] + + def test_empty_new_rows_retires_the_scope(self, session): + # A re-run that produces nothing should still withdraw the prior live set. + p = _parent(session, 1) + old = _child(session, p, 1) + + _VtChild.supersede_live_where(session, [], _VtChild.parent_id == p.id, at=T1) + session.commit() + + assert old.valid_to == T1 + assert session.scalars(select(_VtChild).where(_VtChild.current)).all() == [] + + def test_refuses_a_cascade_bearing_class(self, session): + # Bulk supersede cannot fire __retire_cascade__, so it must refuse rather than orphan links. + with pytest.raises(ValueError, match="__retire_cascade__"): + _VtParent.supersede_live_where(session, [], _VtParent.key == 1) + + +class TestPartialUniqueIndexBackstop: + def test_two_live_rows_for_one_key_raise(self, session): + # The DB index is the loud backstop: forgetting to retire a predecessor before inserting its + # successor fails at flush instead of silently leaving two live rows. + session.add(_VtParent(key=1)) + session.add(_VtParent(key=1)) + with pytest.raises(IntegrityError): + session.commit() diff --git a/tests/helpers/util/setup/worker.py b/tests/helpers/util/setup/worker.py index cb6949b4e..252471ca8 100644 --- a/tests/helpers/util/setup/worker.py +++ b/tests/helpers/util/setup/worker.py @@ -4,13 +4,13 @@ from sqlalchemy import select +from mavedb.models.enums.job_pipeline import JobStatus from mavedb.models.score_set import ScoreSet as ScoreSetDbModel from mavedb.models.variant import Variant from mavedb.worker.jobs import ( create_variants_for_score_set, map_variants_for_score_set, ) -from mavedb.models.enums.job_pipeline import JobStatus from mavedb.worker.lib.managers.job_manager import JobManager from tests.helpers.constants import ( TEST_CODING_LAYER, @@ -23,6 +23,16 @@ ) +# Placeholder reference accessions for qualifying target-relative HGVS in mock +# post-mapped output, keyed by the HGVS kind prefix (``c.``/``n.``/``g.``/``p.``). +_PLACEHOLDER_ACCESSIONS = { + "c.": "NM_999999.1", + "n.": "NR_999999.1", + "g.": "NC_999999.1", + "p.": "NP_999999.1", +} + + async def create_variants_in_score_set( session, mock_s3_client, score_df, count_df, mock_worker_ctx, variant_creation_run ): @@ -128,6 +138,10 @@ async def construct_mock_mapping_output( "target_gene_identifier": target.name, "alignment_level": layer, "preferred": idx == 0, + # The accession this level was aligned against — a transcript (NM_) + # only at the coding level. Reverse translation resolves its coding + # transcript hint from the cdna entry's reference_accession. + "reference_accession": _PLACEHOLDER_ACCESSIONS[f"{layer}."], "tool_name": "dcd-mapping", "tool_version": "pytest.0.0", "tool_parameters": {}, @@ -155,15 +169,26 @@ async def construct_mock_mapping_output( "alignment_level": default_alignment_level, } - # Don't alter HGVS strings in post mapped output. This makes it considerably - # easier to assert correctness in tests. + # Reuse the variant's own HGVS in post-mapped output to keep assertions simple. + # Real mapper post-mapped HGVS is always accession-qualified (it is mapped onto + # a reference sequence), so prefix a placeholder accession when the source HGVS + # is target-relative (a bare ``c.``/``n.``/``g.``/``p.`` with no accession). if with_post_mapped: - mapped_score["post_mapped"]["expressions"][0]["value"] = variant.hgvs_nt or variant.hgvs_pro + hgvs = variant.hgvs_nt or variant.hgvs_pro + if hgvs and ":" not in hgvs: + hgvs = f"{_PLACEHOLDER_ACCESSIONS.get(hgvs[:2], 'NM_999999.1')}:{hgvs}" + mapped_score["post_mapped"]["expressions"][0]["value"] = hgvs # Skip every other variant if not with_all_variants if not with_all_variants and idx % 2 == 0: mapped_score["post_mapped"] = {} + # Mirror the mapper's per-record outcome: both alleles present -> MAPPED, else + # FAILED. Tests needing benign outcomes set ``outcome`` explicitly afterward. + mapped_score["outcome"] = ( + "mapped" if mapped_score["pre_mapped"] and mapped_score["post_mapped"] else "failed" + ) + mapping_output["mapped_scores"].append(mapped_score) if not mapping_output["mapped_scores"]: diff --git a/tests/lib/test_hgvs.py b/tests/lib/test_hgvs.py new file mode 100644 index 000000000..45793034a --- /dev/null +++ b/tests/lib/test_hgvs.py @@ -0,0 +1,96 @@ +import pytest + +from mavedb.lib.hgvs import ( + extract_accession, + join_cis_phased_hgvs, + split_cis_phased_hgvs, + strip_protein_prediction_parens, +) + + +@pytest.mark.parametrize( + ("hgvs", "expected"), + [ + ("NP_000456.2:p.(Ala222Val)", "NP_000456.2:p.Ala222Val"), + ("NP_000456.2:p.(Tyr745Ter)", "NP_000456.2:p.Tyr745Ter"), + ("NP_000456.2:p.(Ala222_Val225del)", "NP_000456.2:p.Ala222_Val225del"), + # no prediction parens -> unchanged + ("NP_000456.2:p.Ala222Val", "NP_000456.2:p.Ala222Val"), + ("NM_000001.1:c.5A>G", "NM_000001.1:c.5A>G"), + ], +) +def test_strip_protein_prediction_parens(hgvs, expected): + assert strip_protein_prediction_parens(hgvs) == expected + + +@pytest.mark.parametrize( + ("hgvs", "expected"), + [ + ("NC_000001.11:g.1000A>G", "NC_000001.11"), + ("NM_000001.1:c.5A>G", "NM_000001.1"), + ("NC_000001.11:g.[1000A>G;1002T>C]", "NC_000001.11"), + ("no-colon-here", ""), + ("", ""), + ], +) +def test_extract_accession(hgvs, expected): + assert extract_accession(hgvs) == expected + + +def test_split_cis_phased_hgvs_passes_through_single_variant(): + assert split_cis_phased_hgvs("NC_000001.11:g.1000A>G") == ["NC_000001.11:g.1000A>G"] + + +def test_split_cis_phased_hgvs_splits_bracketed_genomic_expression(): + # Non-adjacent codon components are emitted as one bracketed genomic expression; each + # component must regain the accession and prefix to be VRS-translatable on its own. + assert split_cis_phased_hgvs("NC_000001.11:g.[1000A>G;1002T>C]") == [ + "NC_000001.11:g.1000A>G", + "NC_000001.11:g.1002T>C", + ] + + +def test_split_cis_phased_hgvs_handles_three_components_and_coding_prefix(): + assert split_cis_phased_hgvs("NM_000001.1:c.[1A>G;3T>C;5G>A]") == [ + "NM_000001.1:c.1A>G", + "NM_000001.1:c.3T>C", + "NM_000001.1:c.5G>A", + ] + + +def test_join_cis_phased_hgvs_passes_through_single_component(): + assert join_cis_phased_hgvs(["NC_000001.11:g.1000A>G"]) == "NC_000001.11:g.1000A>G" + + +def test_join_cis_phased_hgvs_combines_genomic_components(): + assert ( + join_cis_phased_hgvs(["NC_000001.11:g.1000A>G", "NC_000001.11:g.1002T>C"]) == "NC_000001.11:g.[1000A>G;1002T>C]" + ) + + +@pytest.mark.parametrize( + "expression", + [ + "NC_000001.11:g.[1000A>G;1002T>C]", + "NM_000001.1:c.[1A>G;3T>C;5G>A]", + ], +) +def test_join_is_inverse_of_split(expression): + assert join_cis_phased_hgvs(split_cis_phased_hgvs(expression)) == expression + + +def test_join_cis_phased_hgvs_returns_none_for_empty(): + assert join_cis_phased_hgvs([]) is None + + +def test_join_cis_phased_hgvs_returns_none_for_mixed_accessions(): + # Members on different sequences are not one cis-phased block. + assert join_cis_phased_hgvs(["NC_000001.11:g.1A>G", "NC_000002.12:g.2T>C"]) is None + + +def test_join_cis_phased_hgvs_returns_none_for_mixed_coordinate_prefixes(): + assert join_cis_phased_hgvs(["NM_000001.1:c.1A>G", "NM_000001.1:g.2T>C"]) is None + + +def test_join_cis_phased_hgvs_returns_none_for_component_without_accession(): + assert join_cis_phased_hgvs(["g.1A>G", "g.2T>C"]) is None diff --git a/tests/lib/test_variants.py b/tests/lib/test_variants.py index ca9c2b0b3..9cffaa793 100644 --- a/tests/lib/test_variants.py +++ b/tests/lib/test_variants.py @@ -46,6 +46,18 @@ def test_get_hgvs_from_post_mapped_cis_phased_block(): assert result is None +def test_get_hgvs_from_post_mapped_cis_phased_block_combine_cis(): + # combine_cis collapses the cis-phased members into one bracketed expression. + result = get_hgvs_from_post_mapped(TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK, combine_cis=True) + assert result == "NM_003345:p.[Asp5Phe;Asp5Phe]" + + +def test_get_hgvs_from_post_mapped_single_allele_combine_cis_is_unbracketed(): + # A single-variant post-mapped allele is unaffected by combine_cis. + result = get_hgvs_from_post_mapped(TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, combine_cis=True) + assert result == TEST_HGVS_IDENTIFIER + + def test_get_hgvs_from_post_mapped_single_allele_vrs_1(): with pytest.raises(ValueError): get_hgvs_from_post_mapped(TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X) diff --git a/tests/lib/test_vrs_utils.py b/tests/lib/test_vrs_utils.py new file mode 100644 index 000000000..f40dce976 --- /dev/null +++ b/tests/lib/test_vrs_utils.py @@ -0,0 +1,194 @@ +# ruff: noqa: E402 + +from types import SimpleNamespace + +import pytest + +pytest.importorskip("ga4gh.vrs") + +from ga4gh.vrs.models import ( + Allele, + CisPhasedBlock, + LiteralSequenceExpression, + ReferenceLengthExpression, + SequenceLocation, + SequenceReference, +) + +from mavedb.lib import vrs_utils +from mavedb.lib.vrs_utils import ( + _rle_to_lse, + identify_variation, + normalize_and_identify, + translate_hgvs_to_variation, +) + +_SQ = "SQ." + "a" * 32 + +# A digest a reused ga4gh AlleleTranslator might leave on a component before it is +# re-identified — stale from the Merkle cache, or carried over from a sibling variant. +_STALE_ID = "ga4gh:VA." + "Z" * 32 + + +def _allele(start: int, alt: str) -> Allele: + return Allele( + location=SequenceLocation( + sequenceReference=SequenceReference(refgetAccession=_SQ), + start=start, + end=start + 1, + ), + state=LiteralSequenceExpression(sequence=alt), + ) + + +def _patch_component_translator(monkeypatch, alleles_by_hgvs: dict[str, Allele]) -> None: + """Stub the per-component single-allele translator that translate_hgvs_to_variation calls.""" + + def _fake(hgvs: str, translator=None) -> Allele: + return alleles_by_hgvs[hgvs] + + monkeypatch.setattr(vrs_utils, "translate_hgvs_to_vrs", _fake) + + +def _stub_normalize(monkeypatch) -> None: + """Make normalization a no-op so identification can be exercised without a seqrepo. + + Identification (the regression surface) runs for real; only the proxy-backed + normalize step is stubbed out. + """ + monkeypatch.setattr(vrs_utils, "normalize", lambda allele, data_proxy: allele) + + +def _translator() -> SimpleNamespace: + return SimpleNamespace(data_proxy=object()) + + +def test_single_variant_returns_a_bare_allele(monkeypatch): + allele = _allele(1000, "G") + _patch_component_translator(monkeypatch, {"NC_000001.11:g.1000A>G": allele}) + _stub_normalize(monkeypatch) + + result = translate_hgvs_to_variation("NC_000001.11:g.1000A>G", translator=_translator()) + + assert isinstance(result, Allele) + assert not isinstance(result, CisPhasedBlock) + + +def test_single_variant_is_reidentified_not_left_with_translator_digest(monkeypatch): + # ga4gh's translate_from (do_normalize=False) stamps id via plain ga4gh_identify on + # the reused translator; translate_hgvs_to_variation must re-identify so the persisted + # vrs_digest reflects current content rather than that carried-over value. + allele = _allele(1000, "G") + allele.id = _STALE_ID + _patch_component_translator(monkeypatch, {"NC_000001.11:g.1000A>G": allele}) + _stub_normalize(monkeypatch) + + result = translate_hgvs_to_variation("NC_000001.11:g.1000A>G", translator=_translator()) + + assert result.id is not None + assert result.id != _STALE_ID + assert result.id.startswith("ga4gh:VA.") + + +def test_same_position_distinct_alts_get_distinct_digests(monkeypatch): + # Regression: NC_000002.12:g.214809459A>C and g.214809459A>T are different MaveDB + # variants that cannot encode the same codon, yet were merged onto one Allele row + # (and thus one coding equivalent like NM_000465.4:c.109_111delinsCGA). The single + # component returned by the reused translator carried a stale/shared digest, which + # the digest-keyed get_or_create_allele then deduplicated. Re-identification must + # give distinct content distinct digests so they never collide. + a_c = _allele(214809458, "C") + a_t = _allele(214809458, "T") + a_c.id = a_t.id = _STALE_ID # what the unfixed path would persist for both + _patch_component_translator( + monkeypatch, + { + "NC_000002.12:g.214809459A>C": a_c, + "NC_000002.12:g.214809459A>T": a_t, + }, + ) + _stub_normalize(monkeypatch) + translator = _translator() + + res_c = translate_hgvs_to_variation("NC_000002.12:g.214809459A>C", translator=translator) + res_t = translate_hgvs_to_variation("NC_000002.12:g.214809459A>T", translator=translator) + + assert res_c.id != _STALE_ID + assert res_t.id != _STALE_ID + assert res_c.id != res_t.id + + +def test_cis_phased_multivariant_returns_an_identified_block(monkeypatch): + members = { + "NC_000001.11:g.1000A>G": _allele(1000, "G"), + "NC_000001.11:g.1002T>C": _allele(1002, "C"), + } + _patch_component_translator(monkeypatch, members) + _stub_normalize(monkeypatch) + + result = translate_hgvs_to_variation("NC_000001.11:g.[1000A>G;1002T>C]", translator=_translator()) + + assert isinstance(result, CisPhasedBlock) + assert len(result.members) == 2 + assert result.id is not None and result.id.startswith("ga4gh:CPB.") + + +def test_cis_phased_block_digest_is_order_independent(monkeypatch): + members = { + "NC_000001.11:g.1000A>G": _allele(1000, "G"), + "NC_000001.11:g.1002T>C": _allele(1002, "C"), + } + _patch_component_translator(monkeypatch, members) + _stub_normalize(monkeypatch) + + forward = translate_hgvs_to_variation("NC_000001.11:g.[1000A>G;1002T>C]", translator=_translator()) + reverse = translate_hgvs_to_variation("NC_000001.11:g.[1002T>C;1000A>G]", translator=_translator()) + + # The same biological cis-phased set dedups to one row regardless of component ordering. + assert forward.id == reverse.id + + +def test_identify_variation_clears_stale_block_digest(): + block = CisPhasedBlock(members=[_allele(1000, "G"), _allele(1002, "C")]) + block.digest = "STALE" + + digest = identify_variation(block) + + assert digest != "STALE" + assert digest.startswith("ga4gh:CPB.") # recomputed from content, not the stale cache + + +def _rle_allele(start: int, *, length: int, repeat_subunit_length: int) -> Allele: + return Allele( + location=SequenceLocation( + sequenceReference=SequenceReference(refgetAccession=_SQ), + start=start, + end=start + repeat_subunit_length, + ), + state=ReferenceLengthExpression(length=length, repeatSubunitLength=repeat_subunit_length), + ) + + +def test_rle_to_lse_tiles_repeat_subunit(): + # repeatSubunitLength=2 reads 2 bases from the proxy; length=4 tiles them out to "ACAC". + location = SequenceLocation(sequenceReference=SequenceReference(refgetAccession=_SQ), start=10, end=12) + rle = ReferenceLengthExpression(length=4, repeatSubunitLength=2) + data_proxy = SimpleNamespace(get_sequence=lambda identifier, start, end: "AC") + + result = _rle_to_lse(rle, location, data_proxy) + + assert isinstance(result, LiteralSequenceExpression) + assert result.sequence.root == "ACAC" + + +def test_normalize_and_identify_coerces_rle_to_lse(monkeypatch): + # Regression: normalization can yield an RLE for an indel, but dcd_mapping stores LSE; the + # finalize step must coerce so the same variant doesn't hash to two digests (and duplicate). + rle = _rle_allele(10, length=2, repeat_subunit_length=2) + monkeypatch.setattr(vrs_utils, "normalize", lambda allele, data_proxy: rle) + data_proxy = SimpleNamespace(get_sequence=lambda identifier, start, end: "AC") + + result = normalize_and_identify(rle, data_proxy=data_proxy) + + assert isinstance(result.state, LiteralSequenceExpression) + assert result.id is not None and result.id.startswith("ga4gh:VA.") diff --git a/tests/worker/conftest_optional.py b/tests/worker/conftest_optional.py index 0f1d2e95f..ccd4208a0 100644 --- a/tests/worker/conftest_optional.py +++ b/tests/worker/conftest_optional.py @@ -4,6 +4,7 @@ import pytest from arq import ArqRedis from cdot.hgvs.dataproviders import RESTDataProvider +from ga4gh.vrs.dataproxy import SeqRepoDataProxy from sqlalchemy.orm import Session from mavedb.worker.lib.managers.job_manager import JobManager @@ -52,6 +53,7 @@ def mock_worker_ctx(): """Create a mock worker context dictionary for testing.""" mock_redis = Mock(spec=ArqRedis) mock_hdp = Mock(spec=RESTDataProvider) + mock_seqrepo = Mock(spec=SeqRepoDataProxy) mock_pool = Mock(spec=ProcessPoolExecutor) # Don't mock the session itself to allow real DB interactions in tests @@ -60,5 +62,6 @@ def mock_worker_ctx(): return { "redis": mock_redis, "hdp": mock_hdp, + "seqrepo": mock_seqrepo, "pool": mock_pool, } diff --git a/tests/worker/jobs/conftest.py b/tests/worker/jobs/conftest.py index de835a56d..eac380862 100644 --- a/tests/worker/jobs/conftest.py +++ b/tests/worker/jobs/conftest.py @@ -44,6 +44,38 @@ def map_variants_sample_params(with_populated_domain_data, sample_score_set, sam } +@pytest.fixture +def reverse_translate_variants_sample_params(with_populated_domain_data, sample_score_set): + """Provide sample parameters for reverse_translate_variants_for_score_set job.""" + + return { + "score_set_id": sample_score_set.id, + "correlation_id": "sample-reverse-translation-correlation-id", + } + + +@pytest.fixture +def sample_independent_reverse_translation_run(reverse_translate_variants_sample_params): + """Create a JobRun instance for the reverse_translate_variants_for_score_set job.""" + + return JobRun( + urn="test:reverse_translate_variants_for_score_set", + job_type="reverse_translate_variants_for_score_set", + job_function="reverse_translate_variants_for_score_set", + max_retries=3, + retry_count=0, + job_params=reverse_translate_variants_sample_params, + ) + + +@pytest.fixture +def with_reverse_translation_run(session, sample_independent_reverse_translation_run): + """Add a reverse_translate_variants_for_score_set job run to the session.""" + + session.add(sample_independent_reverse_translation_run) + session.commit() + + @pytest.fixture def link_gnomad_variants_sample_params(with_populated_domain_data, sample_score_set): """Provide sample parameters for create_variants_for_score_set job.""" diff --git a/tests/worker/jobs/variant_processing/test_mapping.py b/tests/worker/jobs/variant_processing/test_mapping.py index dc2ac843e..3a0c6db2f 100644 --- a/tests/worker/jobs/variant_processing/test_mapping.py +++ b/tests/worker/jobs/variant_processing/test_mapping.py @@ -5,15 +5,20 @@ pytest.importorskip("arq") from asyncio.unix_events import _UnixSelectorEventLoop +from datetime import date from unittest.mock import MagicMock, patch from sqlalchemy.exc import NoResultFound from mavedb.lib.mapping import EXCLUDED_PREMAPPED_ANNOTATION_KEYS from mavedb.lib.types.workflow import JobExecutionOutcome +from mavedb.models.allele import Allele +from mavedb.models.enums.annotation_layer import AnnotationLayer from mavedb.models.enums.job_pipeline import JobStatus, PipelineStatus from mavedb.models.enums.mapping_state import MappingState -from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.mapping_record import MappingRecord +from mavedb.models.mapping_record_allele import MappingRecordAllele +from mavedb.models.target_gene_mapping import TargetGeneMapping from mavedb.models.variant import Variant from mavedb.models.variant_annotation_status import VariantAnnotationStatus from mavedb.worker.jobs.variant_processing.mapping import map_variants_for_score_set @@ -24,6 +29,25 @@ pytestmark = pytest.mark.usefixtures("patch_db_session_ctxmgr") +def authoritative_allele_for(session, mapping_record): + """Return the authoritative (post-mapped) Allele linked to a mapping record, or None. + + Under the allele data model the post-mapped VRS representation lives on an + ``Allele`` joined to the ``MappingRecord`` through ``MappingRecordAllele``. + A successfully post-mapped variant has exactly one authoritative link; a + variant that failed to post-map has a mapping record but no such link. + """ + return ( + session.query(Allele) + .join(MappingRecordAllele, MappingRecordAllele.allele_id == Allele.id) + .filter( + MappingRecordAllele.mapping_record_id == mapping_record.id, + MappingRecordAllele.is_authoritative.is_(True), + ) + .one_or_none() + ) + + @pytest.mark.unit @pytest.mark.asyncio class TestMapVariantsForScoreSetUnit: @@ -90,7 +114,10 @@ async def test_map_variants_for_score_set_no_mapped_scores( _UnixSelectorEventLoop, "run_in_executor", return_value=self.dummy_mapping_output( - {"mapped_scores": [], "error_message": "No variants were mapped for this score set"} + { + "mapped_scores": [], + "error_message": "No variants were mapped for this score set", + } ), ), ): @@ -134,7 +161,10 @@ async def test_map_variants_for_score_set_no_reference_data( _UnixSelectorEventLoop, "run_in_executor", return_value=self.dummy_mapping_output( - {"mapped_scores": [MagicMock()], "error_message": "Reference metadata missing from mapping results"} + { + "mapped_scores": [MagicMock()], + "error_message": "Reference metadata missing from mapping results", + } ), ), ): @@ -318,9 +348,9 @@ async def dummy_mapping_job(): for target in sample_score_set.target_genes: assert target.mapped_hgnc_name is None - # Verify that a mapped variant was created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 1 + # Verify that a mapping record was created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 1 # Verify that annotation statuses were created and correct annotation_statuses = ( @@ -435,9 +465,9 @@ async def dummy_mapping_job(): else: assert target.post_mapped_metadata.get("protein") is None - # Verify that a mapped variant was created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 1 + # Verify that a mapping record was created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 1 # Verify that annotation statuses were created and correct annotation_statuses = ( @@ -450,6 +480,50 @@ async def dummy_mapping_job(): assert annotation_statuses[0].annotation_type == "vrs_mapping" assert annotation_statuses[0].status == "success" + async def test_persists_cdna_target_gene_mapping_with_reference_accession_and_null_qc( + self, + session, + with_independent_processing_runs, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_score_set, + ): + """An identity cdna TargetGeneMapping (a cdna layer with no per-variant scores + joining it) persists with its reference_accession (NM_) and null QC/counts -- the + artifact reverse translation consumes to resolve the projection transcript.""" + variant = Variant( + score_set_id=sample_score_set.id, + hgvs_nt="NC_000001.11:g.1000A>G", + data={}, + ) + session.add(variant) + session.commit() + + # Genomic assay layer + an identity cdna layer (no cdna per-variant scores). + async def dummy_mapping_job(): + return await construct_mock_mapping_output( + session=session, score_set=sample_score_set, with_layers={"g", "c"} + ) + + with patch.object(_UnixSelectorEventLoop, "run_in_executor", return_value=dummy_mapping_job()): + result = await map_variants_for_score_set( + mock_worker_ctx, + sample_independent_variant_mapping_run.id, + JobManager(session, mock_worker_ctx["redis"], sample_independent_variant_mapping_run.id), + ) + assert result.status == JobStatus.SUCCEEDED + + cdna_tgms = ( + session.query(TargetGeneMapping).filter(TargetGeneMapping.alignment_level == AnnotationLayer.cdna).all() + ) + assert len(cdna_tgms) == len(sample_score_set.target_genes) + tgm = cdna_tgms[0] + assert tgm.reference_accession == "NM_999999.1" + # Identity row: no mapped_variant joins it, so QC/counts are null. + assert tgm.total_variants is None + assert tgm.variants_mapped_cleanly is None + assert tgm.percent_identity is None + async def test_map_variants_for_score_set_success_no_successful_mapping( self, session, @@ -501,13 +575,12 @@ async def dummy_mapping_job(): assert sample_score_set.mapping_state == MappingState.failed assert sample_score_set.mapping_errors["error_message"] == "All variants failed to map." - # Verify that one mapped variant was created. Although no successful mapping, an entry is still created. - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 1 + # Verify that one mapping record was created. Although no successful mapping, an entry is still created. + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 1 - # Verify that the mapped variant has no post-mapped data - mapped_variant = mapped_variants[0] - assert mapped_variant.post_mapped == {} + # Verify that the mapping record has no authoritative (post-mapped) allele + assert authoritative_allele_for(session, mapping_records[0]) is None # Verify that annotation statuses were created and correct annotation_statuses = ( @@ -584,19 +657,14 @@ async def dummy_mapping_job(): # Although only one variant was successfully mapped, verify that an entity was created # for each variant in the score set - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 2 + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 2 - # Verify that only one variant has post-mapped data - mapped_variant_with_post_data = ( - session.query(MappedVariant).filter(MappedVariant.post_mapped != {}).one_or_none() - ) - assert mapped_variant_with_post_data is not None - - mapped_variant_without_post_data = ( - session.query(MappedVariant).filter(MappedVariant.post_mapped == {}).one_or_none() - ) - assert mapped_variant_without_post_data is not None + # Verify that exactly one mapping record has post-mapped (authoritative) allele data + records_with_post_data = [r for r in mapping_records if authoritative_allele_for(session, r) is not None] + records_without_post_data = [r for r in mapping_records if authoritative_allele_for(session, r) is None] + assert len(records_with_post_data) == 1 + assert len(records_without_post_data) == 1 # Verify that annotation statuses were created and correct annotation_status_success = ( @@ -616,6 +684,96 @@ async def dummy_mapping_job(): assert len(annotation_status_failed) == 1 assert annotation_status_failed[0].annotation_type == "vrs_mapping" + async def test_map_variants_for_score_set_benign_outcomes_are_not_failures( + self, + session, + with_independent_processing_runs, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_score_set, + ): + """A score set whose only unmapped variants are benign absences (intronic / no + protein consequence) is ``complete``, not ``incomplete``/``failed``: benign + outcomes carry no allele but are skips, not failures.""" + + async def dummy_mapping_job(): + mapping_output = await construct_mock_mapping_output( + session=session, + score_set=sample_score_set, + with_gene_info=True, + with_layers={"g", "c", "p"}, + with_pre_mapped=True, + with_post_mapped=True, + with_reference_metadata=True, + with_mapped_scores=True, + with_all_variants=True, + ) + # Re-stamp the emitted records as benign absences: no allele, no failure. The + # helper only produces MAPPED/FAILED, so we override here to exercise the path. + benign_outcomes = ["intronic", "no_protein_consequence"] + for idx, mapped_score in enumerate(mapping_output["mapped_scores"]): + mapped_score["pre_mapped"] = {} + mapped_score["post_mapped"] = {} + mapped_score["outcome"] = benign_outcomes[idx % len(benign_outcomes)] + return mapping_output + + variant1 = Variant( + score_set_id=sample_score_set.id, + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + urn="variant:1", + ) + variant2 = Variant( + score_set_id=sample_score_set.id, + hgvs_nt="NM_000000.1:c.2G>T", + hgvs_pro="NP_000000.1:p.Val2Leu", + data={}, + urn="variant:2", + ) + session.add_all([variant1, variant2]) + session.commit() + + with ( + patch.object( + _UnixSelectorEventLoop, + "run_in_executor", + return_value=dummy_mapping_job(), + ), + ): + result = await map_variants_for_score_set( + mock_worker_ctx, + sample_independent_variant_mapping_run.id, + JobManager(session, mock_worker_ctx["redis"], sample_independent_variant_mapping_run.id), + ) + + # The run succeeded and the score set is complete -- nothing genuinely failed. + assert isinstance(result, JobExecutionOutcome) + assert result.status == JobStatus.SUCCEEDED + assert result.data["mapped_count"] == 0 + assert result.data["failed_count"] == 0 + assert result.data["skipped_count"] == 2 + assert sample_score_set.mapping_state == MappingState.complete + assert sample_score_set.mapping_errors is None + + # A record exists per variant, but with no authoritative allele. + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 2 + assert all(authoritative_allele_for(session, r) is None for r in mapping_records) + + # Benign outcomes are SKIPPED (not FAILED), and the finer outcome is preserved in metadata. + annotation_statuses = ( + session.query(VariantAnnotationStatus) + .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + .filter(Variant.score_set_id == sample_score_set.id) + .all() + ) + assert len(annotation_statuses) == 2 + assert all(s.status == "skipped" for s in annotation_statuses) + assert all(s.failure_category is None for s in annotation_statuses) + recorded_outcomes = {s.annotation_metadata["outcome"] for s in annotation_statuses} + assert recorded_outcomes == {"intronic", "no_protein_consequence"} + async def test_map_variants_for_score_set_complete_mapping( self, session, @@ -678,17 +836,17 @@ async def dummy_mapping_job(): assert sample_score_set.mapping_state == MappingState.complete assert sample_score_set.mapping_errors is None - # Verify that mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 2 + # Verify that mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 2 # Verify that both variants have post-mapped data. I'm comfortable assuming the # data is correct given our layer permutation tests above. for urn in ["variant:1", "variant:2"]: - mapped_variant = session.query(MappedVariant).filter(MappedVariant.variant.has(urn=urn)).one_or_none() - assert mapped_variant is not None - assert mapped_variant.post_mapped != {} - assert mapped_variant.hgvs_assay_level is not None + mapping_record = session.query(MappingRecord).filter(MappingRecord.variant.has(urn=urn)).one_or_none() + assert mapping_record is not None + assert authoritative_allele_for(session, mapping_record) is not None + assert mapping_record.hgvs_assay_level is not None # Verify that annotation statuses were created and correct annotation_statuses = ( @@ -733,13 +891,23 @@ async def dummy_mapping_job(): ) session.add(variant) session.commit() - mapped_variant = MappedVariant( + mapping_record = MappingRecord( variant_id=variant.id, - current=True, - mapped_date="2023-01-01T00:00:00Z", + assay_level=AnnotationLayer.genomic, + mapped_date=date(2023, 1, 1), mapping_api_version="v1.0.0", ) - session.add(mapped_variant) + session.add(mapping_record) + session.commit() + # Link the prior record to an authoritative allele. Re-mapping must retire this link too, + # not just the record — a live link dangling off a retired record breaks the temporal model. + prior_allele = Allele(vrs_digest="ga4gh:VA.prior", level=AnnotationLayer.genomic.value) + session.add(prior_allele) + session.commit() + prior_link = MappingRecordAllele( + mapping_record_id=mapping_record.id, allele_id=prior_allele.id, is_authoritative=True + ) + session.add(prior_link) session.commit() variant_annotation_status = VariantAnnotationStatus( variant_id=variant.id, current=True, annotation_type="vrs_mapping", status="success" @@ -766,31 +934,42 @@ async def dummy_mapping_job(): assert sample_score_set.mapping_state == MappingState.complete assert sample_score_set.mapping_errors is None - # Verify the existing mapped variant was marked as non-current - non_current_mapped_variant = ( - session.query(MappedVariant) - .filter(MappedVariant.id == mapped_variant.id, MappedVariant.current.is_(False)) + # Verify the existing mapping record was retired (valid_to closed) + non_current_mapping_record = ( + session.query(MappingRecord) + .filter(MappingRecord.id == mapping_record.id, MappingRecord.valid_to.isnot(None)) .one_or_none() ) - assert non_current_mapped_variant is not None + assert non_current_mapping_record is not None + + # Verify the prior record's authoritative allele link was retired alongside the record, + # so no live link dangles off the retired record. + session.refresh(prior_link) + assert prior_link.valid_to is not None - # Verify a new mapped variant entry was created - new_mapped_variant = ( - session.query(MappedVariant) - .filter(MappedVariant.variant_id == variant.id, MappedVariant.current.is_(True)) + # Verify a new, live mapping record entry was created + new_mapping_record = ( + session.query(MappingRecord) + .filter(MappingRecord.variant_id == variant.id, MappingRecord.current) .one_or_none() ) - assert new_mapped_variant is not None + assert new_mapping_record is not None + + # Gap-free handoff: the prior record closed at exactly the new record's valid_from (one + # supersession timestamp), and the cascade closed the prior link at that same instant — so + # no point-in-time query lands in a hole between the old and new record or their links. + assert non_current_mapping_record.valid_to == new_mapping_record.valid_from + assert prior_link.valid_to == non_current_mapping_record.valid_to - # Verify that the new mapped variant has updated mapping data - assert new_mapped_variant.mapped_date != "2023-01-01T00:00:00Z" - assert new_mapped_variant.mapping_api_version != "v1.0.0" + # Verify that the new mapping record has updated mapping data + assert new_mapping_record.mapped_date != date(2023, 1, 1) + assert new_mapping_record.mapping_api_version != "v1.0.0" # Verify the non-current annotation status still exists old_annotation_status = ( session.query(VariantAnnotationStatus) .filter( - VariantAnnotationStatus.variant_id == non_current_mapped_variant.variant_id, + VariantAnnotationStatus.variant_id == non_current_mapping_record.variant_id, VariantAnnotationStatus.current.is_(False), ) .one_or_none() @@ -862,9 +1041,9 @@ async def dummy_mapping_job(): assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - # Verify that mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 4 + # Verify that mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 4 # Verify score set mapping state assert sample_score_set.mapping_state == MappingState.complete @@ -875,11 +1054,11 @@ async def dummy_mapping_job(): assert target.mapped_hgnc_name is not None assert target.post_mapped_metadata is not None - # Verify that each variant has a corresponding mapped variant + # Verify that each variant has a corresponding mapping record variants = ( session.query(Variant) - .join(MappedVariant, MappedVariant.variant_id == Variant.id) - .filter(Variant.score_set_id == sample_score_set.id, MappedVariant.current.is_(True)) + .join(MappingRecord, MappingRecord.variant_id == Variant.id) + .filter(Variant.score_set_id == sample_score_set.id, MappingRecord.current.is_(True)) .all() ) assert len(variants) == 4 @@ -953,9 +1132,9 @@ async def dummy_mapping_job(): assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - # Verify that mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 4 + # Verify that mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 4 # Verify score set mapping state assert sample_score_set.mapping_state == MappingState.complete @@ -966,11 +1145,11 @@ async def dummy_mapping_job(): assert target.mapped_hgnc_name is not None assert target.post_mapped_metadata is not None - # Verify that each variant has a corresponding mapped variant + # Verify that each variant has a corresponding mapping record variants = ( session.query(Variant) - .join(MappedVariant, MappedVariant.variant_id == Variant.id) - .filter(Variant.score_set_id == sample_score_set.id, MappedVariant.current.is_(True)) + .join(MappingRecord, MappingRecord.variant_id == Variant.id) + .filter(Variant.score_set_id == sample_score_set.id, MappingRecord.current.is_(True)) .all() ) assert len(variants) == 4 @@ -1053,9 +1232,9 @@ async def dummy_mapping_job(): in sample_score_set.mapping_errors["error_message"] ) - # Verify that no mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 0 + # Verify that no mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 0 # Verify that no annotation statuses were created annotation_statuses = session.query(VariantAnnotationStatus).all() @@ -1130,9 +1309,9 @@ async def dummy_mapping_job(): # Error message originates from our mock mapping construction function assert "test error: no mapped scores" in sample_score_set.mapping_errors["error_message"] - # Verify that no mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 0 + # Verify that no mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 0 # Verify that no annotation statuses were created annotation_statuses = session.query(VariantAnnotationStatus).all() @@ -1206,9 +1385,9 @@ async def dummy_mapping_job(): assert sample_score_set.mapping_errors is not None assert "Reference metadata missing from mapping results" in sample_score_set.mapping_errors["error_message"] - # Verify that no mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 0 + # Verify that no mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 0 # Verify that no annotation statuses were created annotation_statuses = session.query(VariantAnnotationStatus).all() @@ -1246,20 +1425,20 @@ async def test_map_variants_for_score_set_updates_current_mapped_variants( sample_independent_variant_creation_run, ) - # Associate mapped variants with all variants just created in the score set + # Associate mapping records with all variants just created in the score set variants = session.query(Variant).filter(Variant.score_set_id == sample_score_set.id).all() for variant in variants: - mapped_variant = MappedVariant( + mapping_record = MappingRecord( variant_id=variant.id, - current=True, - mapped_date="2023-01-01T00:00:00Z", + assay_level=AnnotationLayer.genomic, + mapped_date=date(2023, 1, 1), mapping_api_version="v1.0.0", ) annotation_status = VariantAnnotationStatus( variant_id=variant.id, current=True, annotation_type="vrs_mapping", status="success" ) session.add(annotation_status) - session.add(mapped_variant) + session.add(mapping_record) session.commit() async def dummy_mapping_job(): @@ -1295,27 +1474,27 @@ async def dummy_mapping_job(): assert sample_score_set.mapping_state == MappingState.complete assert sample_score_set.mapping_errors is None - # Verify that mapped variants were marked as non-current and new entries created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == len(variants) * 2 # Each variant has two mapped entries now + # Verify that mapping records were marked as non-current and new entries created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == len(variants) * 2 # Each variant has two mapping records now for variant in variants: - non_current_mapped_variant = ( - session.query(MappedVariant) - .filter(MappedVariant.variant_id == variant.id, MappedVariant.current.is_(False)) + non_current_mapping_record = ( + session.query(MappingRecord) + .filter(MappingRecord.variant_id == variant.id, MappingRecord.valid_to.isnot(None)) .one_or_none() ) - assert non_current_mapped_variant is not None + assert non_current_mapping_record is not None - new_mapped_variant = ( - session.query(MappedVariant) - .filter(MappedVariant.variant_id == variant.id, MappedVariant.current.is_(True)) + new_mapping_record = ( + session.query(MappingRecord) + .filter(MappingRecord.variant_id == variant.id, MappingRecord.current) .one_or_none() ) - assert new_mapped_variant is not None + assert new_mapping_record is not None - # Verify that the new mapped variant has updated mapping data - assert new_mapped_variant.mapped_date != "2023-01-01T00:00:00Z" - assert new_mapped_variant.mapping_api_version != "v1.0.0" + # Verify that the new mapping record has updated mapping data + assert new_mapping_record.mapped_date != date(2023, 1, 1) + assert new_mapping_record.mapping_api_version != "v1.0.0" # Verify that annotation statuses where marked as non-current and new entries created annotation_statuses = session.query(VariantAnnotationStatus).all() @@ -1389,9 +1568,9 @@ async def dummy_mapping_job(): assert sample_score_set.mapping_errors is not None assert "test error: no mapped scores" in sample_score_set.mapping_errors["error_message"] - # Verify that no mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 0 + # Verify that no mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 0 # Verify that no annotation statuses were created annotation_statuses = session.query(VariantAnnotationStatus).all() @@ -1448,9 +1627,9 @@ async def dummy_mapping_job(): in sample_score_set.mapping_errors["error_message"] ) - # Verify that no mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 0 + # Verify that no mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 0 # Verify that no annotation statuses were created annotation_statuses = session.query(VariantAnnotationStatus).all() @@ -1518,19 +1697,19 @@ async def dummy_mapping_job(): await arq_worker.async_run() await arq_worker.run_check() - # Verify that mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 4 + # Verify that mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 4 # Verify score set mapping state assert sample_score_set.mapping_state == MappingState.complete assert sample_score_set.mapping_errors is None - # Verify that each variant has a corresponding mapped variant + # Verify that each variant has a corresponding mapping record variants = ( session.query(Variant) - .join(MappedVariant, MappedVariant.variant_id == Variant.id) - .filter(Variant.score_set_id == sample_score_set.id, MappedVariant.current.is_(True)) + .join(MappingRecord, MappingRecord.variant_id == Variant.id) + .filter(Variant.score_set_id == sample_score_set.id, MappingRecord.current.is_(True)) .all() ) assert len(variants) == 4 @@ -1606,19 +1785,19 @@ async def dummy_mapping_job(): await arq_worker.async_run() await arq_worker.run_check() - # Verify that mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 4 + # Verify that mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 4 # Verify score set mapping state assert sample_score_set.mapping_state == MappingState.complete assert sample_score_set.mapping_errors is None - # Verify that each variant has a corresponding mapped variant + # Verify that each variant has a corresponding mapping record variants = ( session.query(Variant) - .join(MappedVariant, MappedVariant.variant_id == Variant.id) - .filter(Variant.score_set_id == sample_score_set.id, MappedVariant.current.is_(True)) + .join(MappingRecord, MappingRecord.variant_id == Variant.id) + .filter(Variant.score_set_id == sample_score_set.id, MappingRecord.current.is_(True)) .all() ) assert len(variants) == 4 @@ -1690,9 +1869,9 @@ async def dummy_mapping_job(): in sample_score_set.mapping_errors["error_message"] ) - # Verify that no mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 0 + # Verify that no mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 0 # Verify that no annotation statuses were created annotation_statuses = session.query(VariantAnnotationStatus).all() @@ -1744,9 +1923,9 @@ async def dummy_mapping_job(): in sample_score_set.mapping_errors["error_message"] ) - # Verify that no mapped variants were created - mapped_variants = session.query(MappedVariant).all() - assert len(mapped_variants) == 0 + # Verify that no mapping records were created + mapping_records = session.query(MappingRecord).all() + assert len(mapping_records) == 0 # Verify that no annotation statuses were created annotation_statuses = session.query(VariantAnnotationStatus).all() diff --git a/tests/worker/jobs/variant_processing/test_reverse_translation.py b/tests/worker/jobs/variant_processing/test_reverse_translation.py new file mode 100644 index 000000000..ec926f267 --- /dev/null +++ b/tests/worker/jobs/variant_processing/test_reverse_translation.py @@ -0,0 +1,1228 @@ +# ruff: noqa: E402 + +import pytest + +pytest.importorskip("arq") + +import contextlib +from asyncio.unix_events import _UnixSelectorEventLoop +from datetime import timedelta +from unittest.mock import MagicMock, patch + +from variant_annotation.lib.translation.types import TranslationError, TranslationResult, WtCodonMode + +from mavedb.lib.types.workflow import JobExecutionOutcome +from mavedb.models.allele import Allele +from mavedb.models.enums.annotation_layer import AnnotationLayer +from mavedb.models.enums.job_pipeline import JobStatus +from mavedb.models.enums.target_category import TargetCategory +from mavedb.models.job_run import JobRun +from mavedb.models.mapping_record import MappingRecord +from mavedb.models.mapping_record_allele import MappingRecordAllele +from mavedb.models.target_gene_mapping import TargetGeneMapping +from mavedb.models.variant import Variant +from mavedb.models.variant_annotation_status import VariantAnnotationStatus +from mavedb.worker.jobs.variant_processing.mapping import map_variants_for_score_set +from mavedb.worker.jobs.variant_processing.reverse_translation import ( + _build_translation_config, + reverse_translate_variants_for_score_set, +) +from mavedb.worker.lib.managers.job_manager import JobManager +from tests.helpers.constants import TEST_GA4GH_IDENTIFIER +from tests.helpers.util.setup.worker import construct_mock_mapping_output + +pytestmark = pytest.mark.usefixtures("patch_db_session_ctxmgr") + +# Module under test, used as the patch root for the external annotation-package calls. +RT_MODULE = "mavedb.worker.jobs.variant_processing.reverse_translation" + + +class FakeVrsVariation: + """Stand-in for a ga4gh.vrs variation returned by translate_hgvs_to_variation. + + The reverse translation job only reads ``.id`` (for the VRS digest / dedup key) + and ``.model_dump()`` (persisted into the ``post_mapped`` JSONB column), so the + fake exposes exactly those. ``vrs_type`` lets a test assert whether a candidate + became a single Allele or a CisPhasedBlock. + """ + + def __init__(self, vrs_id: str, vrs_type: str = "Allele"): + self.id = vrs_id + self.vrs_type = vrs_type + + def model_dump(self) -> dict: + return {"type": self.vrs_type, "id": self.id} + + +def fake_construct(results_by_hgvs: dict, errors_by_hgvs: dict | None = None): + """Build a stand-in for ``construct_equivalent_variants``. + + The job correlates each TranslationResult/TranslationError back to its originating + mapping record via object identity on ``result.input``, so the fake must echo the + exact VariantInput instances it was handed. Candidates are keyed by the input's + assay-level HGVS string. + """ + errors_by_hgvs = errors_by_hgvs or {} + + def _construct(inputs, *, transcripts, coordinates, config): + results, errors = [], [] + for inp in inputs: + if inp.hgvs in errors_by_hgvs: + errors.append(TranslationError(input=inp, error=errors_by_hgvs[inp.hgvs])) + elif inp.hgvs in results_by_hgvs: + entry = results_by_hgvs[inp.hgvs] + c_candidates, g_candidates = entry[0], entry[1] + hgvs_p = entry[2] if len(entry) > 2 else None + results.append( + TranslationResult( + input=inp, + hgvs_c_candidates=list(c_candidates), + hgvs_g_candidates=list(g_candidates), + hgvs_p=hgvs_p, + ) + ) + else: + errors.append(TranslationError(input=inp, error=f"no stub result for {inp.hgvs!r}")) + return results, errors + + return _construct + + +def fake_translate(id_by_hgvs: dict, type_by_hgvs: dict | None = None, errors_by_hgvs: dict | None = None): + """Build a stand-in for ``translate_hgvs_to_variation`` mapping candidate HGVS -> VRS id. + + Unknown HGVS get a deterministic id derived from the string so distinct candidates + stay distinct; supplying explicit ids lets a test force two candidates to collapse. + ``type_by_hgvs`` lets a test mark a bracketed candidate as a ``CisPhasedBlock`` (the + real translate_hgvs_to_variation returns one for cis-phased multivariants). + ``errors_by_hgvs`` makes a candidate raise, standing in for an HGVS form ga4gh cannot + translate to VRS. + """ + type_by_hgvs = type_by_hgvs or {} + errors_by_hgvs = errors_by_hgvs or {} + + def _translate(hgvs: str, translator=None) -> FakeVrsVariation: + if hgvs in errors_by_hgvs: + raise ValueError(errors_by_hgvs[hgvs]) + return FakeVrsVariation( + id_by_hgvs.get(hgvs, f"ga4gh:VA.{hgvs}"), + type_by_hgvs.get(hgvs, "Allele"), + ) + + return _translate + + +async def _map_variants(session, mock_worker_ctx, mapping_run, score_set, with_layers=frozenset({"g", "c", "p"})): + """Run the (mocked) mapping job so the score set's variants gain authoritative + MappingRecords/Alleles — the real inputs the reverse translation job reads.""" + + async def dummy_mapping_job(): + return await construct_mock_mapping_output(session=session, score_set=score_set, with_layers=with_layers) + + with patch.object(_UnixSelectorEventLoop, "run_in_executor", return_value=dummy_mapping_job()): + result = await map_variants_for_score_set( + mock_worker_ctx, + mapping_run.id, + JobManager(session, mock_worker_ctx["redis"], mapping_run.id), + ) + + assert result.status == JobStatus.SUCCEEDED + session.commit() + + +async def _reverse_translate(session, mock_worker_ctx, rt_run): + # The job opens a UTA-backed TranscriptSource to back codon_at (WtCodonMode.ALL). + # construct_equivalent_variants is mocked in these tests, so the source is never + # queried -- stub the factory with a no-op context yielding a dummy client. + with patch(f"{RT_MODULE}.uta_transcript_source", lambda: contextlib.nullcontext(MagicMock())): + return await reverse_translate_variants_for_score_set( + mock_worker_ctx, + rt_run.id, + JobManager(session, mock_worker_ctx["redis"], rt_run.id), + ) + + +def _cross_level_statuses(session, score_set_id, status=None): + query = ( + session.query(VariantAnnotationStatus) + .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + .filter( + Variant.score_set_id == score_set_id, + VariantAnnotationStatus.annotation_type == "cross_level_translation", + ) + ) + if status is not None: + query = query.filter(VariantAnnotationStatus.status == status) + return query.all() + + +def _non_authoritative_links(session): + return session.query(MappingRecordAllele).filter(MappingRecordAllele.is_authoritative.is_(False)).all() + + +@pytest.mark.unit +class TestBuildTranslationConfig: + """Unit tests for _build_translation_config — the optional translation_config job param.""" + + def test_none_uses_job_defaults(self): + config = _build_translation_config(None) + assert config.include_indels is True + assert config.wt_codon_mode == WtCodonMode.ALL + + def test_empty_dict_uses_job_defaults(self): + config = _build_translation_config({}) + assert config.include_indels is True + assert config.wt_codon_mode == WtCodonMode.ALL + + def test_overrides_win_and_string_wt_codon_mode_is_coerced(self): + config = _build_translation_config({"wt_codon_mode": "unambiguous", "max_indel_size": 7}) + assert config.wt_codon_mode == WtCodonMode.UNAMBIGUOUS + assert config.max_indel_size == 7 + assert config.include_indels is True # untouched default preserved + + def test_can_disable_indels_with_none_mode(self): + config = _build_translation_config({"wt_codon_mode": "none", "include_indels": False}) + assert config.wt_codon_mode == WtCodonMode.NONE + assert config.include_indels is False + + def test_rejects_invalid_wt_codon_mode_with_actionable_message(self): + with pytest.raises(ValueError, match=r"wt_codon_mode 'bogus'.*Valid values"): + _build_translation_config({"wt_codon_mode": "bogus"}) + + def test_rejects_wt_codon_mode_without_indels(self): + # TranslationConfig enforces: a wt_codon_mode other than "none" requires include_indels. + with pytest.raises(ValueError, match="translation_config"): + _build_translation_config({"wt_codon_mode": "all", "include_indels": False}) + + def test_rejects_unknown_field_and_lists_valid_options(self): + with pytest.raises( + ValueError, match=r"Unknown translation_config option\(s\): not_a_real_field.*Valid options" + ): + _build_translation_config({"not_a_real_field": 1}) + + +@pytest.mark.unit +@pytest.mark.asyncio +class TestReverseTranslateVariantsForScoreSetUnit: + """Unit tests for the reverse_translate_variants_for_score_set job.""" + + async def test_no_mapping_records_is_a_noop( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """With no current, post-mapped mapping records there is nothing to translate.""" + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert isinstance(result, JobExecutionOutcome) + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 0, "failed": 0, "skipped": 0, "alleles_created": 0} + + # No candidate alleles, links, or annotations were produced. + assert session.query(Allele).count() == 0 + assert _non_authoritative_links(session) == [] + assert _cross_level_statuses(session, sample_score_set.id) == [] + + async def test_creates_genomic_and_coding_candidate_alleles( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A mapped variant expands into one non-authoritative allele per equivalence-class + candidate, each linked to the originating mapping record.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + g_candidate = "NC_000001.11:g.1000A>G" + c_candidate = "NM_000001.1:c.5A>G" + construct = fake_construct({assay_hgvs: ([c_candidate], [g_candidate])}) + translate = fake_translate({g_candidate: "ga4gh:VA.genomic", c_candidate: "ga4gh:VA.coding"}) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 1, "failed": 0, "skipped": 0, "alleles_created": 2} + + # Two non-authoritative links were created, both for our one mapping record. + non_auth_links = _non_authoritative_links(session) + assert len(non_auth_links) == 2 + + genomic_allele = session.query(Allele).filter(Allele.vrs_digest == "ga4gh:VA.genomic").one() + assert genomic_allele.level == AnnotationLayer.genomic.value + assert genomic_allele.hgvs_g == g_candidate + assert genomic_allele.hgvs_c is None + assert genomic_allele.transcript == "NC_000001.11" + assert genomic_allele.post_mapped == {"type": "Allele", "id": "ga4gh:VA.genomic"} + + coding_allele = session.query(Allele).filter(Allele.vrs_digest == "ga4gh:VA.coding").one() + assert coding_allele.level == AnnotationLayer.cdna.value + assert coding_allele.hgvs_c == c_candidate + assert coding_allele.hgvs_g is None + assert coding_allele.transcript == "NM_000001.1" + + # The candidate alleles are linked to the same mapping record as the authoritative allele. + mapping_record = session.query(MappingRecord).filter(MappingRecord.variant_id == variant.id).one() + assert {link.mapping_record_id for link in non_auth_links} == {mapping_record.id} + + statuses = _cross_level_statuses(session, sample_score_set.id) + assert len(statuses) == 1 + assert statuses[0].status == "success" + + async def test_transcript_is_queryable_via_derived_expression( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """Allele.transcript is a derived hybrid_property — filtering on it in SQL resolves to + the accession of the populated HGVS column, with no stored transcript column.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + g_candidate = "NC_000001.11:g.1000A>G" + c_candidate = "NM_000001.1:c.5A>G" + construct = fake_construct({assay_hgvs: ([c_candidate], [g_candidate])}) + translate = fake_translate({g_candidate: "ga4gh:VA.genomic", c_candidate: "ga4gh:VA.coding"}) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + # The SQL expression derives the accession from the populated HGVS column. + genomic = session.query(Allele).filter(Allele.transcript == "NC_000001.11").all() + assert [a.vrs_digest for a in genomic] == ["ga4gh:VA.genomic"] + coding = session.query(Allele).filter(Allele.transcript == "NM_000001.1").all() + assert [a.vrs_digest for a in coding] == ["ga4gh:VA.coding"] + + async def test_protein_consequence_is_persisted_as_a_protein_allele( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """The deterministic protein consequence (``result.hgvs_p``) is emitted as the + protein-level member of the equivalence set -- a non-authoritative ``level=protein`` + allele linked to the record, not dropped.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + g_candidate = "NC_000001.11:g.1000A>G" + c_candidate = "NM_000001.1:c.5A>G" + # c_to_p emits a predicted consequence in parens; the job strips them before use. + p_consequence = "NP_000001.1:p.(Met1Val)" + p_stripped = "NP_000001.1:p.Met1Val" + construct = fake_construct({assay_hgvs: ([c_candidate], [g_candidate], p_consequence)}) + translate = fake_translate( + { + g_candidate: "ga4gh:VA.genomic", + c_candidate: "ga4gh:VA.coding", + p_stripped: "ga4gh:VA.protein", # the stripped form is what reaches the translator + } + ) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + protein_allele = session.query(Allele).filter(Allele.vrs_digest == "ga4gh:VA.protein").one() + assert protein_allele.level == AnnotationLayer.protein.value + assert protein_allele.hgvs_p == p_stripped # stored without the prediction parens + # Linked to the record as a non-authoritative member of the equivalence set. + assert protein_allele.id in {link.allele_id for link in _non_authoritative_links(session)} + + async def test_cis_phased_multivariant_candidate_is_persisted_as_a_block( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A bracketed genomic candidate (non-adjacent codon components) is translated into a + CisPhasedBlock and persisted like any other candidate allele, keyed by its CPB digest.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + block_candidate = "NC_000001.11:g.[1000A>G;1002T>C]" + construct = fake_construct({assay_hgvs: ([], [block_candidate])}) + translate = fake_translate( + {block_candidate: "ga4gh:CPB.block"}, + type_by_hgvs={block_candidate: "CisPhasedBlock"}, + ) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 1, "failed": 0, "skipped": 0, "alleles_created": 1} + + block_allele = session.query(Allele).filter(Allele.vrs_digest == "ga4gh:CPB.block").one() + assert block_allele.level == AnnotationLayer.genomic.value + # The full bracketed expression is stored verbatim; its accession anchors the row. + assert block_allele.hgvs_g == block_candidate + assert block_allele.transcript == "NC_000001.11" + assert block_allele.post_mapped == {"type": "CisPhasedBlock", "id": "ga4gh:CPB.block"} + + assert len(_non_authoritative_links(session)) == 1 + + async def test_duplicate_candidate_digests_are_deduped_per_record( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """When a coding and genomic candidate resolve to the same VRS digest, only one + allele/link is written for that mapping record (per the library's dedup contract).""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + g_candidate = "NC_000001.11:g.1000A>G" + c_candidate = "NM_000001.1:c.5A>G" + construct = fake_construct({assay_hgvs: ([c_candidate], [g_candidate])}) + # Both candidates collapse to the same VRS digest. + translate = fake_translate({g_candidate: "ga4gh:VA.shared", c_candidate: "ga4gh:VA.shared"}) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 1, "failed": 0, "skipped": 0, "alleles_created": 1} + assert len(_non_authoritative_links(session)) == 1 + assert session.query(Allele).filter(Allele.vrs_digest == "ga4gh:VA.shared").count() == 1 + + async def test_candidate_equal_to_authoritative_allele_is_not_relinked( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A derived candidate whose digest equals the record's authoritative (assay-measured) + allele is not linked again — the record already links that allele authoritatively, and a + derived duplicate would surface the measured variant twice. No new link and no new allele; + an empty derived set is still a success with nothing to add.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + # Exactly one allele exists after mapping: the authoritative post-mapped allele. + assert session.query(Allele).count() == 1 + authoritative_allele = session.query(Allele).one() + assert authoritative_allele.vrs_digest == TEST_GA4GH_IDENTIFIER + + assay_hgvs = "NM_000000.1:c.1A>G" + c_candidate = "NM_000001.1:c.5A>G" + construct = fake_construct({assay_hgvs: ([c_candidate], [])}) + # The candidate resolves to the same digest as the authoritative allele. + translate = fake_translate({c_candidate: TEST_GA4GH_IDENTIFIER}) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 1, "failed": 0, "skipped": 0, "alleles_created": 0} + + # The authoritative allele was reused, not duplicated. + assert session.query(Allele).count() == 1 + # No derived link was added: the candidate equals the record's authoritative allele. + assert _non_authoritative_links(session) == [] + + async def test_independent_rerun_retires_prior_links_and_keeps_current_stable( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """An independent second run is idempotent: it retires the prior derived links (closing + valid_to) and writes fresh live ones, so the set of *current* non-authoritative links is + stable while the superseded links are retained as history — the partial unique index on + live links is never violated and no alleles are duplicated.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + g_candidate = "NC_000001.11:g.1000A>G" + c_candidate = "NM_000001.1:c.5A>G" + construct = fake_construct({assay_hgvs: ([c_candidate], [g_candidate])}) + translate = fake_translate({g_candidate: "ga4gh:VA.genomic", c_candidate: "ga4gh:VA.coding"}) + + # First run: two live derived links. + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + first = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert first.status == JobStatus.SUCCEEDED + first_links = _non_authoritative_links(session) + assert len(first_links) == 2 + assert all(link.valid_to is None for link in first_links) + + # Second, independent run with identical inputs. + rerun = JobRun( + urn="test:reverse_translate_variants_for_score_set:rerun", + job_type="reverse_translate_variants_for_score_set", + job_function="reverse_translate_variants_for_score_set", + max_retries=3, + retry_count=0, + job_params=dict(sample_independent_reverse_translation_run.job_params), + ) + session.add(rerun) + session.commit() + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + second = await _reverse_translate(session, mock_worker_ctx, rerun) + + assert second.status == JobStatus.SUCCEEDED + + links = _non_authoritative_links(session) + live = [link for link in links if link.valid_to is None] + retired = [link for link in links if link.valid_to is not None] + + # The live set is stable (still exactly the two candidates) and the prior run's links are + # retained as closed history rather than deleted. + assert len(live) == 2 + assert len(retired) == 2 + + # Gap-free handoff: the prior links closed at exactly the instant the new links opened, under + # one supersession timestamp — so a point-in-time query never lands in a hole between runs, + # even though the translation loop ran (and could have committed) between retire and insert. + assert {link.valid_to for link in retired} == {link.valid_from for link in live} + assert len({link.valid_from for link in live}) == 1 + + # The two candidate alleles are reused across runs, not duplicated. + assert session.query(Allele).filter(Allele.vrs_digest.in_(["ga4gh:VA.genomic", "ga4gh:VA.coding"])).count() == 2 + + # The cross-level translation status is versioned the same way: prior retired, one current. + statuses = _cross_level_statuses(session, sample_score_set.id) + assert len(statuses) == 2 + assert len([s for s in statuses if s.current]) == 1 + + async def test_all_failures_fail_the_job( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """If every variant's translation errors, the job fails and records FAILED annotations.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + construct = fake_construct({}, errors_by_hgvs={assay_hgvs: "forward translation failed"}) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", fake_translate({})), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.FAILED + assert result.data == {"translated": 0, "failed": 1, "skipped": 0, "alleles_created": 0} + + assert _non_authoritative_links(session) == [] + failed_statuses = _cross_level_statuses(session, sample_score_set.id, status="failed") + assert len(failed_statuses) == 1 + assert failed_statuses[0].error_message == "forward translation failed" + # Library/input-level failures still carry the assay-level HGVS as metadata. + assert failed_statuses[0].annotation_metadata == {"hgvs_input": assay_hgvs} + + async def test_partial_success_succeeds_with_mixed_annotations( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """One variant translating and another erroring yields a SUCCEEDED job with both a + success and a failure annotation.""" + variant_ok = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + variant_err = Variant( + score_set_id=sample_score_set.id, + urn="variant:2", + hgvs_nt="NM_000000.1:c.2G>T", + hgvs_pro="NP_000000.1:p.Val2Leu", + data={}, + ) + session.add_all([variant_ok, variant_err]) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + g_candidate = "NC_000001.11:g.1000A>G" + construct = fake_construct( + {"NM_000000.1:c.1A>G": ([], [g_candidate])}, + errors_by_hgvs={"NM_000000.1:c.2G>T": "no consequence"}, + ) + translate = fake_translate({g_candidate: "ga4gh:VA.genomic"}) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 1, "failed": 1, "skipped": 0, "alleles_created": 1} + + assert len(_cross_level_statuses(session, sample_score_set.id, status="success")) == 1 + assert len(_cross_level_statuses(session, sample_score_set.id, status="failed")) == 1 + assert len(_non_authoritative_links(session)) == 1 + + async def test_partial_candidate_translation_failure_keeps_success_with_metadata( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """One candidate translating and another failing VRS translation leaves the variant a + SUCCESS (one allele created) while retaining the dropped candidate in metadata.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + good_candidate = "NC_000001.11:g.1000A>G" + bad_candidate = "NC_000001.11:g.999A>T" + construct = fake_construct({assay_hgvs: ([], [good_candidate, bad_candidate])}) + translate = fake_translate( + {good_candidate: "ga4gh:VA.genomic"}, + errors_by_hgvs={bad_candidate: "untranslatable form"}, + ) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 1, "failed": 0, "skipped": 0, "alleles_created": 1} + assert len(_non_authoritative_links(session)) == 1 + + statuses = _cross_level_statuses(session, sample_score_set.id, status="success") + assert len(statuses) == 1 + failed_candidates = statuses[0].annotation_metadata["failed_candidates"] + assert failed_candidates == [{"hgvs": bad_candidate, "level": "genomic", "error": "untranslatable form"}] + + async def test_all_candidates_failing_translation_marks_variant_failed( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """When every candidate fails VRS translation, the variant is FAILED, no allele is + created, and the per-candidate errors are retained in metadata.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + bad_candidate = "NC_000001.11:g.999A>T" + construct = fake_construct({assay_hgvs: ([], [bad_candidate])}) + translate = fake_translate({}, errors_by_hgvs={bad_candidate: "untranslatable form"}) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.FAILED + assert result.data == {"translated": 0, "failed": 1, "skipped": 0, "alleles_created": 0} + assert _non_authoritative_links(session) == [] + + failed_statuses = _cross_level_statuses(session, sample_score_set.id, status="failed") + assert len(failed_statuses) == 1 + assert failed_statuses[0].error_message == "All candidate HGVS failed VRS translation." + metadata = failed_statuses[0].annotation_metadata + assert metadata["hgvs_input"] == assay_hgvs + assert metadata["failed_candidates"] == [ + {"hgvs": bad_candidate, "level": "genomic", "error": "untranslatable form"} + ] + + async def test_no_coding_transcript_is_skipped_not_failed( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A target gene aligned only at the genomic level has no coding transcript, so its + variants are SKIPPED (no protein consequence) rather than attempted and failed.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NC_000001.11:g.1000A>G", + data={}, + ) + session.add(variant) + session.commit() + # Genomic-only mapping: no cdna TargetGeneMapping, so no coding transcript to anchor on. + await _map_variants( + session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set, with_layers={"g"} + ) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", fake_construct({})), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", fake_translate({})), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 0, "failed": 0, "skipped": 1, "alleles_created": 0} + + # No translation attempted: no candidate alleles or links, and the variant is + # recorded as SKIPPED rather than FAILED. + assert _non_authoritative_links(session) == [] + assert _cross_level_statuses(session, sample_score_set.id, status="failed") == [] + assert len(_cross_level_statuses(session, sample_score_set.id, status="skipped")) == 1 + + async def test_genomic_accession_coding_target_is_reverse_translated( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A genomic-accession (NC_:g.) coding variant -- previously skipped for want of a + coding transcript -- is reverse-translated once the mapper emits a cdna + TargetGeneMapping, whose reference_accession anchors the projection.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NC_000001.11:g.1000A>G", + data={}, + ) + session.add(variant) + session.commit() + # Genomic assay layer + cdna identity TargetGeneMapping (carrying the NM_); the + # genomic variant projects onto that transcript for reverse translation. + await _map_variants( + session, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_score_set, + with_layers={"g", "c"}, + ) + + assay_hgvs = "NC_000001.11:g.1000A>G" + g_candidate = "NC_000001.11:g.1000A>G" + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", fake_construct({assay_hgvs: ([], [g_candidate])})), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", fake_translate({g_candidate: "ga4gh:VA.genomic"})), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 1, "failed": 0, "skipped": 0, "alleles_created": 1} + assert _cross_level_statuses(session, sample_score_set.id, status="skipped") == [] + + def _run_mapped_date(self, session, target_gene_id): + """The mapped_date the (mock) mapping run stamped on its TargetGeneMappings.""" + return ( + session.query(TargetGeneMapping) + .filter( + TargetGeneMapping.target_gene_id == target_gene_id, + TargetGeneMapping.alignment_level == AnnotationLayer.genomic, + ) + .first() + .mapped_date + ) + + async def test_uses_latest_cdna_row_within_the_run( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """When a run has more than one cdna TargetGeneMapping for a target (same run date), + RT reverse-translates against the newest (highest id), not an arbitrary one.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NC_000001.11:g.1000A>G", + data={}, + ) + session.add(variant) + session.commit() + # Mapping emits a cdna TargetGeneMapping (NM_999999.1) stamped with the run date. + await _map_variants( + session, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_score_set, + with_layers={"g", "c"}, + ) + target_gene = sample_score_set.target_genes[0] + run_date = self._run_mapped_date(session, target_gene.id) + # A newer cdna row (higher id) for the same target and same run date. + session.add( + TargetGeneMapping( + target_gene_id=target_gene.id, + alignment_level=AnnotationLayer.cdna, + reference_accession="NM_111111.1", + preferred=False, + tool_version="pytest.0.0", + mapped_date=run_date, + ) + ) + session.commit() + + assay_hgvs = "NC_000001.11:g.1000A>G" + g_candidate = "NC_000001.11:g.1000A>G" + delegate = fake_construct({assay_hgvs: ([], [g_candidate])}) + captured: dict = {} + + def capturing_construct(inputs, *, transcripts, coordinates, config): + captured["transcripts"] = [inp.transcript for inp in inputs] + return delegate(inputs, transcripts=transcripts, coordinates=coordinates, config=config) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", capturing_construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", fake_translate({g_candidate: "ga4gh:VA.g"})), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + # The newest cdna row within the run wins, not the first-mapped one. + assert captured["transcripts"] == ["NM_111111.1"] + + async def test_ignores_stale_cdna_row_from_a_different_run( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A cdna row left behind by a *different* run (different run date) is not used: + the current run emitted no cdna row, so the variant is skipped (transcript + unresolved) rather than reverse-translated against the stale transcript -- the + mapped_date anchor narrows the accumulation edge case (mavedb-api#763).""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NC_000001.11:g.1000A>G", + data={}, + ) + session.add(variant) + session.commit() + # Current run maps genomic only -- no cdna row for this run's date. + await _map_variants( + session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set, with_layers={"g"} + ) + target_gene = sample_score_set.target_genes[0] + run_date = self._run_mapped_date(session, target_gene.id) + # A leftover cdna row from a *prior* run (an earlier date) must not be picked up. + session.add( + TargetGeneMapping( + target_gene_id=target_gene.id, + alignment_level=AnnotationLayer.cdna, + reference_accession="NM_888888.1", + preferred=False, + tool_version="pytest.0.0", + mapped_date=run_date - timedelta(days=1), + ) + ) + session.commit() + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", fake_construct({})), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", fake_translate({})), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.data == {"translated": 0, "failed": 0, "skipped": 1, "alleles_created": 0} + skipped = _cross_level_statuses(session, sample_score_set.id, status="skipped") + assert len(skipped) == 1 + assert skipped[0].annotation_metadata["skip_category"] == "transcript_unresolved" + + async def test_coding_target_skip_is_classified_recoverable( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A protein-coding target with no resolvable coding transcript is a *recoverable* + skip -- classified ``transcript_unresolved`` so it is findable, distinct from a + regulatory target's correct skip.""" + assert sample_score_set.target_genes[0].category == TargetCategory.protein_coding + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NC_000001.11:g.1000A>G", + data={}, + ) + session.add(variant) + session.commit() + # Genomic-only mapping: no cdna TargetGeneMapping, so no coding transcript resolves. + await _map_variants( + session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set, with_layers={"g"} + ) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", fake_construct({})), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", fake_translate({})), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.data == {"translated": 0, "failed": 0, "skipped": 1, "alleles_created": 0} + skipped = _cross_level_statuses(session, sample_score_set.id, status="skipped") + assert len(skipped) == 1 + assert skipped[0].annotation_metadata["skip_category"] == "transcript_unresolved" + + async def test_regulatory_target_skip_is_classified_correct( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A non-coding/regulatory target has no protein consequence: its skip is *correct*, + classified ``no_coding_transcript`` rather than the recoverable category.""" + target = sample_score_set.target_genes[0] + target.category = TargetCategory.regulatory + session.add(target) + session.commit() + + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NC_000001.11:g.1000A>G", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants( + session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set, with_layers={"g"} + ) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", fake_construct({})), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", fake_translate({})), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.data == {"translated": 0, "failed": 0, "skipped": 1, "alleles_created": 0} + skipped = _cross_level_statuses(session, sample_score_set.id, status="skipped") + assert len(skipped) == 1 + assert skipped[0].annotation_metadata["skip_category"] == "no_coding_transcript" + + async def test_translation_config_param_overrides_job_defaults( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A translation_config job param overrides the job's TranslationConfig defaults, and the + resulting config is passed through to construct_equivalent_variants.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + g_candidate = "NC_000001.11:g.1000A>G" + delegate = fake_construct({assay_hgvs: ([], [g_candidate])}) + translate = fake_translate({g_candidate: "ga4gh:VA.genomic"}) + + captured: dict = {} + + def capturing_construct(inputs, *, transcripts, coordinates, config): + captured["config"] = config + return delegate(inputs, transcripts=transcripts, coordinates=coordinates, config=config) + + run = sample_independent_reverse_translation_run + run.job_params = {**run.job_params, "translation_config": {"wt_codon_mode": "unambiguous", "max_indel_size": 7}} + session.add(run) + session.commit() + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", capturing_construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, run) + + assert result.status == JobStatus.SUCCEEDED + config = captured["config"] + assert config.wt_codon_mode == WtCodonMode.UNAMBIGUOUS + assert config.max_indel_size == 7 + # Defaults the param did not override are preserved. + assert config.include_indels is True + + async def test_translation_config_defaults_when_param_absent( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """With no translation_config param the job falls back to its sensible defaults + (full codon equivalence class, indels included).""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + await _map_variants(session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set) + + assay_hgvs = "NM_000000.1:c.1A>G" + g_candidate = "NC_000001.11:g.1000A>G" + delegate = fake_construct({assay_hgvs: ([], [g_candidate])}) + translate = fake_translate({g_candidate: "ga4gh:VA.genomic"}) + + captured: dict = {} + + def capturing_construct(inputs, *, transcripts, coordinates, config): + captured["config"] = config + return delegate(inputs, transcripts=transcripts, coordinates=coordinates, config=config) + + with ( + patch(f"{RT_MODULE}.construct_equivalent_variants", capturing_construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + config = captured["config"] + assert config.include_indels is True + assert config.wt_codon_mode == WtCodonMode.ALL + + async def test_protein_level_transcript_resolved_via_uta( + self, + session, + with_independent_processing_runs, + with_reverse_translation_run, + mock_worker_ctx, + sample_independent_variant_mapping_run, + sample_independent_reverse_translation_run, + sample_score_set, + ): + """A protein-only mapping has no cdna alignment transcript, so its coding transcript + is resolved NP_→NM_ via UTA and supplied as the VariantInput hint.""" + variant = Variant( + score_set_id=sample_score_set.id, + urn="variant:1", + hgvs_pro="NP_000000.1:p.Met1Val", + data={}, + ) + session.add(variant) + session.commit() + # Protein-only mapping: no cdna TargetGeneMapping, so the transcript must come from UTA. + await _map_variants( + session, mock_worker_ctx, sample_independent_variant_mapping_run, sample_score_set, with_layers={"p"} + ) + + assay_hgvs = "NP_000000.1:p.Met1Val" + g_candidate = "NC_000001.11:g.1000A>G" + translate = fake_translate({g_candidate: "ga4gh:VA.genomic"}) + + # Capture the transcript hint each VariantInput was built with, then delegate to the + # normal stub for the translation result. + captured_hints: dict[str, str | None] = {} + delegate = fake_construct({assay_hgvs: ([], [g_candidate])}) + + def capturing_construct(inputs, *, transcripts, coordinates, config): + captured_hints.update({inp.hgvs: inp.transcript for inp in inputs}) + return delegate(inputs, transcripts=transcripts, coordinates=coordinates, config=config) + + with ( + patch(f"{RT_MODULE}._coding_transcripts_for_proteins", lambda accessions: {"NP_000000.1": "NM_000111.1"}), + patch(f"{RT_MODULE}.construct_equivalent_variants", capturing_construct), + patch(f"{RT_MODULE}.translate_hgvs_to_variation", translate), + ): + result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) + + assert result.status == JobStatus.SUCCEEDED + assert result.data == {"translated": 1, "failed": 0, "skipped": 0, "alleles_created": 1} + # The NP_ accession was resolved to its NM_ transcript and passed as the hint. + assert captured_hints == {assay_hgvs: "NM_000111.1"}