Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
a2d0509
feat(data-providers): add seqrepo-backed vrs data proxy
bencap Jun 4, 2026
cf8d520
fix(vrs): centralize allele id recomputation helpers
bencap Jun 4, 2026
a3f7c32
feat(db): add ValidTime mixin for valid-time row versioning
bencap Jun 7, 2026
5ae3cfd
feat(hgvs): add HGVS accession and cis-phased expression helpers
bencap Jun 7, 2026
08158a6
feat(vrs): translate cis-phased multivariant HGVS into VRS blocks
bencap Jun 7, 2026
a49514d
feat(variants): combine cis-phased members into one HGVS expression
bencap Jun 7, 2026
19f6e80
feat(db): add VRS allele closure tables for reverse translation
bencap Jun 7, 2026
e7c7a7c
chore(types): add mypy stubs for ga4gh and hgvs translation APIs
bencap Jun 7, 2026
ee4018b
build(deps): add variant-annotation as editable sibling dependency
bencap Jun 7, 2026
5402000
feat(worker): write mapping job output to the allele closure tables
bencap Jun 7, 2026
0b1e19f
fixup into cisphased hgvs commit
bencap Jun 7, 2026
002e0e9
feat(worker): provide a SeqRepo data proxy in the worker context
bencap Jun 7, 2026
ca2a08c
fixup into mapper changes
bencap Jun 7, 2026
413113e
feat(worker): add reverse translation job for cross-level HGVS alleles
bencap Jun 7, 2026
d003b7f
fix(reverse-translation): anchor cdna transcript lookup to mapping ru…
bencap Jun 8, 2026
ad61d15
feat(hgvs): add strip_protein_prediction_parens utility
bencap Jun 9, 2026
c914639
fix(vrs-utils): normalize and re-identify VRS alleles to prevent dige…
bencap Jun 9, 2026
5619034
refactor(reverse-translation): add p. alleles to equivalence set and …
bencap Jun 9, 2026
9bfb319
feat(mapping): use typed MappingOutcome to distinguish benign skips f…
bencap Jun 9, 2026
ba92b80
refactor(reverse-translation): replace NullTranscriptSource with live…
bencap Jun 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ RUN curl -sSL https://install.python-poetry.org | python3 -
COPY poetry.lock pyproject.toml ./

# installs runtime dependencies to $VIRTUAL_ENV
RUN poetry install --no-root --extras server
RUN poetry install --no-root --extras server --no-directory
COPY alembic /code/alembic
COPY alembic.ini /code/alembic.ini
COPY src /code/src
Expand Down
23 changes: 23 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Directory holding the virtualenv's executables; the test, lint and format targets
# invoke their tools from here directly.
VENV := .venv/bin

# Running bare `make` prints the help listing.
.DEFAULT_GOAL := help

# Print each target that carries a trailing "## description" comment, followed by
# that description.
.PHONY: help
help:
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-12s %s\n", $$1, $$2}'

.PHONY: dev
dev: ## Install deps including local editable variant-annotation
poetry install --extras server

.PHONY: test
test: ## Run the test suite
$(VENV)/pytest tests/

.PHONY: lint
lint: ## Check code with ruff
$(VENV)/ruff check src/ tests/

.PHONY: format
format: ## Format code with ruff
$(VENV)/ruff format src/ tests/
151 changes: 151 additions & 0 deletions alembic/versions/a1b2c3d4e5f6_add_vrs_allele_closure_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""Add mapping_records, alleles, and mapping_record_alleles tables

Revision ID: a1b2c3d4e5f6
Revises: 398067c53257
Create Date: 2026-05-29

New parallel tables for the Better Reverse Translation epic (#746).
The existing mapped_variants table is left untouched (frozen serving).
"""

import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

from alembic import op

# revision identifiers, used by Alembic.
revision = "a1b2c3d4e5f6"
down_revision = "398067c53257"
branch_labels = None
depends_on = None

# Pre-parenthesised SQL value lists, written so they can be interpolated straight after
# "<column> IN" in a CHECK constraint expression.
VALID_ASSAY_LEVELS = "('genomic', 'cdna', 'protein')"
# NOTE(review): not referenced by upgrade() or downgrade() in this migration — the
# alignment_level column is created without a CHECK constraint. Confirm whether one was intended.
VALID_ALIGNMENT_LEVELS = "('protein', 'cdna', 'genomic')"


def _audit_date_columns() -> list:
    """Build a fresh ``created_at``/``updated_at`` column pair defaulting to CURRENT_DATE.

    New Column objects are constructed on every call so each table gets its own pair.
    """
    created_at = sa.Column("created_at", sa.Date(), nullable=False, server_default=sa.text("CURRENT_DATE"))
    updated_at = sa.Column(
        "updated_at",
        sa.Date(),
        nullable=False,
        server_default=sa.text("CURRENT_DATE"),
        onupdate=sa.text("CURRENT_DATE"),
    )
    return [created_at, updated_at]


def _create_alleles_table() -> None:
    """Create ``alleles`` (unique on ``vrs_digest``) and its three lookup indexes."""
    op.create_table(
        "alleles",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("vrs_digest", sa.String(), nullable=False),
        sa.Column("level", sa.String(length=16), nullable=False),
        sa.Column("transcript", sa.String(), nullable=False),
        sa.Column("hgvs_g", sa.String(), nullable=True),
        sa.Column("hgvs_c", sa.String(), nullable=True),
        sa.Column("hgvs_p", sa.String(), nullable=True),
        sa.Column("clingen_allele_id", sa.String(), nullable=True),
        sa.Column("post_mapped", postgresql.JSONB(), nullable=True),
        *_audit_date_columns(),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("vrs_digest", name="uq_alleles_vrs_digest"),
    )
    for index_name, indexed_column in (
        ("ix_alleles_vrs_digest", "vrs_digest"),
        ("ix_alleles_level", "level"),
        ("ix_alleles_clingen_allele_id", "clingen_allele_id"),
    ):
        op.create_index(index_name, "alleles", [indexed_column])


def _create_mapping_records_table() -> None:
    """Create ``mapping_records`` with its foreign keys, assay-level CHECK and indexes."""
    op.create_table(
        "mapping_records",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("variant_id", sa.Integer(), nullable=False),
        sa.Column("vrs_digest", sa.String(), nullable=True),
        sa.Column("pre_mapped", postgresql.JSONB(), nullable=True),
        sa.Column("assay_level", sa.String(length=16), nullable=False),
        sa.Column("hgvs_assay_level", sa.String(), nullable=True),
        sa.Column("mapping_api_version", sa.String(), nullable=False),
        sa.Column("mapped_date", sa.Date(), nullable=False),
        sa.Column("vrs_version", sa.String(), nullable=True),
        sa.Column("current", sa.Boolean(), nullable=False),
        sa.Column("alignment_level", sa.String(length=16), nullable=True),
        sa.Column("at_mismatched_locus", sa.Boolean(), nullable=True),
        sa.Column("near_gap", sa.Boolean(), nullable=True),
        sa.Column("target_gene_mapping_id", sa.Integer(), nullable=True),
        *_audit_date_columns(),
        sa.ForeignKeyConstraint(
            ["variant_id"],
            ["variants.id"],
            name="fk_mapping_records_variant_id",
        ),
        sa.ForeignKeyConstraint(
            ["target_gene_mapping_id"],
            ["target_gene_mappings.id"],
            name="fk_mapping_records_target_gene_mapping_id",
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.CheckConstraint(
            f"assay_level IN {VALID_ASSAY_LEVELS}",
            name="ck_mapping_records_assay_level_valid",
        ),
    )
    for index_name, indexed_column in (
        ("ix_mapping_records_variant_id", "variant_id"),
        ("ix_mapping_records_vrs_digest", "vrs_digest"),
        ("ix_mapping_records_target_gene_mapping_id", "target_gene_mapping_id"),
    ):
        op.create_index(index_name, "mapping_records", [indexed_column])


def _create_mapping_record_alleles_table() -> None:
    """Create the ``mapping_record_alleles`` link table and index both of its foreign keys."""
    op.create_table(
        "mapping_record_alleles",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("mapping_record_id", sa.Integer(), nullable=False),
        sa.Column("allele_id", sa.Integer(), nullable=False),
        sa.Column(
            "is_authoritative",
            sa.Boolean(),
            nullable=False,
            server_default=sa.text("false"),
        ),
        # Links are removed together with their mapping record ...
        sa.ForeignKeyConstraint(
            ["mapping_record_id"],
            ["mapping_records.id"],
            name="fk_mapping_record_alleles_mapping_record_id",
            ondelete="CASCADE",
        ),
        # ... but an allele cannot be deleted while a link still references it.
        sa.ForeignKeyConstraint(
            ["allele_id"],
            ["alleles.id"],
            name="fk_mapping_record_alleles_allele_id",
            ondelete="RESTRICT",
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    for index_name, indexed_column in (
        ("ix_mapping_record_alleles_mapping_record_id", "mapping_record_id"),
        ("ix_mapping_record_alleles_allele_id", "allele_id"),
    ):
        op.create_index(index_name, "mapping_record_alleles", [indexed_column])


def upgrade() -> None:
    """Create the ``alleles``, ``mapping_records`` and ``mapping_record_alleles`` tables.

    The link table is created last because its foreign keys reference the other two tables.
    """
    _create_alleles_table()
    _create_mapping_records_table()
    _create_mapping_record_alleles_table()


def downgrade() -> None:
    """Drop the three tables created by ``upgrade`` together with their indexes.

    Tables are removed in reverse creation order (link table first), and each table's
    indexes are dropped before the table itself.
    """
    teardown_plan = (
        (
            "mapping_record_alleles",
            (
                "ix_mapping_record_alleles_allele_id",
                "ix_mapping_record_alleles_mapping_record_id",
            ),
        ),
        (
            "mapping_records",
            (
                "ix_mapping_records_target_gene_mapping_id",
                "ix_mapping_records_vrs_digest",
                "ix_mapping_records_variant_id",
            ),
        ),
        (
            "alleles",
            (
                "ix_alleles_clingen_allele_id",
                "ix_alleles_level",
                "ix_alleles_vrs_digest",
            ),
        ),
    )
    for table, index_names in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table)
        op.drop_table(table)
32 changes: 32 additions & 0 deletions alembic/versions/b8e1f0a2c4d7_drop_alleles_transcript_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""drop alleles.transcript column

Revision ID: b8e1f0a2c4d7
Revises: f4d2a9c1b7e3
Create Date: 2026-06-05

The `transcript` column duplicated data already present in the HGVS columns — it was
always extract_accession(hgvs_g/hgvs_c/hgvs_p). It is now a derived hybrid_property on
the Allele model (split_part(coalesce(hgvs_g, hgvs_c, hgvs_p), ':', 1)), so the stored
column is removed to keep a single source of truth and avoid drift.
"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "b8e1f0a2c4d7"
down_revision = "f4d2a9c1b7e3"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Remove the stored ``transcript`` column from the ``alleles`` table."""
    table, column = "alleles", "transcript"
    op.drop_column(table, column)


def downgrade() -> None:
    """Restore ``alleles.transcript`` as a NOT NULL column backfilled from the HGVS columns.

    The column is re-added as nullable, filled with the accession part (text before the
    first ``:``) of the first non-NULL value among ``hgvs_g``, ``hgvs_c`` and ``hgvs_p``,
    and only then tightened to NOT NULL.

    All three HGVS columns are nullable, so a row may have nothing to derive from. Such a
    row gets an empty string instead of NULL; otherwise the final NOT NULL change would
    fail and abort the whole downgrade.
    """
    op.add_column("alleles", sa.Column("transcript", sa.String(), nullable=True))
    # split_part() yields NULL when its input is NULL, hence the outer coalesce fallback.
    op.execute(
        "UPDATE alleles SET transcript = "
        "coalesce(split_part(coalesce(hgvs_g, hgvs_c, hgvs_p), ':', 1), '')"
    )
    op.alter_column("alleles", "transcript", nullable=False)
48 changes: 48 additions & 0 deletions alembic/versions/c3d5e7f9a1b2_temporal_mapping_record_alleles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""add valid-time versioning to mapping_record_alleles

Revision ID: c3d5e7f9a1b2
Revises: b8e1f0a2c4d7
Create Date: 2026-06-05

Make the link table valid-time versioned (TemporalLink): a link is live while valid_to is
NULL, and superseding it closes valid_to instead of deleting, so reverse translation can be
re-run independently while prior derivations remain queryable point-in-time. The partial
unique index enforces a single live link per (mapping_record, allele).

Assumes no pre-existing duplicate live links — true for these parallel tables, which are
new-only writes and not yet serving. If this ever runs against data with duplicates, the
unique index creation will fail and the duplicates must be retired first.
"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "c3d5e7f9a1b2"
down_revision = "b8e1f0a2c4d7"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add valid-time columns to ``mapping_record_alleles`` and enforce one live link per pair.

    ``valid_from`` is NOT NULL with a ``now()`` server default; ``valid_to`` is nullable.
    The partial unique index covers only rows where ``valid_to IS NULL``.
    """
    link_table = "mapping_record_alleles"

    validity_columns = (
        sa.Column("valid_from", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()),
        sa.Column("valid_to", sa.DateTime(timezone=True), nullable=True),
    )
    for column in validity_columns:
        op.add_column(link_table, column)

    live_rows_only = sa.text("valid_to IS NULL")
    op.create_index(
        "uq_mapping_record_alleles_live",
        link_table,
        ["mapping_record_id", "allele_id"],
        unique=True,
        postgresql_where=live_rows_only,
    )


def downgrade() -> None:
    """Drop the live-link unique index, then the ``valid_to`` and ``valid_from`` columns.

    Dropping ``valid_to`` discards the information about which links had been retired.
    """
    link_table = "mapping_record_alleles"
    op.drop_index("uq_mapping_record_alleles_live", table_name=link_table)
    for column_name in ("valid_to", "valid_from"):
        op.drop_column(link_table, column_name)
76 changes: 76 additions & 0 deletions alembic/versions/d4e6f8a0b2c3_temporal_mapping_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""move mapping_records onto valid-time versioning

Revision ID: d4e6f8a0b2c3
Revises: c3d5e7f9a1b2
Create Date: 2026-06-05

Replace the stored `current` flag and the `created_at`/`updated_at` audit dates with valid-time
columns (ValidTime mixin): a mapping record is live while valid_to is NULL, and a re-map retires
the prior version (closing valid_to) instead of flipping a boolean. `current` becomes derived
(valid_to IS NULL). `mapped_date` (the date the mapping was performed) is domain data and stays.

The partial unique index promotes to the database the "one live mapping record per variant"
invariant the mapping job previously enforced only in app code.

Backfills from the columns being dropped, so existing rows keep their validity. Assumes no
duplicate live records per variant (true for these pre-cutover parallel tables; otherwise the
unique index creation fails and the duplicates must be retired first).
"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "d4e6f8a0b2c3"
down_revision = "c3d5e7f9a1b2"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Replace ``current``/``created_at``/``updated_at`` on ``mapping_records`` with valid-time columns.

    Steps, in order: add the new columns (``valid_from`` nullable at first), backfill them
    from the columns being replaced, make ``valid_from`` NOT NULL, drop the replaced
    columns, and finally create a partial unique index allowing one row per ``variant_id``
    where ``valid_to IS NULL``.
    """
    table = "mapping_records"

    op.add_column(
        table,
        sa.Column("valid_from", sa.DateTime(timezone=True), nullable=True, server_default=sa.func.now()),
    )
    op.add_column(table, sa.Column("valid_to", sa.DateTime(timezone=True), nullable=True))

    # Validity is reconstructed from the old columns: every record starts at created_at, and
    # a record that is no longer current is closed at updated_at.
    backfill_statements = (
        "UPDATE mapping_records SET valid_from = created_at::timestamptz",
        "UPDATE mapping_records SET valid_to = updated_at::timestamptz WHERE current = false",
    )
    for statement in backfill_statements:
        op.execute(statement)

    # Every row has a valid_from after the backfill, so the column can be tightened.
    op.alter_column(table, "valid_from", nullable=False)

    for replaced_column in ("current", "created_at", "updated_at"):
        op.drop_column(table, replaced_column)

    op.create_index(
        "uq_mapping_records_current",
        table,
        ["variant_id"],
        unique=True,
        postgresql_where=sa.text("valid_to IS NULL"),
    )


def downgrade() -> None:
    """Bring back ``current``/``created_at``/``updated_at`` and remove the valid-time columns.

    The restored columns are added as nullable, populated from ``valid_from``/``valid_to``,
    and then made NOT NULL before the valid-time columns are dropped.
    """
    table = "mapping_records"

    op.drop_index("uq_mapping_records_current", table_name=table)

    restored_columns = (
        ("current", sa.Boolean()),
        ("created_at", sa.Date()),
        ("updated_at", sa.Date()),
    )
    for column_name, column_type in restored_columns:
        op.add_column(table, sa.Column(column_name, column_type, nullable=True))

    # A live row (valid_to IS NULL) is current; updated_at falls back to valid_from for it.
    restore_statements = (
        "UPDATE mapping_records SET current = (valid_to IS NULL)",
        "UPDATE mapping_records SET created_at = valid_from::date",
        "UPDATE mapping_records SET updated_at = coalesce(valid_to, valid_from)::date",
    )
    for statement in restore_statements:
        op.execute(statement)

    for column_name, _ in restored_columns:
        op.alter_column(table, column_name, nullable=False)

    for column_name in ("valid_to", "valid_from"):
        op.drop_column(table, column_name)
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""add cross_level_translation annotation type

Revision ID: f4d2a9c1b7e3
Revises: a1b2c3d4e5f6
Create Date: 2026-06-02

Extends ck_variant_annotation_type_valid to allow the 'cross_level_translation'
annotation type. The VRS mapping worker writes one such row per variant to record
whether cross-level translation (filling the levels the assay did not map)
succeeded, was skipped (multivariant / no transcript), or failed.
"""

from alembic import op

# revision identifiers, used by Alembic.
revision = "f4d2a9c1b7e3"
down_revision = "a1b2c3d4e5f6"
branch_labels = None
depends_on = None

# Comma-separated, single-quoted SQL literals for the annotation_type CHECK constraint.
# _TYPES_OLD is the set allowed before this revision.
_TYPES_OLD = ", ".join(
    [
        "'vrs_mapping'",
        "'clingen_allele_id'",
        "'mapped_hgvs'",
        "'variant_translation'",
        "'gnomad_allele_frequency'",
        "'clinvar_control'",
        "'vep_functional_consequence'",
        "'ldh_submission'",
    ]
)
# _TYPES_NEW is the same set with 'cross_level_translation' inserted after 'vrs_mapping'.
_TYPES_NEW = ", ".join(
    [
        "'vrs_mapping'",
        "'cross_level_translation'",
        "'clingen_allele_id'",
        "'mapped_hgvs'",
        "'variant_translation'",
        "'gnomad_allele_frequency'",
        "'clinvar_control'",
        "'vep_functional_consequence'",
        "'ldh_submission'",
    ]
)


def upgrade() -> None:
    """Recreate ``ck_variant_annotation_type_valid`` with the ``_TYPES_NEW`` value list."""
    constraint = "ck_variant_annotation_type_valid"
    table = "variant_annotation_status"

    # A CHECK constraint's expression is replaced here by dropping and recreating it.
    op.drop_constraint(constraint, table, type_="check")
    allowed_types_condition = f"annotation_type IN ({_TYPES_NEW})"
    op.create_check_constraint(constraint, table, allowed_types_condition)


def downgrade() -> None:
    """Recreate ``ck_variant_annotation_type_valid`` with the ``_TYPES_OLD`` value list.

    Rows of type ``cross_level_translation`` are deleted first; they would violate the
    narrower constraint.
    """
    constraint = "ck_variant_annotation_type_valid"
    table = "variant_annotation_status"

    op.execute("DELETE FROM variant_annotation_status WHERE annotation_type = 'cross_level_translation'")
    op.drop_constraint(constraint, table, type_="check")
    allowed_types_condition = f"annotation_type IN ({_TYPES_OLD})"
    op.create_check_constraint(constraint, table, allowed_types_condition)
Loading
Loading