Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions alembic/versions/1e08b947679d_add_functional_consequence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Add functional consequence

Revision ID: 1e08b947679d
Revises: 019eb75ad9ae
Create Date: 2025-09-17 11:15:52.091271

"""

from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "1e08b947679d"
down_revision = "019eb75ad9ae"
branch_labels = None
depends_on = None


def upgrade():
    """Add the VEP annotation columns to the ``mapped_variants`` table.

    Two nullable columns are created, in this order:
    ``vep_functional_consequence`` (String) and ``vep_access_date`` (Date).
    """
    # (column name, SQLAlchemy type) pairs, listed in creation order.
    vep_columns = (
        ("vep_functional_consequence", sa.String),
        ("vep_access_date", sa.Date),
    )
    for column_name, column_type in vep_columns:
        op.add_column("mapped_variants", sa.Column(column_name, column_type(), nullable=True))


def downgrade():
    """Remove the VEP annotation columns from the ``mapped_variants`` table.

    ``vep_access_date`` is dropped first, then ``vep_functional_consequence``.
    """
    for column_name in ("vep_access_date", "vep_functional_consequence"):
        op.drop_column("mapped_variants", column_name)
34 changes: 34 additions & 0 deletions alembic/versions/b22b450d409c_add_mapped_hgvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Add mapped hgvs

Revision ID: b22b450d409c
Revises: 1e08b947679d
Create Date: 2025-10-09 09:53:47.903249

"""

from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "b22b450d409c"
down_revision = "1e08b947679d"
branch_labels = None
depends_on = None


def upgrade():
    """Add the mapped HGVS columns to the ``mapped_variants`` table.

    Four nullable String columns are created, in this order:
    ``hgvs_assay_level``, ``hgvs_g``, ``hgvs_c`` and ``hgvs_p``.
    """
    hgvs_column_names = ("hgvs_assay_level", "hgvs_g", "hgvs_c", "hgvs_p")
    for column_name in hgvs_column_names:
        op.add_column("mapped_variants", sa.Column(column_name, sa.String(), nullable=True))


def downgrade():
    """Remove the mapped HGVS columns from the ``mapped_variants`` table.

    Columns are dropped in this order: ``hgvs_p``, ``hgvs_c``, ``hgvs_g``,
    ``hgvs_assay_level``.
    """
    hgvs_column_names = ("hgvs_p", "hgvs_c", "hgvs_g", "hgvs_assay_level")
    for column_name in hgvs_column_names:
        op.drop_column("mapped_variants", column_name)
2 changes: 1 addition & 1 deletion src/mavedb/lib/clingen/allele_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

CLINGEN_API_URL = "https://reg.test.genome.network/allele"
CLINGEN_API_URL = "https://reg.genome.network/allele"


def get_canonical_pa_ids(clingen_allele_id: str) -> list[str]:
Expand Down
161 changes: 141 additions & 20 deletions src/mavedb/lib/score_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from mavedb.lib.mave.utils import is_csv_null
from mavedb.lib.validation.constants.general import null_values_list
from mavedb.lib.validation.utilities import is_null as validate_is_null
from mavedb.lib.variants import get_digest_from_post_mapped, get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p
from mavedb.lib.variants import get_digest_from_post_mapped
from mavedb.models.contributor import Contributor
from mavedb.models.controlled_keyword import ControlledKeyword
from mavedb.models.doi_identifier import DoiIdentifier
Expand All @@ -35,6 +35,7 @@
from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation
from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation
from mavedb.models.experiment_set import ExperimentSet
from mavedb.models.gnomad_variant import GnomADVariant
from mavedb.models.mapped_variant import MappedVariant
from mavedb.models.publication_identifier import PublicationIdentifier
from mavedb.models.refseq_identifier import RefseqIdentifier
Expand Down Expand Up @@ -501,7 +502,7 @@ def find_publish_or_private_superseded_score_set_tail(
def get_score_set_variants_as_csv(
db: Session,
score_set: ScoreSet,
namespaces: List[Literal["scores", "counts"]],
namespaces: List[Literal["scores", "counts", "vep", "gnomad"]],
namespaced: Optional[bool] = None,
start: Optional[int] = None,
limit: Optional[int] = None,
Expand All @@ -518,8 +519,8 @@ def get_score_set_variants_as_csv(
The database session to use.
score_set : ScoreSet
The score set to get the variants from.
namespaces : List[Literal["scores", "counts"]]
The namespaces for data. Now there are only scores and counts. There will be ClinVar and gnomAD.
namespaces : List[Literal["scores", "counts", "vep", "gnomad"]]
The namespaces for data. Now there are only scores, counts, VEP, and gnomAD. ClinVar will be added in the future.
namespaced: Optional[bool] = None
Whether to namespace the columns or not.
start : int, optional
Expand All @@ -531,8 +532,8 @@ def get_score_set_variants_as_csv(
include_custom_columns : bool, optional
Whether to include custom columns defined in the score set. Defaults to True.
include_post_mapped_hgvs : bool, optional
Whether to include post-mapped HGVS notations in the output. Defaults to False. If True, the output will include
columns for both post-mapped HGVS genomic (g.) and protein (p.) notations.
Whether to include post-mapped HGVS notations and VEP functional consequence in the output. Defaults to False. If True, the output will include
columns for post-mapped HGVS genomic (g.) and protein (p.) notations, and VEP functional consequence.

Returns
-------
Expand All @@ -547,9 +548,12 @@ def get_score_set_variants_as_csv(
if include_post_mapped_hgvs:
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_g")
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_p")
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_c")
namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_at_assay_level")
namespaced_score_set_columns["mavedb"].append("post_mapped_vrs_digest")
for namespace in namespaces:
namespaced_score_set_columns[namespace] = []

if include_custom_columns:
if "scores" in namespaced_score_set_columns:
namespaced_score_set_columns["scores"] = [
Expand All @@ -561,10 +565,51 @@ def get_score_set_variants_as_csv(
]
elif "scores" in namespaced_score_set_columns:
namespaced_score_set_columns["scores"].append(REQUIRED_SCORE_COLUMN)
if "vep" in namespaced_score_set_columns:
namespaced_score_set_columns["vep"].append("vep_functional_consequence")
if "gnomad" in namespaced_score_set_columns:
namespaced_score_set_columns["gnomad"].append("gnomad_af")
variants: Sequence[Variant] = []
mappings: Optional[list[Optional[MappedVariant]]] = None
gnomad_data: Optional[list[Optional[GnomADVariant]]] = None

if include_post_mapped_hgvs:
if "gnomad" in namespaces and include_post_mapped_hgvs:
variants_mappings_and_gnomad_query = (
select(Variant, MappedVariant, GnomADVariant)
.join(
MappedVariant,
and_(Variant.id == MappedVariant.variant_id, MappedVariant.current.is_(True)),
isouter=True,
)
.join(MappedVariant.gnomad_variants.of_type(GnomADVariant), isouter=True)
.where(
and_(
Variant.score_set_id == score_set.id,
or_(
and_(
GnomADVariant.db_name == "gnomAD",
GnomADVariant.db_version == "v4.1",
),
GnomADVariant.id.is_(None),
),
)
)
.order_by(cast(func.split_part(Variant.urn, "#", 2), Integer))
)
if start:
variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query.offset(start)
if limit:
variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query.limit(limit)
variants_mappings_and_gnomad = db.execute(variants_mappings_and_gnomad_query).all()

variants = []
mappings = []
gnomad_data = []
for variant, mapping, gnomad in variants_mappings_and_gnomad:
variants.append(variant)
mappings.append(mapping)
gnomad_data.append(gnomad)
elif include_post_mapped_hgvs:
variants_and_mappings_query = (
select(Variant, MappedVariant)
.join(
Expand All @@ -586,6 +631,40 @@ def get_score_set_variants_as_csv(
for variant, mapping in variants_and_mappings:
variants.append(variant)
mappings.append(mapping)
elif "gnomad" in namespaces:
variants_and_gnomad_query = (
select(Variant, GnomADVariant)
.join(
MappedVariant,
and_(Variant.id == MappedVariant.variant_id, MappedVariant.current.is_(True)),
isouter=True,
)
.join(MappedVariant.gnomad_variants.of_type(GnomADVariant), isouter=True)
.where(
and_(
Variant.score_set_id == score_set.id,
or_(
and_(
GnomADVariant.db_name == "gnomAD",
GnomADVariant.db_version == "v4.1",
),
GnomADVariant.id.is_(None),
),
)
)
.order_by(cast(func.split_part(Variant.urn, "#", 2), Integer))
)
if start:
variants_and_gnomad_query = variants_and_gnomad_query.offset(start)
if limit:
variants_and_gnomad_query = variants_and_gnomad_query.limit(limit)
variants_and_gnomad = db.execute(variants_and_gnomad_query).all()

variants = []
gnomad_data = []
for variant, gnomad in variants_and_gnomad:
variants.append(variant)
gnomad_data.append(gnomad)
else:
variants_query = (
select(Variant)
Expand All @@ -598,7 +677,11 @@ def get_score_set_variants_as_csv(
variants_query = variants_query.limit(limit)
variants = db.scalars(variants_query).all()
rows_data = variants_to_csv_rows(
variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings
variants,
columns=namespaced_score_set_columns,
namespaced=namespaced,
mappings=mappings,
gnomad_data=gnomad_data,
) # type: ignore
rows_columns = [
(
Expand Down Expand Up @@ -654,6 +737,7 @@ def variant_to_csv_row(
variant: Variant,
columns: dict[str, list[str]],
mapping: Optional[MappedVariant] = None,
gnomad_data: Optional[GnomADVariant] = None,
namespaced: Optional[bool] = None,
na_rep="NA",
) -> dict[str, Any]:
Expand All @@ -668,6 +752,10 @@ def variant_to_csv_row(
Columns to serialize.
namespaced: Optional[bool] = None
Namespace the columns or not.
mapping : variant.models.MappedVariant, optional
Mapped variant corresponding to the variant.
gnomad_data : variant.models.GnomADVariant, optional
gnomAD variant data corresponding to the variant.
na_rep : str
String to represent null values.

Expand All @@ -693,24 +781,29 @@ def variant_to_csv_row(
row[column_key] = value
for column_key in columns.get("mavedb", []):
if column_key == "post_mapped_hgvs_g":
hgvs_str = get_hgvs_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None
if hgvs_str is not None and is_hgvs_g(hgvs_str):
value = hgvs_str
else:
value = ""
value = str(mapping.hgvs_g) if mapping and mapping.hgvs_g else na_rep
elif column_key == "post_mapped_hgvs_p":
hgvs_str = get_hgvs_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None
if hgvs_str is not None and is_hgvs_p(hgvs_str):
value = hgvs_str
else:
value = ""
value = str(mapping.hgvs_p) if mapping and mapping.hgvs_p else na_rep
elif column_key == "post_mapped_hgvs_c":
value = str(mapping.hgvs_c) if mapping and mapping.hgvs_c else na_rep
elif column_key == "post_mapped_hgvs_at_assay_level":
value = str(mapping.hgvs_assay_level) if mapping and mapping.hgvs_assay_level else na_rep
elif column_key == "post_mapped_vrs_digest":
digest = get_digest_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None
value = digest if digest is not None else ""
value = digest if digest is not None else na_rep
if is_null(value):
value = na_rep
key = f"mavedb.{column_key}" if namespaced else column_key
row[key] = value
for column_key in columns.get("vep", []):
if column_key == "vep_functional_consequence":
vep_functional_consequence = mapping.vep_functional_consequence if mapping else None
if vep_functional_consequence is not None:
value = vep_functional_consequence
else:
value = na_rep
key = f"vep.{column_key}" if namespaced else column_key
row[key] = value
for column_key in columns.get("scores", []):
parent = variant.data.get("score_data") if variant.data else None
value = str(parent.get(column_key)) if parent else na_rep
Expand All @@ -721,13 +814,23 @@ def variant_to_csv_row(
value = str(parent.get(column_key)) if parent else na_rep
key = f"counts.{column_key}" if namespaced else column_key
row[key] = value
for column_key in columns.get("gnomad", []):
if column_key == "gnomad_af":
gnomad_af = gnomad_data.allele_frequency if gnomad_data else None
if gnomad_af is not None:
value = str(gnomad_af)
else:
value = na_rep
key = f"gnomad.{column_key}" if namespaced else column_key
row[key] = value
return row


def variants_to_csv_rows(
variants: Sequence[Variant],
columns: dict[str, list[str]],
mappings: Optional[Sequence[Optional[MappedVariant]]] = None,
gnomad_data: Optional[Sequence[Optional[GnomADVariant]]] = None,
namespaced: Optional[bool] = None,
na_rep="NA",
) -> Iterable[dict[str, Any]]:
Expand All @@ -742,18 +845,36 @@ def variants_to_csv_rows(
Columns to serialize.
namespaced: Optional[bool] = None
Namespace the columns or not.
mappings : list[Optional[variant.models.MappedVariant]], optional
List of mapped variants corresponding to the variants.
gnomad_data : list[Optional[variant.models.GnomADVariant]], optional
List of gnomAD variant data corresponding to the variants.
na_rep : str
String to represent null values.

Returns
-------
list[dict[str, Any]]
"""
if mappings is not None:
if mappings is not None and gnomad_data is not None:
return map(
lambda zipped: variant_to_csv_row(
zipped[0], columns, mapping=zipped[1], gnomad_data=zipped[2], namespaced=namespaced, na_rep=na_rep
),
zip(variants, mappings, gnomad_data),
)
elif mappings is not None:
return map(
lambda pair: variant_to_csv_row(pair[0], columns, mapping=pair[1], namespaced=namespaced, na_rep=na_rep),
zip(variants, mappings),
)
elif gnomad_data is not None:
return map(
lambda pair: variant_to_csv_row(
pair[0], columns, gnomad_data=pair[1], namespaced=namespaced, na_rep=na_rep
),
zip(variants, gnomad_data),
)
return map(lambda v: variant_to_csv_row(v, columns, namespaced=namespaced, na_rep=na_rep), variants)


Expand Down
9 changes: 9 additions & 0 deletions src/mavedb/models/mapped_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@ class MappedVariant(Base):

clingen_allele_id = Column(String, index=True, nullable=True)

vep_functional_consequence = Column(String, nullable=True)
vep_access_date = Column(Date, nullable=True)

# mapped hgvs
hgvs_assay_level = Column(String, nullable=True)
hgvs_g = Column(String, nullable=True)
hgvs_c = Column(String, nullable=True)
hgvs_p = Column(String, nullable=True)

clinical_controls: Mapped[list["ClinicalControl"]] = relationship(
"ClinicalControl",
secondary=mapped_variants_clinical_controls_association_table,
Expand Down
Loading