Skip to content
Draft
10 changes: 7 additions & 3 deletions alembic/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,13 @@
DB_DATABASE_NAME = os.getenv("DB_DATABASE_NAME")
DB_USERNAME = os.getenv("DB_USERNAME")
DB_PASSWORD = os.getenv("DB_PASSWORD")
config.set_section_option(
"alembic", "sqlalchemy.url", f"postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_DATABASE_NAME}"
)

if not DB_USERNAME:
DB_URL = f"postgresql:///{DB_DATABASE_NAME}"
else:
DB_URL = f"postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_DATABASE_NAME}"

config.set_section_option("alembic", "sqlalchemy.url", DB_URL)

target_metadata = Base.metadata

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""add scoreset_fulltext materialized view

Revision ID: 0d3732aa62be
Revises: 1d4933b4b6f7
Create Date: 2024-10-15 14:59:16.297975

"""
from alembic import op

from mavedb.models.score_set_fulltext import _scoreset_fulltext_view

# revision identifiers, used by Alembic.
revision = '0d3732aa62be'
down_revision = '1d4933b4b6f7'
branch_labels = None
depends_on = None


def upgrade():
    """Create the scoreset_fulltext materialized view and index its tsvector column with GIN."""
    op.create_entity(_scoreset_fulltext_view)
    index_sql = "create index scoreset_fulltext_idx on scoreset_fulltext using gin (text)"
    op.execute(index_sql)


def downgrade():
    """Drop the GIN index, then the scoreset_fulltext materialized view itself."""
    index_sql = "drop index scoreset_fulltext_idx"
    op.execute(index_sql)
    op.drop_entity(_scoreset_fulltext_view)
55 changes: 52 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ SQLAlchemy = "~2.0.0"

# Optional dependencies for running this application as a server
alembic = { version = "~1.7.6", optional = true }
alembic-utils = { version = "0.8.1", optional = true }
arq = { version = "~0.25.0", optional = true }
authlib = { version = "~1.3.1", optional = true }
boto3 = { version = "~1.34.97", optional = true }
Expand Down Expand Up @@ -83,10 +84,12 @@ requests-mock = "~1.11.0"
ruff = "^0.6.8"
SQLAlchemy = { extras = ["mypy"], version = "~2.0.0" }


[tool.poetry.extras]
server = ["alembic", "arq", "authlib", "biocommons", "boto3", "cdot", "cryptography", "fastapi", "hgvs", "orcid", "psycopg2", "python-jose", "python-multipart", "requests", "starlette", "starlette-context", "slack-sdk", "uvicorn", "watchtower"]
server = ["alembic", "alembic-utils", "arq", "authlib", "biocommons", "boto3", "cdot", "cryptography", "fastapi", "hgvs", "orcid", "psycopg2", "python-jose", "python-multipart", "requests", "starlette", "starlette-context", "slack-sdk", "uvicorn", "watchtower"]

[tool.black]
extend-exclude = "alembic/versions"
line-length = 120

[tool.mypy]
plugins = [
Expand Down
5 changes: 4 additions & 1 deletion src/mavedb/db/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
DB_PASSWORD = os.getenv("DB_PASSWORD")

# DB_URL = "sqlite:///./sql_app.db"
DB_URL = f"postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_DATABASE_NAME}"
if not DB_USERNAME:
DB_URL = f"postgresql:///{DB_DATABASE_NAME}"
else:
DB_URL = f"postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_DATABASE_NAME}"

engine = create_engine(
# For PostgreSQL:
Expand Down
74 changes: 2 additions & 72 deletions src/mavedb/lib/score_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,22 @@
from mavedb.lib.validation.constants.general import null_values_list
from mavedb.models.contributor import Contributor
from mavedb.models.controlled_keyword import ControlledKeyword
from mavedb.models.doi_identifier import DoiIdentifier
from mavedb.models.ensembl_identifier import EnsemblIdentifier
from mavedb.models.ensembl_offset import EnsemblOffset
from mavedb.models.experiment import Experiment
from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation
from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation
from mavedb.models.experiment_set import ExperimentSet
from mavedb.models.publication_identifier import PublicationIdentifier
from mavedb.models.refseq_identifier import RefseqIdentifier
from mavedb.models.refseq_offset import RefseqOffset
from mavedb.models.score_set import ScoreSet
from mavedb.models.score_set_publication_identifier import (
ScoreSetPublicationIdentifierAssociation,
)
from mavedb.models.score_set_fulltext import scoreset_fulltext_filter
from mavedb.models.target_accession import TargetAccession
from mavedb.models.target_gene import TargetGene
from mavedb.models.target_sequence import TargetSequence
from mavedb.models.taxonomy import Taxonomy
from mavedb.models.uniprot_identifier import UniprotIdentifier
from mavedb.models.uniprot_offset import UniprotOffset
from mavedb.models.user import User
from mavedb.models.variant import Variant
Expand Down Expand Up @@ -86,74 +83,7 @@ def search_score_sets(db: Session, owner_or_contributor: Optional[User], search:
query = query.filter(ScoreSet.published_date.is_(None))

if search.text:
lower_search_text = search.text.lower().strip()
query = query.filter(
or_(
ScoreSet.urn.icontains(lower_search_text),
ScoreSet.title.icontains(lower_search_text),
ScoreSet.short_description.icontains(lower_search_text),
ScoreSet.abstract_text.icontains(lower_search_text),
ScoreSet.target_genes.any(func.lower(TargetGene.name).icontains(lower_search_text)),
ScoreSet.target_genes.any(func.lower(TargetGene.category).icontains(lower_search_text)),
ScoreSet.target_genes.any(
TargetGene.target_sequence.has(
TargetSequence.taxonomy.has(func.lower(Taxonomy.organism_name).icontains(lower_search_text))
)
),
ScoreSet.target_genes.any(
TargetGene.target_sequence.has(
TargetSequence.taxonomy.has(func.lower(Taxonomy.common_name).icontains(lower_search_text))
)
),
ScoreSet.target_genes.any(
TargetGene.target_accession.has(func.lower(TargetAccession.assembly).icontains(lower_search_text))
),
# TODO(#94): add LICENSE, plus TAX_ID if numeric
ScoreSet.publication_identifiers.any(
func.lower(PublicationIdentifier.identifier).icontains(lower_search_text)
),
ScoreSet.publication_identifiers.any(
func.lower(PublicationIdentifier.doi).icontains(lower_search_text)
),
ScoreSet.publication_identifiers.any(
func.lower(PublicationIdentifier.abstract).icontains(lower_search_text)
),
ScoreSet.publication_identifiers.any(
func.lower(PublicationIdentifier.title).icontains(lower_search_text)
),
ScoreSet.publication_identifiers.any(
func.lower(PublicationIdentifier.publication_journal).icontains(lower_search_text)
),
ScoreSet.publication_identifiers.any(
func.jsonb_path_exists(
PublicationIdentifier.authors,
f"""$[*].name ? (@ like_regex "{lower_search_text}" flag "i")""",
)
),
ScoreSet.doi_identifiers.any(func.lower(DoiIdentifier.identifier).icontains(lower_search_text)),
ScoreSet.target_genes.any(
TargetGene.uniprot_offset.has(
UniprotOffset.identifier.has(
func.lower(UniprotIdentifier.identifier).icontains(lower_search_text)
)
)
),
ScoreSet.target_genes.any(
TargetGene.refseq_offset.has(
RefseqOffset.identifier.has(
func.lower(RefseqIdentifier.identifier).icontains(lower_search_text)
)
)
),
ScoreSet.target_genes.any(
TargetGene.ensembl_offset.has(
EnsemblOffset.identifier.has(
func.lower(EnsemblIdentifier.identifier).icontains(lower_search_text)
)
)
),
)
)
query = scoreset_fulltext_filter(query, search.text)

if search.targets:
query = query.filter(ScoreSet.target_genes.any(TargetGene.name.in_(search.targets)))
Expand Down
78 changes: 78 additions & 0 deletions src/mavedb/models/score_set_fulltext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import logging

from sqlalchemy import text
from mavedb.models.score_set import ScoreSet
from alembic_utils.pg_materialized_view import PGMaterializedView # type: ignore

logger = logging.getLogger(__name__)

# TODO(#94): add LICENSE, plus TAX_ID if numeric
# TODO(#89): The query below should be generated from SQLAlchemy
# models rather than hand-carved SQL

_scoreset_fulltext_view = PGMaterializedView(
schema="public",
signature="scoreset_fulltext",
definition=' union ' .join(
[
f"select id as scoreset_id, to_tsvector({c}) as text from scoresets"
for c in ('urn', 'title', 'short_description', 'abstract_text')
] + [
f"select scoreset_id, to_tsvector({c}) as text from target_genes"
for c in ('name', 'category')
] + [
f"select scoreset_id, to_tsvector(TX.{c}) as text from target_genes TG join target_sequences TS on \
(TG.target_sequence_id = TS.id) join taxonomies TX on (TS.taxonomy_id = TX.id)"
for c in ('organism_name', 'common_name')
] + [
"select scoreset_id, to_tsvector(TA.assembly) as text from target_genes TG join target_accessions TA on \
(TG.accession_id = TA.id)"
] + [
f"select scoreset_id, to_tsvector(PI.{c}) as text from scoreset_publication_identifiers SPI JOIN \
publication_identifiers PI ON (SPI.publication_identifier_id = PI.id)"
for c in ('identifier', 'doi', 'abstract', 'title', 'publication_journal')
] + [
"select scoreset_id, to_tsvector(jsonb_array_elements(authors)->'name') as text from \
scoreset_publication_identifiers SPI join publication_identifiers PI on \
SPI.publication_identifier_id = PI.id",
] + [
"select scoreset_id, to_tsvector(DI.identifier) as text from scoreset_doi_identifiers SD join \
doi_identifiers DI on (SD.doi_identifier_id = DI.id)",
] + [
f"select scoreset_id, to_tsvector(XI.identifier) as text from target_genes TG join {x}_offsets XO on \
(XO.target_gene_id = TG.id) join {x}_identifiers XI on (XI.id = XO.identifier_id)"
for x in ('uniprot', 'refseq', 'ensembl')
]
),
with_data=True
)


def scoreset_fulltext_create(session):
    """Create the scoreset_fulltext materialized view via the given SQLAlchemy session."""
    view = _scoreset_fulltext_view
    logger.warning("Creating %s", view.signature)
    session.execute(view.to_sql_statement_create())
    session.commit()
    logger.warning("Created %s", view.signature)


def scoreset_fulltext_destroy(session):
    """Drop the scoreset_fulltext materialized view via the given SQLAlchemy session."""
    view = _scoreset_fulltext_view
    logger.warning("Destroying %s", view.signature)
    session.execute(view.to_sql_statement_drop())
    session.commit()
    logger.warning("Destroyed %s", view.signature)


def scoreset_fulltext_refresh(session):
    """Re-materialize the view so newly created or edited records become searchable."""
    refresh_sql = text(f'refresh materialized view {_scoreset_fulltext_view.signature}')
    session.execute(refresh_sql)
    session.commit()


def scoreset_fulltext_filter(query, string):
    """Restrict *query* to ScoreSets whose full-text view matches *string*.

    Uses PostgreSQL websearch_to_tsquery syntax; *string* is bound as a
    parameter, never interpolated into the SQL.
    """
    matching_ids = text(
        "select distinct scoreset_id from "
        f"{_scoreset_fulltext_view.signature} "
        "where text @@ websearch_to_tsquery(:text)"
    ).params(text=string)
    return query.filter(ScoreSet.id.in_(matching_ids))
Loading
Loading