Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ requires-python = ">=3.11"

dependencies = [
"agct~=0.1.0",
"requests",
"biopython",
"tqdm",
"cdot",
"click",
"cool-seq-tool==0.4.0.dev3",
"ga4gh.vrs==2.0.0-a6",
"gene_normalizer[etl,pg]==0.3.0-dev2",
"httpx~=0.28",
"pydantic>=2",
"python-dotenv",
"setuptools>=68.0", # tmp -- ensure 3.12 compatibility
Expand All @@ -61,7 +61,7 @@ tests = [
"pytest-mock",
"pytest-cov",
"pytest-asyncio",
"requests-mock"
"respx"
]
dev = [
"ruff==0.2.0",
Expand Down
15 changes: 13 additions & 2 deletions src/api/routers/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from cool_seq_tool.schemas import AnnotationLayer
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse
from requests import HTTPError
from httpx import HTTPStatusError

from dcd_mapping.align import build_alignment_result
from dcd_mapping.annotate import (
Expand Down Expand Up @@ -64,6 +64,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
records = get_scoreset_records(metadata, True, store_path)
metadata = patch_target_sequence_type(metadata, records, force=False)
except ScoresetNotSupportedError as e:
_logger.error("Scoreset not supported for %s: %s", urn, e)
return JSONResponse(
content=ScoresetMapping(
metadata=None,
Expand All @@ -72,6 +73,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
)
except ResourceAcquisitionError as e:
msg = f"Unable to acquire resource from MaveDB: {e}"
_logger.error(msg)
raise HTTPException(status_code=500, detail=msg) from e

if not records:
Expand All @@ -87,17 +89,21 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
alignment_results = build_alignment_result(metadata, True)
except BlatNotFoundError as e:
msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
_logger.error("BLAT not found for %s: %s", urn, e)
raise HTTPException(status_code=500, detail=msg) from e
except ResourceAcquisitionError as e:
msg = f"BLAT resource could not be acquired: {e}"
_logger.error(msg)
raise HTTPException(status_code=500, detail=msg) from e
except AlignmentError as e:
_logger.error("Alignment error for %s: %s", urn, e)
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
).model_dump(exclude_none=True)
)
except ScoresetNotSupportedError as e:
_logger.error("Scoreset not supported during alignment for %s: %s", urn, e)
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
Expand All @@ -111,11 +117,13 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
# on the target level and on the variant level for variants relative to that target
# HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate
# underlying issues with data providers.
except HTTPError as e:
except HTTPStatusError as e:
msg = f"HTTP error occurred during transcript selection: {e}"
_logger.error(msg)
raise HTTPException(status_code=500, detail=msg) from e
except DataLookupError as e:
msg = f"Data lookup error occurred during transcript selection: {e}"
_logger.error(msg)
raise HTTPException(status_code=500, detail=msg) from e

vrs_results = {}
Expand All @@ -134,6 +142,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
UnsupportedReferenceSequencePrefixError,
MissingSequenceIdError,
) as e:
_logger.error("VRS mapping error for %s: %s", urn, e)
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
Expand Down Expand Up @@ -172,6 +181,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
VrsVersion.V_2,
)
except Exception as e:
_logger.error("Unexpected error during annotation for %s: %s", urn, e)
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
Expand Down Expand Up @@ -287,6 +297,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
del reference_sequences[target_gene].layers[layer]

except Exception as e:
_logger.error("Unexpected error during result assembly for %s: %s", urn, e)
return JSONResponse(
content=ScoresetMapping(
metadata=metadata, error_message=str(e).strip("'")
Expand Down
8 changes: 4 additions & 4 deletions src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pathlib import Path
from urllib.parse import urlparse

import requests
import httpx
from Bio.SearchIO import HSP
from Bio.SearchIO import parse as parse_blat
from Bio.SearchIO._model import Hit, QueryResult
Expand Down Expand Up @@ -84,7 +84,7 @@ def get_ref_genome_file(
if not genome_file.exists():
try:
http_download(url, genome_file, silent)
except requests.HTTPError as e:
except httpx.HTTPStatusError as e:
msg = f"HTTPError when fetching reference genome file from {url}"
_logger.error(msg)
raise ResourceAcquisitionError(msg) from e
Expand Down Expand Up @@ -378,11 +378,11 @@ def fetch_alignment(
alignment_results[accession_id] = None
else:
url = f"{CDOT_URL}/transcript/{accession_id}"
r = requests.get(url, timeout=30)
r = httpx.get(url, timeout=30)

try:
r.raise_for_status()
except requests.HTTPError as e:
except httpx.HTTPStatusError as e:
msg = f"Received HTTPError from {url} for scoreset {metadata.urn}"
_logger.error(msg)
raise ResourceAcquisitionError(msg) from e
Expand Down
6 changes: 3 additions & 3 deletions src/dcd_mapping/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
from typing import Any

import hgvs
import httpx
import polars as pl
import requests
from biocommons.seqrepo import SeqRepo
from biocommons.seqrepo.seqaliasdb.seqaliasdb import sqlite3
from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider
Expand Down Expand Up @@ -682,7 +682,7 @@ def get_overlapping_features_for_region(
url, headers={"Content-Type": "application/json"}
)
response.raise_for_status()
except requests.RequestException as e:
except httpx.HTTPError as e:
_logger.error(
"Failed to fetch overlapping features for region %s-%s on chromosome %s: %s",
start,
Expand Down Expand Up @@ -715,7 +715,7 @@ def get_uniprot_sequence(uniprot_id: str) -> str | None:
:raise HTTPError: if response comes with an HTTP error code
"""
url = f"https://www.ebi.ac.uk/proteins/api/proteins?accession={uniprot_id.split(':')[1]}&format=json"
response = requests.get(url, timeout=30)
response = httpx.get(url, timeout=30)
response.raise_for_status()
json = response.json()
return json[0]["sequence"]["sequence"]
4 changes: 2 additions & 2 deletions src/dcd_mapping/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path

import click
from requests import HTTPError
from httpx import HTTPStatusError

from dcd_mapping.align import build_alignment_result
from dcd_mapping.annotate import (
Expand Down Expand Up @@ -205,7 +205,7 @@ async def map_scoreset(
# on the target level and on the variant level for variants relative to that target
# HTTPErrors and DataLookupErrors cause the mapping process to exit because these indicate
# underlying issues with data providers.
except HTTPError as e:
except HTTPStatusError as e:
_emit_info(
f"HTTP error occurred during transcript selection: {e}",
silent,
Expand Down
21 changes: 11 additions & 10 deletions src/dcd_mapping/mavedb_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pathlib import Path
from typing import Any

import requests
import httpx
from fastapi import HTTPException
from pydantic import ValidationError

Expand All @@ -27,6 +27,7 @@
MAVEDB_BASE_URL,
authentication_header,
http_download,
is_missing_value,
)
from dcd_mapping.schemas import (
ScoreRow,
Expand Down Expand Up @@ -56,7 +57,7 @@ def get_scoreset_urns() -> set[str]:

:return: set of URN strings
"""
r = requests.get(
r = httpx.get(
f"{MAVEDB_BASE_URL}/api/v1/experiments/",
timeout=30,
headers=authentication_header(),
Expand Down Expand Up @@ -100,14 +101,14 @@ def get_human_urns() -> list[str]:
scoreset_urns = get_scoreset_urns()
human_scoresets: list[str] = []
for urn in scoreset_urns:
r = requests.get(
r = httpx.get(
f"{MAVEDB_BASE_URL}/api/v1/score-sets/{urn}",
timeout=30,
headers=authentication_header(),
)
try:
r.raise_for_status()
except requests.exceptions.HTTPError:
except httpx.HTTPStatusError:
_logger.info("Unable to retrieve scoreset data for URN %s", urn)
continue
data = r.json()
Expand Down Expand Up @@ -155,10 +156,10 @@ def get_raw_scoreset_metadata(
metadata_file = dcd_mapping_dir / f"{scoreset_urn}_metadata.json"
if not metadata_file.exists():
url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{scoreset_urn}"
r = requests.get(url, timeout=30, headers=authentication_header())
r = httpx.get(url, timeout=30, headers=authentication_header())
try:
r.raise_for_status()
except requests.HTTPError as e:
except httpx.HTTPStatusError as e:
msg = f"Received HTTPError from {url} for scoreset {scoreset_urn}"
_logger.error(msg)
raise ResourceAcquisitionError(msg) from e
Expand Down Expand Up @@ -246,13 +247,13 @@ def _load_scoreset_records(
with path.open() as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
if row["score"] == "NA":
if is_missing_value(row["score"]):
row["score"] = None
else:
row["score"] = row["score"]
if row["hgvs_nt"] != "NA":
if not is_missing_value(row["hgvs_nt"]):
prefix = row["hgvs_nt"].split(":")[0] if ":" in row["hgvs_nt"] else None
elif row["hgvs_pro"] != "NA":
elif not is_missing_value(row["hgvs_pro"]):
prefix = (
row["hgvs_pro"].split(":")[0] if ":" in row["hgvs_pro"] else None
)
Expand Down Expand Up @@ -317,7 +318,7 @@ def get_scoreset_records(
url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{metadata.urn}/scores"
try:
http_download(url, scores_csv, silent)
except requests.HTTPError as e:
except httpx.HTTPStatusError as e:
msg = f"HTTPError when fetching scores CSV from {url}"
_logger.error(msg)
raise ResourceAcquisitionError(msg) from e
Expand Down
55 changes: 44 additions & 11 deletions src/dcd_mapping/resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,30 @@
from pathlib import Path

import click
import requests
import httpx
from tqdm import tqdm

_logger = logging.getLogger(__name__)

# Common representations of missing/null data in CSV files.
# The canonical spellings are listed once; their lower-case variants are
# derived, and a few bare placeholder tokens are added explicitly.
_CANONICAL_MISSING = ("NA", "N/A", "NaN", "NULL", "None")
MISSING_VALUE_REPRESENTATIONS = frozenset(
    {
        *_CANONICAL_MISSING,
        *(marker.lower() for marker in _CANONICAL_MISSING),
        "",
        "-",
        ".",
    }
)

MAVEDB_API_KEY = os.environ.get("MAVEDB_API_KEY")
MAVEDB_BASE_URL = os.environ.get("MAVEDB_BASE_URL")
ENSEMBL_API_URL = os.environ.get("ENSEMBL_API_URL", "https://rest.ensembl.org") # TODO
Expand All @@ -24,6 +43,22 @@
LOCAL_STORE_PATH.mkdir(exist_ok=True, parents=True)


def is_missing_value(value: str | None) -> bool:
    """Return True when ``value`` denotes missing/null data.

    CSV files from external sources use a variety of markers for absent
    values ("NA", "NaN", "null", "-", ...); this check accepts any of the
    representations enumerated in MISSING_VALUE_REPRESENTATIONS, as well
    as Python ``None`` itself, making the codebase resilient to upstream
    changes in NA representation. Surrounding whitespace is ignored.

    :param value: The value to check
    :return: True if the value represents missing data, False otherwise
    """
    # None short-circuits before .strip() is attempted; otherwise the
    # trimmed text is compared against the known missing-data markers.
    return value is None or value.strip() in MISSING_VALUE_REPRESENTATIONS


def authentication_header() -> dict | None:
    """Build the MaveDB auth header from the api key envvar, if available.

    :return: header dict carrying the X-API-key, or None when no key is set
    """
    if MAVEDB_API_KEY is None:
        return None
    return {"X-API-key": MAVEDB_API_KEY}
Expand All @@ -36,13 +71,11 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path:
:param out_path: location to save file to
:param silent: show TQDM progress bar if true
:return: Path if download successful
:raise requests.HTTPError: if request is unsuccessful
:raise httpx.HTTPStatusError: if request is unsuccessful
"""
if not silent:
click.echo(f"Downloading {out_path.name} to {out_path.parents[0].absolute()}")
with requests.get(
url, stream=True, timeout=60, headers=authentication_header()
) as r:
with httpx.stream("GET", url, timeout=60, headers=authentication_header()) as r:
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0))
with out_path.open("wb") as h:
Expand All @@ -54,20 +87,20 @@ def http_download(url: str, out_path: Path, silent: bool = True) -> Path:
desc=out_path.name,
ncols=80,
) as progress_bar:
for chunk in r.iter_content(chunk_size=8192):
for chunk in r.iter_bytes(chunk_size=8192):
if chunk:
h.write(chunk)
progress_bar.update(len(chunk))
else:
for chunk in r.iter_content(chunk_size=8192):
for chunk in r.iter_bytes(chunk_size=8192):
if chunk:
h.write(chunk)
return out_path


def request_with_backoff(
url: str, max_retries: int = 5, backoff_factor: float = 0.3, **kwargs
) -> requests.Response:
) -> httpx.Response:
"""HTTP GET with exponential backoff only for retryable errors.

Retries on:
Expand All @@ -80,9 +113,9 @@ def request_with_backoff(
attempt = 0
while attempt < max_retries:
try:
kwargs.setdefault("timeout", 60) # Default timeout of 10 seconds
response = requests.get(url, **kwargs) # noqa: S113
except (requests.Timeout, requests.ConnectionError):
kwargs.setdefault("timeout", 60)
response = httpx.get(url, **kwargs)
except (httpx.TimeoutException, httpx.ConnectError):
# Retry on transient network failures
if attempt == max_retries - 1:
raise
Expand Down
2 changes: 1 addition & 1 deletion src/dcd_mapping/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Provide dcd mapping version"""

dcd_mapping_version = "2026.1.0"
dcd_mapping_version = "2026.1.1"
Loading