From 57367ee02586e10b6907650cbb0b2e4ffcd86bd9 Mon Sep 17 00:00:00 2001 From: Parashar Date: Thu, 19 Feb 2026 11:48:07 +0100 Subject: [PATCH 1/2] Add require_artifacts flag and LLM validation error Bump version to 0.15.0. Introduce a require_artifacts parameter to CyteType.run (default True) so callers can choose whether artifact build/upload failures should abort the run; when False the run continues without uploaded_files and logs a warning. Add LLMValidationError and map the API error code LLM_VALIDATION_FAILED to this exception, and export it from the api package. Add tests covering artifact-failure behavior (raising by default and continuing when require_artifacts=False). --- cytetype/__init__.py | 2 +- cytetype/api/__init__.py | 2 ++ cytetype/api/exceptions.py | 7 ++++ cytetype/main.py | 29 ++++++++++++---- tests/test_cytetype_integration.py | 53 ++++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+), 7 deletions(-) diff --git a/cytetype/__init__.py b/cytetype/__init__.py index 2caccb7..5f47347 100644 --- a/cytetype/__init__.py +++ b/cytetype/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.14.1" +__version__ = "0.15.0" import requests diff --git a/cytetype/api/__init__.py b/cytetype/api/__init__.py index d6a29ef..d8c5331 100644 --- a/cytetype/api/__init__.py +++ b/cytetype/api/__init__.py @@ -23,6 +23,7 @@ QuotaExceededError, JobNotFoundError, JobFailedError, + LLMValidationError, TimeoutError, NetworkError, ) @@ -49,6 +50,7 @@ "QuotaExceededError", "JobNotFoundError", "JobFailedError", + "LLMValidationError", "TimeoutError", "NetworkError", ] diff --git a/cytetype/api/exceptions.py b/cytetype/api/exceptions.py index 18a52cd..3ed2366 100644 --- a/cytetype/api/exceptions.py +++ b/cytetype/api/exceptions.py @@ -50,6 +50,12 @@ class JobFailedError(APIError): pass +class LLMValidationError(APIError): + """LLM validation failed - LLM_VALIDATION_FAILED.""" + + pass + + # Client-side errors with default messages class TimeoutError(CyteTypeError): """Client-side timeout waiting for results.""" @@ -80,6 +86,7 @@ def __init__( "QUOTA_EXCEEDED": QuotaExceededError, "JOB_NOT_FOUND": JobNotFoundError, "JOB_FAILED": JobFailedError, + "LLM_VALIDATION_FAILED": LLMValidationError, "JOB_PROCESSING": APIError, # Generic - expected during polling "JOB_NOT_COMPLETED": APIError, # Generic "HTTP_ERROR": APIError, # Generic diff --git a/cytetype/main.py b/cytetype/main.py index 982b8b1..e7520b2 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -268,6 +268,7 @@ def run( obs_duckdb_path: str = "obs.duckdb", upload_timeout_seconds: int = 3600, cleanup_artifacts: bool = False, + require_artifacts: bool = True, show_progress: bool = True, override_existing_results: bool = False, ) -> anndata.AnnData: @@ -310,6 +311,9 @@ def run( Defaults to 3600. cleanup_artifacts (bool, optional): Whether to delete generated artifact files after run completes or fails. Defaults to False. + require_artifacts (bool, optional): Whether to raise an error if artifact building or + uploading fails. When True (default), any artifact failure stops the run. Set to + False to skip artifacts and continue with annotation only. Defaults to True. show_progress (bool, optional): Whether to display progress updates with spinner and cluster status. Set to False to disable all visual progress output. Defaults to True. override_existing_results (bool, optional): Whether to override existing results with the @@ -362,12 +366,25 @@ def run( artifact_paths = [vars_h5_path, obs_duckdb_path] try: - uploaded_file_refs = self._build_and_upload_artifacts( - vars_h5_path=vars_h5_path, - obs_duckdb_path=obs_duckdb_path, - upload_timeout_seconds=upload_timeout_seconds, - ) - payload["uploaded_files"] = uploaded_file_refs + try: + uploaded_file_refs = self._build_and_upload_artifacts( + vars_h5_path=vars_h5_path, + obs_duckdb_path=obs_duckdb_path, + upload_timeout_seconds=upload_timeout_seconds, + ) + payload["uploaded_files"] = uploaded_file_refs + except Exception as exc: + if require_artifacts: + logger.error( + "Artifact build/upload failed. Rerun with `require_artifacts=False` to skip this error.", + "Please report the error below in a new issue at https://github.com/NygenAnalytics/CyteType", + f"\n({type(exc).__name__}: {exc}). ", + ) + raise + logger.warning( + "Artifact build/upload failed. Continuing without artifacts. " + "Set `require_artifacts=True` to see the full traceback." + ) # Save query if requested if save_query: diff --git a/tests/test_cytetype_integration.py b/tests/test_cytetype_integration.py index 5f4f55c..b46bdd3 100644 --- a/tests/test_cytetype_integration.py +++ b/tests/test_cytetype_integration.py @@ -149,6 +149,59 @@ def test_cytetype_run_auto_uploads_artifacts( } +@patch("cytetype.main.wait_for_completion") +@patch("cytetype.main.submit_annotation_job") +def test_cytetype_run_artifact_failure_raises_by_default( + mock_submit: MagicMock, + mock_wait: MagicMock, + mock_adata: anndata.AnnData, + mock_api_response: dict[str, Any], + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Test run() raises when artifact build fails and require_artifacts=True (default).""" + mock_submit.return_value = "job_no_artifacts" + mock_wait.return_value = mock_api_response + + monkeypatch.setattr( + "cytetype.main.save_features_matrix", + MagicMock(side_effect=RuntimeError("disk full")), + ) + + ct = CyteType(mock_adata, group_key="leiden") + with pytest.raises(RuntimeError, match="disk full"): + ct.run(study_context="Test") + + +@patch("cytetype.main.wait_for_completion") +@patch("cytetype.main.submit_annotation_job") +def test_cytetype_run_artifact_failure_continues_when_not_required( + mock_submit: MagicMock, + mock_wait: MagicMock, + mock_adata: anndata.AnnData, + mock_api_response: dict[str, Any], + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Test run() proceeds without uploaded_files when require_artifacts=False.""" + mock_submit.return_value = "job_no_artifacts" + mock_wait.return_value = mock_api_response + + monkeypatch.setattr( + "cytetype.main.save_features_matrix", + MagicMock(side_effect=RuntimeError("disk full")), + ) + + ct = CyteType(mock_adata, group_key="leiden") + result = ct.run(study_context="Test", require_artifacts=False) + + # Job should still complete successfully + assert result is not None + assert mock_submit.called + + # Payload must not contain uploaded_files + payload = mock_submit.call_args.args[2] + assert "uploaded_files" not in payload + + @patch("cytetype.main.wait_for_completion") @patch("cytetype.main.submit_annotation_job") def test_cytetype_run_cleanup_artifacts( From 8b8a4336d3aed137da45e7a08c3c577e0afb1602 Mon Sep 17 00:00:00 2001 From: Parashar Date: Thu, 19 Feb 2026 12:00:16 +0100 Subject: [PATCH 2/2] docs update --- README.md | 5 +--- cytetype/main.py | 8 +++--- docs/configuration.md | 57 +++++++++++++++++++++++++++++------------ docs/troubleshooting.md | 3 ++- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 2bff690..0e340b9 100644 --- a/README.md +++ b/README.md @@ -80,10 +80,7 @@ sc.pl.umap(adata, color='cytetype_annotation_clusters') ``` 🚀 [Try it in Google Colab](https://colab.research.google.com/drive/1aRLsI3mx8JR8u5BKHs48YUbLsqRsh2N7?usp=sharing) -> **Note:** No API keys required for default configuration. See [custom LLM configuration](docs/configuration.md#llm-configuration) for advanced options. -> -> `run()` now handles artifact packaging and upload automatically (`vars.h5` + `obs.duckdb`) before annotation. -> Generated artifact files are kept on disk by default; use `cleanup_artifacts=True` to remove them after run completion/failure. +> **Note:** No API keys required for default configuration. See [Configuration](docs/configuration.md) for LLM setup, artifact handling, and advanced options. **Using R/Seurat?** → [CyteTypeR](https://github.com/NygenAnalytics/CyteTypeR) diff --git a/cytetype/main.py b/cytetype/main.py index e7520b2..5189cf4 100644 --- a/cytetype/main.py +++ b/cytetype/main.py @@ -376,9 +376,11 @@ def run( except Exception as exc: if require_artifacts: logger.error( - "Artifact build/upload failed. Rerun with `require_artifacts=False` to skip this error.", - "Please report the error below in a new issue at https://github.com/NygenAnalytics/CyteType", - f"\n({type(exc).__name__}: {exc}). ", + "Artifact build/upload failed. " + "Rerun with `require_artifacts=False` to skip this error.\n" + "Please report the error below in a new issue at " + "https://github.com/NygenAnalytics/CyteType\n" + f"({type(exc).__name__}: {exc})" ) raise logger.warning( diff --git a/docs/configuration.md b/docs/configuration.md index 0321ace..489bec9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -29,14 +29,6 @@ adata = annotator.run( ) ``` -`run()` now performs the full upload pipeline internally: -- Creates `vars.h5` from `adata.X` -- Creates `obs.duckdb` from `adata.obs` -- Uploads both artifacts to the CyteType API -- Calls `/annotate` with uploaded file references - -If artifact creation or upload fails, `run()` fails fast. - ## LLM Configuration You can provide your own LLM providers/models: ```python @@ -64,20 +56,42 @@ adata = annotator.run( ) ``` -## Advanced +## Artifacts + +`run()` automatically builds and uploads two artifact files before submitting an annotation job: + +- **`vars.h5`** — a compressed HDF5 file containing the normalized expression matrix (`adata.X`) and variable metadata (`adata.var`). Used by the server for on-demand gene expression lookups during annotation and in the interactive report. +- **`obs.duckdb`** — a DuckDB database containing the observation metadata (`adata.obs`). Used by the server to power metadata queries and filtering in the interactive report. + +Both files are created locally and then uploaded to the CyteType API. The uploaded references are attached to the `/annotate` payload so the server can link them to the job. + +### Artifact Parameters + ```python adata = annotator.run( ... - poll_interval_seconds=30, # How often to poll (default) - timeout_seconds=7200, # Max wait time (default: 2 hours) - api_url="https://custom-api.example.com", # Custom API endpoint if needed - vars_h5_path="vars.h5", # Local artifact output path - obs_duckdb_path="obs.duckdb", # Local artifact output path - upload_timeout_seconds=3600, # Per-upload socket read timeout - cleanup_artifacts=False, # Keep artifacts by default + vars_h5_path="vars.h5", # Local output path for vars artifact + obs_duckdb_path="obs.duckdb", # Local output path for obs artifact + upload_timeout_seconds=3600, # Socket read timeout per upload (seconds) + cleanup_artifacts=False, # Delete local artifact files after run + require_artifacts=True, # Raise on artifact failure (set False to skip) ) ``` +| Parameter | Default | Description | +|-----------|---------|-------------| +| `vars_h5_path` | `"vars.h5"` | Local path where the vars HDF5 file is written | +| `obs_duckdb_path` | `"obs.duckdb"` | Local path where the obs DuckDB file is written | +| `upload_timeout_seconds` | `3600` | Socket read timeout for each artifact upload | +| `cleanup_artifacts` | `False` | Delete local artifact files after run completes or fails | +| `require_artifacts` | `True` | Raise on artifact build/upload failure. Set to `False` to skip artifacts and continue with annotation only | + +### Error Handling + +By default (`require_artifacts=True`), any failure during artifact building or uploading stops the run and surfaces the full error. The error message includes a link to report the issue on GitHub. + +If you want the annotation to proceed even when artifacts fail (e.g. due to disk space or network issues), set `require_artifacts=False`. The job will submit without artifacts — annotation still works, but the interactive report will not have expression lookups or metadata filtering. + ### Memory Recommendation for Large Datasets For large datasets, open your AnnData object in backed mode to reduce memory usage while building `vars.h5`: @@ -86,4 +100,15 @@ For large datasets, open your AnnData object in backed mode to reduce memory usa import scanpy as sc adata = sc.read_h5ad("input.h5ad", backed="r") +``` + +## Advanced + +```python +adata = annotator.run( + ... + poll_interval_seconds=30, # How often to poll (default) + timeout_seconds=7200, # Max wait time (default: 2 hours) + api_url="https://custom-api.example.com", # Custom API endpoint if needed +) ``` \ No newline at end of file diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 56cc768..e4394da 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -5,4 +5,5 @@ - Make sure you have valid gene symbols in the AnnData object and are passing the correct gene symbols column name to parameter `gene_symbols_column`. - If you are using a custom LLM, make sure you have the correct API key and base URL. - For large datasets, load AnnData in backed mode (`sc.read_h5ad(..., backed="r")`) to reduce memory use during artifact generation. -- `run()` creates `vars.h5` and `obs.duckdb` before annotation. Use `cleanup_artifacts=True` if you do not want to keep these local files. \ No newline at end of file +- `run()` creates `vars.h5` and `obs.duckdb` before annotation. Use `cleanup_artifacts=True` if you do not want to keep these local files. +- If artifact building or uploading fails, `run()` will raise an error by default. Set `require_artifacts=False` to skip artifacts and continue with annotation only. \ No newline at end of file