Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,7 @@ sc.pl.umap(adata, color='cytetype_annotation_clusters')
```
🚀 [Try it in Google Colab](https://colab.research.google.com/drive/1aRLsI3mx8JR8u5BKHs48YUbLsqRsh2N7?usp=sharing)

> **Note:** No API keys required for default configuration. See [custom LLM configuration](docs/configuration.md#llm-configuration) for advanced options.
>
> `run()` now handles artifact packaging and upload automatically (`vars.h5` + `obs.duckdb`) before annotation.
> Generated artifact files are kept on disk by default; use `cleanup_artifacts=True` to remove them after run completion/failure.
> **Note:** No API keys required for default configuration. See [Configuration](docs/configuration.md) for LLM setup, artifact handling, and advanced options.

**Using R/Seurat?** → [CyteTypeR](https://github.com/NygenAnalytics/CyteTypeR)

Expand Down
2 changes: 1 addition & 1 deletion cytetype/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.14.1"
__version__ = "0.15.0"

import requests

Expand Down
2 changes: 2 additions & 0 deletions cytetype/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
QuotaExceededError,
JobNotFoundError,
JobFailedError,
LLMValidationError,
TimeoutError,
NetworkError,
)
Expand All @@ -49,6 +50,7 @@
"QuotaExceededError",
"JobNotFoundError",
"JobFailedError",
"LLMValidationError",
"TimeoutError",
"NetworkError",
]
7 changes: 7 additions & 0 deletions cytetype/api/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ class JobFailedError(APIError):
pass


class LLMValidationError(APIError):
    """Raised when the server rejects a job due to LLM validation failure.

    Corresponds to the API error code ``LLM_VALIDATION_FAILED`` (see the
    error-code dispatch mapping below, which routes that code to this class).
    """

    pass


# Client-side errors with default messages
class TimeoutError(CyteTypeError):
"""Client-side timeout waiting for results."""
Expand Down Expand Up @@ -80,6 +86,7 @@ def __init__(
"QUOTA_EXCEEDED": QuotaExceededError,
"JOB_NOT_FOUND": JobNotFoundError,
"JOB_FAILED": JobFailedError,
"LLM_VALIDATION_FAILED": LLMValidationError,
"JOB_PROCESSING": APIError, # Generic - expected during polling
"JOB_NOT_COMPLETED": APIError, # Generic
"HTTP_ERROR": APIError, # Generic
Expand Down
31 changes: 25 additions & 6 deletions cytetype/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ def run(
obs_duckdb_path: str = "obs.duckdb",
upload_timeout_seconds: int = 3600,
cleanup_artifacts: bool = False,
require_artifacts: bool = True,
show_progress: bool = True,
override_existing_results: bool = False,
) -> anndata.AnnData:
Expand Down Expand Up @@ -310,6 +311,9 @@ def run(
Defaults to 3600.
cleanup_artifacts (bool, optional): Whether to delete generated artifact files after run
completes or fails. Defaults to False.
require_artifacts (bool, optional): Whether to raise an error if artifact building or
uploading fails. When True (default), any artifact failure stops the run. Set to
False to skip artifacts and continue with annotation only. Defaults to True.
show_progress (bool, optional): Whether to display progress updates with spinner and
cluster status. Set to False to disable all visual progress output. Defaults to True.
override_existing_results (bool, optional): Whether to override existing results with the
Expand Down Expand Up @@ -362,12 +366,27 @@ def run(

artifact_paths = [vars_h5_path, obs_duckdb_path]
try:
uploaded_file_refs = self._build_and_upload_artifacts(
vars_h5_path=vars_h5_path,
obs_duckdb_path=obs_duckdb_path,
upload_timeout_seconds=upload_timeout_seconds,
)
payload["uploaded_files"] = uploaded_file_refs
try:
uploaded_file_refs = self._build_and_upload_artifacts(
vars_h5_path=vars_h5_path,
obs_duckdb_path=obs_duckdb_path,
upload_timeout_seconds=upload_timeout_seconds,
)
payload["uploaded_files"] = uploaded_file_refs
except Exception as exc:
if require_artifacts:
logger.error(
"Artifact build/upload failed. "
"Rerun with `require_artifacts=False` to skip this error.\n"
"Please report the error below in a new issue at "
"https://github.com/NygenAnalytics/CyteType\n"
f"({type(exc).__name__}: {exc})"
)
raise
logger.warning(
"Artifact build/upload failed. Continuing without artifacts. "
"Set `require_artifacts=True` to see the full traceback."
)

# Save query if requested
if save_query:
Expand Down
57 changes: 41 additions & 16 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,6 @@ adata = annotator.run(
)
```

`run()` now performs the full upload pipeline internally:
- Creates `vars.h5` from `adata.X`
- Creates `obs.duckdb` from `adata.obs`
- Uploads both artifacts to the CyteType API
- Calls `/annotate` with uploaded file references

If artifact creation or upload fails, `run()` fails fast.

## LLM Configuration
You can provide your own LLM providers/models:
```python
Expand Down Expand Up @@ -64,20 +56,42 @@ adata = annotator.run(
)
```

## Advanced
## Artifacts

`run()` automatically builds and uploads two artifact files before submitting an annotation job:

- **`vars.h5`** — a compressed HDF5 file containing the normalized expression matrix (`adata.X`) and variable metadata (`adata.var`). Used by the server for on-demand gene expression lookups during annotation and in the interactive report.
- **`obs.duckdb`** — a DuckDB database containing the observation metadata (`adata.obs`). Used by the server to power metadata queries and filtering in the interactive report.

Both files are created locally and then uploaded to the CyteType API. The uploaded references are attached to the `/annotate` payload so the server can link them to the job.

### Artifact Parameters

```python
adata = annotator.run(
...
poll_interval_seconds=30, # How often to poll (default)
timeout_seconds=7200, # Max wait time (default: 2 hours)
api_url="https://custom-api.example.com", # Custom API endpoint if needed
vars_h5_path="vars.h5", # Local artifact output path
obs_duckdb_path="obs.duckdb", # Local artifact output path
upload_timeout_seconds=3600, # Per-upload socket read timeout
cleanup_artifacts=False, # Keep artifacts by default
vars_h5_path="vars.h5", # Local output path for vars artifact
obs_duckdb_path="obs.duckdb", # Local output path for obs artifact
upload_timeout_seconds=3600, # Socket read timeout per upload (seconds)
cleanup_artifacts=False, # Delete local artifact files after run
require_artifacts=True, # Raise on artifact failure (set False to skip)
)
```

| Parameter | Default | Description |
|-----------|---------|-------------|
| `vars_h5_path` | `"vars.h5"` | Local path where the vars HDF5 file is written |
| `obs_duckdb_path` | `"obs.duckdb"` | Local path where the obs DuckDB file is written |
| `upload_timeout_seconds` | `3600` | Socket read timeout for each artifact upload |
| `cleanup_artifacts` | `False` | Delete local artifact files after run completes or fails |
| `require_artifacts` | `True` | Raise on artifact build/upload failure. Set to `False` to skip artifacts and continue with annotation only |

### Error Handling

By default (`require_artifacts=True`), any failure during artifact building or uploading stops the run and surfaces the full error. The error message includes a link to report the issue on GitHub.

If you want the annotation to proceed even when artifacts fail (e.g. due to disk space or network issues), set `require_artifacts=False`. The job will submit without artifacts — annotation still works, but the interactive report will not have expression lookups or metadata filtering.

### Memory Recommendation for Large Datasets

For large datasets, open your AnnData object in backed mode to reduce memory usage while building `vars.h5`:
Expand All @@ -86,4 +100,15 @@ For large datasets, open your AnnData object in backed mode to reduce memory usa
import scanpy as sc

adata = sc.read_h5ad("input.h5ad", backed="r")
```

## Advanced

```python
adata = annotator.run(
...
poll_interval_seconds=30, # How often to poll (default)
timeout_seconds=7200, # Max wait time (default: 2 hours)
api_url="https://custom-api.example.com", # Custom API endpoint if needed
)
```
3 changes: 2 additions & 1 deletion docs/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
- Make sure you have valid gene symbols in the AnnData object and are passing the correct gene symbols column name to parameter `gene_symbols_column`.
- If you are using a custom LLM, make sure you have the correct API key and base URL.
- For large datasets, load AnnData in backed mode (`sc.read_h5ad(..., backed="r")`) to reduce memory use during artifact generation.
- `run()` creates `vars.h5` and `obs.duckdb` before annotation. Use `cleanup_artifacts=True` if you do not want to keep these local files.
- `run()` creates `vars.h5` and `obs.duckdb` before annotation. Use `cleanup_artifacts=True` if you do not want to keep these local files.
- If artifact building or uploading fails, `run()` will raise an error by default. Set `require_artifacts=False` to skip artifacts and continue with annotation only.
53 changes: 53 additions & 0 deletions tests/test_cytetype_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,59 @@ def test_cytetype_run_auto_uploads_artifacts(
}


@patch("cytetype.main.wait_for_completion")
@patch("cytetype.main.submit_annotation_job")
def test_cytetype_run_artifact_failure_raises_by_default(
    mock_submit: MagicMock,
    mock_wait: MagicMock,
    mock_adata: anndata.AnnData,
    mock_api_response: dict[str, Any],
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With the default require_artifacts=True, an artifact build error propagates out of run()."""
    mock_submit.return_value = "job_no_artifacts"
    mock_wait.return_value = mock_api_response

    # Make the vars.h5 build step fail deterministically.
    failing_builder = MagicMock(side_effect=RuntimeError("disk full"))
    monkeypatch.setattr("cytetype.main.save_features_matrix", failing_builder)

    annotator = CyteType(mock_adata, group_key="leiden")
    with pytest.raises(RuntimeError, match="disk full"):
        annotator.run(study_context="Test")


@patch("cytetype.main.wait_for_completion")
@patch("cytetype.main.submit_annotation_job")
def test_cytetype_run_artifact_failure_continues_when_not_required(
    mock_submit: MagicMock,
    mock_wait: MagicMock,
    mock_adata: anndata.AnnData,
    mock_api_response: dict[str, Any],
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With require_artifacts=False, run() submits the job without uploaded_files."""
    mock_submit.return_value = "job_no_artifacts"
    mock_wait.return_value = mock_api_response

    # Make the vars.h5 build step fail deterministically.
    failing_builder = MagicMock(side_effect=RuntimeError("disk full"))
    monkeypatch.setattr("cytetype.main.save_features_matrix", failing_builder)

    annotator = CyteType(mock_adata, group_key="leiden")
    out = annotator.run(study_context="Test", require_artifacts=False)

    # The annotation job must still run to completion despite the artifact failure.
    assert out is not None
    assert mock_submit.called

    # Artifacts never uploaded, so the submitted payload must omit the file references.
    submitted_payload = mock_submit.call_args.args[2]
    assert "uploaded_files" not in submitted_payload


@patch("cytetype.main.wait_for_completion")
@patch("cytetype.main.submit_annotation_job")
def test_cytetype_run_cleanup_artifacts(
Expand Down