diff --git a/.gitignore b/.gitignore index cae9b3e..225ce03 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__/ .venv/ .env eo_api.egg-info/ -data/downloads \ No newline at end of file +data/downloads +docs/ diff --git a/README.md b/README.md index ce64dec..224f48b 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,24 @@ Docs: http://127.0.0.1:8000/docs +Workflow (single payload contract): + +`POST /workflows/dhis2-datavalue-set` + +```json +{ + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-05-31", + "org_unit_level": 3, + "data_element": "DE_UID", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "dry_run": true +} +``` + OGC API http://127.0.0.1:8000/ogcapi diff --git a/data/workflows/dhis2_datavalue_set.yaml b/data/workflows/dhis2_datavalue_set.yaml new file mode 100644 index 0000000..51eb8a3 --- /dev/null +++ b/data/workflows/dhis2_datavalue_set.yaml @@ -0,0 +1,71 @@ +workflow_id: dhis2_datavalue_set_v1 +version: 1 +publication: + publishable: true + strategy: on_success + intent: feature_collection + exposure: ogc + inputs: + features: + from_step: get_features + output: features + records: + from_step: spatial_agg + output: records + output_file: + from_step: build_dhis2_payload + output: output_file +outputs: + bbox: + from_step: get_features + output: bbox + features: + from_step: get_features + output: features + records: + from_step: spatial_agg + output: records + data_value_set: + from_step: build_dhis2_payload + output: data_value_set + output_file: + from_step: build_dhis2_payload + output: output_file +steps: + - id: get_features + component: feature_source + version: v1 + - id: download + component: download_dataset + version: v1 + inputs: + bbox: + from_step: get_features + output: bbox + - id: temporal_agg + component: temporal_aggregation + version: v1 + inputs: + bbox: + from_step: get_features + output: bbox + - id: spatial_agg + 
component: spatial_aggregation + version: v1 + inputs: + bbox: + from_step: get_features + output: bbox + features: + from_step: get_features + output: features + temporal_dataset: + from_step: temporal_agg + output: temporal_dataset + - id: build_dhis2_payload + component: build_datavalueset + version: v1 + inputs: + records: + from_step: spatial_agg + output: records diff --git a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml new file mode 100644 index 0000000..88c932d --- /dev/null +++ b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml @@ -0,0 +1,49 @@ +workflow_id: dhis2_datavalue_set_without_temporal_aggregation_v1 +version: 1 +publication: + publishable: false + exposure: registry_only +outputs: + bbox: + from_step: get_features + output: bbox + features: + from_step: get_features + output: features + records: + from_step: spatial_agg + output: records + data_value_set: + from_step: build_dhis2_payload + output: data_value_set + output_file: + from_step: build_dhis2_payload + output: output_file +steps: + - id: get_features + component: feature_source + version: v1 + - id: download + component: download_dataset + version: v1 + inputs: + bbox: + from_step: get_features + output: bbox + - id: spatial_agg + component: spatial_aggregation + version: v1 + inputs: + bbox: + from_step: get_features + output: bbox + features: + from_step: get_features + output: features + - id: build_dhis2_payload + component: build_datavalueset + version: v1 + inputs: + records: + from_step: spatial_agg + output: records diff --git a/docs/internal/OGC_WORKFLOW_DEMO_REFERENCE.md b/docs/internal/OGC_WORKFLOW_DEMO_REFERENCE.md new file mode 100644 index 0000000..f883607 --- /dev/null +++ b/docs/internal/OGC_WORKFLOW_DEMO_REFERENCE.md @@ -0,0 +1,352 @@ +# OGC Workflow Demo Reference + +## Purpose + +This document is a stable operator/demo reference for the current branch 
shape. + +It answers: + +1. how to fetch CHIRPS3 and WorldPop data +2. how to run workflow-backed executions +3. how to inspect jobs and results +4. what the difference is between processes and jobs +5. how to browse published collections +6. why both `/ogcapi` and `/pygeoapi` currently exist + +## Current Route Model + +The current runtime split is: + +1. `/workflows` = native workflow control plane +2. `/ogcapi` = native OGC API - Processes / Jobs adapter +3. `/pygeoapi` = mounted collection/items browse shell for published resources +4. `/publications` = publication registry +5. `/raster` = raster capabilities / rendering layer + +Important current note: + +Some older docs still refer to `/ogcapi/collections`. In the current codebase, collection browsing is under `/pygeoapi/collections`. + +## Dataset IDs Used In Examples + +1. `chirps3_precipitation_daily` +2. `worldpop_population_yearly` + +WorldPop usually needs `country_code`. + +## A. Direct Dataset Fetch Examples + +These examples fetch/cache source data directly. They do not create workflow jobs. + +### A1. Fetch CHIRPS3 + +```bash +curl -X POST http://localhost:8000/components/download-dataset \ + -H 'Content-Type: application/json' \ + -d '{ + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "overwrite": false + }' +``` + +### A2. Fetch WorldPop + +```bash +curl -X POST http://localhost:8000/components/download-dataset \ + -H 'Content-Type: application/json' \ + -d '{ + "dataset_id": "worldpop_population_yearly", + "start": "2020", + "end": "2020", + "country_code": "ETH", + "overwrite": false + }' +``` + +### A3. Alternative Legacy Download Routes + +These older routes still exist, but they are not job-backed and are less aligned with the newer component/workflow model. 
+ +```bash +curl "http://localhost:8000/manage/chirps3_precipitation_daily/download?start=2024-01-01&end=2024-01-31" +curl "http://localhost:8000/manage/worldpop_population_yearly/download?start=2020&end=2020" +``` + +## B. Workflow Execution Example + +This is the canonical native workflow execution shape: + +Important current note: + +`publishable` in workflow policy does not automatically publish every successful run. +The caller must explicitly opt in with `"publish": true`, and the workflow must also allow publication. + +```bash +curl -X POST http://localhost:8000/workflows/dhis2-datavalue-set \ + -H 'Content-Type: application/json' \ + -d '{ + "request": { + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "publish": true, + "dry_run": true + } + }' +``` + +If this succeeds, the response includes a `run_id`. That `run_id` is the native workflow job ID. + +Important current note: + +This path depends on DHIS2-backed feature resolution. If DHIS2 is unavailable or misconfigured, this request can fail with `503`. + +## C. Processes vs Jobs + +A `process` is a reusable executable capability. + +A `job` is one concrete execution of a process. + +In the current OGC layer, there is one exposed generic process: + +1. `generic-dhis2-workflow` + +So the model is: + +1. process = "what can this server execute?" +2. job = "one recorded run of that execution capability" + +The native workflow layer also persists jobs, but it does not use OGC process terminology in its route naming. + +## D. How To Inspect Jobs And Results + +### D1. 
Native Workflow View + +List jobs: + +```bash +curl http://localhost:8000/workflows/jobs +``` + +Get one job: + +```bash +curl http://localhost:8000/workflows/jobs/{job_id} +``` + +Get persisted result payload: + +```bash +curl http://localhost:8000/workflows/jobs/{job_id}/result +``` + +Get persisted trace/log payload: + +```bash +curl http://localhost:8000/workflows/jobs/{job_id}/trace +``` + +Delete one job and cascade its owned artifacts: + +```bash +curl -X DELETE http://localhost:8000/workflows/jobs/{job_id} +``` + +### D2. OGC View Over The Same Execution Layer + +List OGC processes: + +```bash +curl http://localhost:8000/ogcapi/processes +``` + +Describe the exposed generic process: + +```bash +curl http://localhost:8000/ogcapi/processes/generic-dhis2-workflow +``` + +List OGC jobs: + +```bash +curl http://localhost:8000/ogcapi/jobs +``` + +Get one OGC job: + +```bash +curl http://localhost:8000/ogcapi/jobs/{job_id} +``` + +Get OGC job results: + +```bash +curl http://localhost:8000/ogcapi/jobs/{job_id}/results +``` + +Get OGC job results plus extra native metadata: + +```bash +curl "http://localhost:8000/ogcapi/jobs/{job_id}/results?extended=true" +``` + +Download the native output artifact if available: + +```bash +curl -OJ http://localhost:8000/ogcapi/jobs/{job_id}/download +``` + +## E. How To Browse Published Collections + +### E1. Inspect Publication Registry + +List publications: + +```bash +curl http://localhost:8000/publications +``` + +Get CHIRPS3 source publication: + +```bash +curl http://localhost:8000/publications/dataset-chirps3_precipitation_daily +``` + +Get WorldPop source publication: + +```bash +curl http://localhost:8000/publications/dataset-worldpop_population_yearly +``` + +### E2. 
Browse Published Collections + +List collections: + +```bash +curl http://localhost:8000/pygeoapi/collections +curl "http://localhost:8000/pygeoapi/collections?f=html" +``` + +Open CHIRPS3 collection: + +```bash +curl http://localhost:8000/pygeoapi/collections/chirps3_precipitation_daily +``` + +Open WorldPop collection: + +```bash +curl http://localhost:8000/pygeoapi/collections/worldpop_population_yearly +``` + +### E3. Raster-Specific Checks + +These two source datasets are primarily coverage/raster resources, so their raster capabilities are also important: + +```bash +curl http://localhost:8000/raster/chirps3_precipitation_daily/capabilities +curl http://localhost:8000/raster/worldpop_population_yearly/capabilities +``` + +### E4. Derived Workflow Output Collections + +If a publishable workflow run succeeds, the system can register a derived publication: + +1. the workflow result is persisted as a job +2. a `PublishedResource` may be created +3. that publication may become visible under `/pygeoapi/collections/{collection_id}` + +The easiest path is: + +1. run a publishable workflow +2. open `/workflows/jobs/{job_id}` +3. follow its `collection` link if present + +## F. Why Both `/ogcapi` And `/pygeoapi` Exist + +Current intent: + +1. `/ogcapi` is the native canonical OGC process/job surface +2. `/pygeoapi` is the current generic browse shell for collection/items publication + +This means: + +1. execution semantics stay FastAPI-owned +2. publication truth stays FastAPI-owned through `PublishedResource` +3. generic collection/items browsing is still delegated to `pygeoapi` where it adds value + +This is a pragmatic transition state, not necessarily the final public shape. + +## G. Publication Bridge Lifecycle + +The publication bridge is file-backed and explicit. + +Current lifecycle: + +1. a source dataset or workflow output becomes eligible for publication +2. the backend registers publication truth as a `PublishedResource` JSON record +3. 
the pygeoapi projection layer reads those publication records +4. the system generates pygeoapi YAML/OpenAPI documents from that publication state +5. the mounted `/pygeoapi` app serves collections/items from that generated configuration + +In short: + +```text +workflow/source dataset + -> publication registration + -> PublishedResource JSON + -> generated pygeoapi YAML + -> /pygeoapi collection +``` + +Current storage/projection locations: + +1. publication state: + - `data/downloads/published_resources/*.json` +2. generated pygeoapi projection: + - `data/downloads/pygeoapi/pygeoapi-config.generated.yml` + - `data/downloads/pygeoapi/pygeoapi-openapi.generated.yml` + +Important architectural point: + +The JSON publication record is the source of truth. +The pygeoapi YAML is generated serving configuration, not the primary publication database. + +## H. Can The System Eventually Live With Just One? + +Yes. + +The accepted architectural direction is: + +1. `/ogcapi` remains canonical +2. `/pygeoapi` is secondary and potentially transitional + +That means the long-term convergence options are: + +1. move collection/resource routes into native FastAPI under `/ogcapi` +2. or keep pygeoapi as an internal implementation component while exposing one canonical `/ogcapi` surface + +For now, keeping both is reasonable because it preserves a clean native process/job model while still reusing pygeoapi's browse capabilities. + +## I. Short Demo Sequence + +Good operator/demo flow: + +1. `GET /workflows` +2. `GET /ogcapi/processes` +3. `GET /publications` +4. `GET /pygeoapi/collections?f=html` +5. `GET /pygeoapi/collections/chirps3_precipitation_daily` +6. `GET /raster/chirps3_precipitation_daily/capabilities` +7. run one workflow if DHIS2 is available +8. `GET /workflows/jobs/{job_id}` +9. `GET /ogcapi/jobs/{job_id}/results` +10. 
browse the derived collection if the run published one diff --git a/docs/internal/PYGEOAPI_MAINTAINABILITY_ARGUMENT.md b/docs/internal/PYGEOAPI_MAINTAINABILITY_ARGUMENT.md new file mode 100644 index 0000000..72b6433 --- /dev/null +++ b/docs/internal/PYGEOAPI_MAINTAINABILITY_ARGUMENT.md @@ -0,0 +1,138 @@ +# Pygeoapi Maintainability Argument + +## Purpose + +This note captures the justification for the current publication bridge architecture, especially in response to the critique that an earlier integration used only a few lines of code via: + +```python +from pygeoapi.starlette_app import APP as pygeoapi_app +from pygeoapi.starlette_app import CONFIG +``` + +The short version is: + +- the old approach looked simpler because it leaned on pygeoapi runtime globals +- the current approach is more maintainable because EO API now owns publication truth explicitly +- pygeoapi is still used for standard OGC serving, but it is no longer the owner of backend publication state + +## The Real Question + +The architectural question is not: + +- "Should we use the standard pygeoapi implementation or build our own?" + +The real question is: + +- "Which part of the system should own publication state and execution-linked publication lifecycle?" + +The current branch does **not** replace pygeoapi as a standards-oriented serving component. + +Instead, it moves ownership of publication truth into the EO API backend and uses pygeoapi as one serving projection of that truth. + +## What The Thin Old Approach Did Well + +The old approach had a real strength: + +- it made pygeoapi integration look very small and direct + +That is useful for: + +- quick demos +- minimal bootstrapping +- proving that pygeoapi can be mounted and served successfully + +For a spike or early prototype, this is a good move. + +## What The Thin Old Approach Hid + +The small wrapper did not remove complexity. +It mostly relocated complexity into pygeoapi runtime globals and config handling. 
+ +That creates ambiguity around: + +1. where publication truth actually lives +2. how dynamic publication updates should happen +3. how job/output lifecycle links to collections +4. how cleanup and deletion should remove published resources +5. how source and derived resources should share one publication model +6. how nightly refreshes should update collection extents and metadata + +So the old approach was shorter, but it encouraged a hidden ownership model: + +- pygeoapi `APP` and `CONFIG` start to feel like the publication database + +That is the maintainability problem. + +## Why The Current Approach Is Cleaner + +The current branch introduces an explicit publication bridge: + +```text +workflow/source dataset + -> publication registration + -> PublishedResource JSON + -> generated pygeoapi YAML + -> /pygeoapi collection +``` + +This gives each layer one responsibility: + +1. EO API backend owns execution truth +2. EO API backend owns publication truth through `PublishedResource` +3. generated pygeoapi config is a serving projection of that truth +4. pygeoapi remains the standards-oriented collection/items serving layer + +This is cleaner because: + +- publication state is explicit +- job-to-publication linkage is explicit +- cleanup semantics are explicit +- source and derived resources share one model +- pygeoapi is replaceable because it consumes projection rather than owning truth + +## Why This Is More Maintainable Long Term + +Long-term maintainability improves because the current architecture: + +1. avoids treating pygeoapi runtime globals as the domain model +2. gives EO API a backend-owned publication record that can evolve independently +3. supports dynamic workflow outputs without hardcoding unknown future collections +4. allows retention/cleanup to remove publications coherently +5. allows metadata refresh and extent updates to happen in backend-owned publication state first +6. 
reduces framework lock-in because pygeoapi becomes an adapter, not the publication brain + +In short: + +- the old approach optimized for a short integration seam +- the current approach optimizes for explicit ownership and lifecycle clarity + +That is the better long-term tradeoff. + +## What We Still Reuse From Pygeoapi + +This architecture does **not** argue against pygeoapi. + +We still rely on pygeoapi for: + +1. standards-oriented collection/items serving +2. provider-backed feature/coverage publication +3. generic browse behavior +4. HTML/JSON OGC browse surfaces while that still adds value + +So the current model is not: + +- "replace the standard implementation" + +It is: + +- "keep the standard implementation for serving, but stop using it as the owner of backend publication state" + +## Short Defense Statement + +Use this if you need a short explanation: + +> The earlier pygeoapi integration was shorter because it outsourced state handling to pygeoapi’s app/config layer. The current approach is more maintainable because EO API now owns workflow-aware publication truth explicitly, while pygeoapi remains the standard serving layer fed by generated configuration. + +## Even Shorter Version + +> We still use pygeoapi for standard OGC serving. What changed is that publication truth now lives in EO API instead of pygeoapi runtime globals. That is why the current design is more maintainable. diff --git a/docs/internal/SESSION_HANDOFF_2026-03-18.md b/docs/internal/SESSION_HANDOFF_2026-03-18.md new file mode 100644 index 0000000..96421a4 --- /dev/null +++ b/docs/internal/SESSION_HANDOFF_2026-03-18.md @@ -0,0 +1,180 @@ +# Session Handoff - 2026-03-18 + +## Stop Point + +This is a clean demo checkpoint. + +The system now has: + +1. native workflow execution and job persistence as backend truth +2. publication registration via `PublishedResource` +3. dynamic OGC collection/detail/items routes backed directly by live publication state +4. 
a pluggable analytics viewer mounted outside the OGC core +5. OGC HTML items pages that can switch between `Browse` and embedded `Analytics` modes + +The important operational improvement is that collection publish/delete visibility no longer requires restart. + +--- + +## What Changed This Session + +### 1. Dynamic OGC collection surface + +Implemented in: + +1. [src/eo_api/ogc/routes.py](../../src/eo_api/ogc/routes.py) + +Behavior: + +1. `/ogcapi/collections` +2. `/ogcapi/collections/{collection_id}` +3. `/ogcapi/collections/{collection_id}/items` + +are now served natively from live publication truth instead of relying on startup-loaded `pygeoapi` collection state. + +Result: + +1. new derived publications appear immediately +2. deleted workflow-output collections disappear immediately +3. no restart is needed for collection visibility changes + +### 2. OGC HTML became first-class + +The OGC HTML pages are now intentionally controlled rather than inherited utility pages. + +Current state: + +1. collections page uses a scalable list/table layout +2. collection pages have clearer representation labeling +3. collection items pages have explicit OGC navigation and back links +4. items pages support period filtering in HTML +5. items pages now have two modes: + - `Browse` + - `Analytics` + +### 3. Analytics viewer remained pluggable but is now embedded + +Implemented in: + +1. [src/eo_api/analytics_viewer/routes.py](../../src/eo_api/analytics_viewer/routes.py) + +Current model: + +1. `/analytics/...` still exists as the pluggable analytics module +2. the OGC items HTML page can embed that module in-place +3. this keeps the implementation swappable while avoiding a detached user journey + +### 4. Published workflow output representation improved + +For derived feature collections: + +1. published properties were cleaned to focus on: + - `org_unit` + - `org_unit_name` + - `period` + - `value` +2. 
precipitation views now use a blue value ramp +3. OGC collection tables distinguish: + - source dataset + - native workflow output + - OGC representation type + +### 5. Workflow runtime contracts were tightened + +Implemented in: + +1. [src/eo_api/workflows/services/engine.py](../../src/eo_api/workflows/services/engine.py) +2. [src/eo_api/components/services.py](../../src/eo_api/components/services.py) + +Behavior: + +1. workflow step handoff uses typed artifacts instead of a loose context dict +2. temporal aggregation can no-op/pass through when source period already matches requested period +3. orchestration wires artifacts; components own pass-through decisions + +### 6. Retention cleanup exists + +Implemented in: + +1. [src/eo_api/workflows/routes.py](../../src/eo_api/workflows/routes.py) +2. [src/eo_api/workflows/services/job_store.py](../../src/eo_api/workflows/services/job_store.py) + +Endpoint: + +1. `POST /workflows/jobs/cleanup` + +Policy knobs: + +1. `dry_run` +2. `keep_latest` +3. `older_than_hours` + +Cleanup cascades through: + +1. job record +2. run trace +3. native workflow output +4. derived publication record +5. derived publication asset + +--- + +## Current UX Shape + +The intended human-facing entry path is now: + +1. `/ogcapi/collections?f=html` +2. select a collection +3. open `/ogcapi/collections/{id}/items?f=html` +4. switch between: + - `Browse` + - `Analytics` + +This keeps the user inside the OGC page flow while still using the pluggable analytics module underneath. + +--- + +## Standards Boundary + +The current discipline is: + +1. OGC JSON/resource shape stays standards-oriented +2. HTML is allowed to be product-friendly +3. current `period=` handling is an HTML convenience +4. long-term machine filtering should move toward CQL2 rather than grow more ad hoc parameters + +--- + +## Verification State + +At stop: + +1. 
`uv run pytest -q tests/test_workflows.py` passes +2. `make lint` passes + +--- + +## Recommended Next Step + +The next meaningful architectural step is: + +1. strengthen error handling and response envelopes across the workflow/OGC surfaces + +Followed by: + +1. decide whether period filtering should begin moving toward CQL2-style handling for machine clients +2. continue tightening component contracts only where real ambiguity remains + +--- + +## Demo Notes + +Good demo flow: + +1. execute a publishable workflow +2. show `/workflows/jobs/{job_id}` +3. show `/ogcapi/collections` +4. open the derived workflow-output collection +5. open items HTML +6. switch between `Browse` and `Analytics` +7. optionally delete the job and refresh collections to show immediate disappearance without restart diff --git a/docs/workflow-orchestration.md b/docs/workflow-orchestration.md new file mode 100644 index 0000000..279f0c8 --- /dev/null +++ b/docs/workflow-orchestration.md @@ -0,0 +1,609 @@ +# Workflow Orchestration Design (Single Endpoint, Componentized Runtime) + +## Purpose + +This document describes the implemented approach for generating a DHIS2 DataValueSet from gridded EO datasets through one workflow endpoint and reusable components. + +It documents: + +1. What has been achieved. +2. The architecture and execution flow. +3. Public API contract and normalization rules. +4. Runtime metadata, observability, and error handling. +5. Current componentization strategy and extension path. + +--- + +## What Is Implemented + +The current implementation provides: + +1. One canonical workflow execution endpoint: + - `POST /workflows/dhis2-datavalue-set` +2. One public wrapped request payload contract (`{"request": WorkflowRequest}`). +3. Internal normalization into a canonical execution model (`WorkflowExecuteRequest`). +4. 
A fixed generic orchestration chain with exactly 5 components: + - `feature_source` + - `download_dataset` + - `temporal_aggregation` + - `spatial_aggregation` + - `build_datavalueset` +5. Per-component runtime instrumentation (`WorkflowRuntime`) with timing, status, and summarized inputs/outputs. +6. Run-log persistence for both success and failure. +7. Structured error responses, including upstream connectivity failures. +8. Optional inclusion of detailed component run traces in API responses. +9. Discoverable standalone component endpoints under `/components` for direct execution and future orchestrator integration. +10. Declarative workflow assembly via YAML (`data/workflows/dhis2_datavalue_set.yaml`) executed by the workflow engine. +11. Registry-driven component dispatch in engine (no component-specific `if/elif` chain). +12. Step-level YAML config support with strict validation and `$request.` interpolation. +13. Stable workflow error contract with `error_code` and `failed_component_version`. 
+ +--- + +## Final API Surface + +### Primary Workflow Endpoint + +- `POST /workflows/dhis2-datavalue-set` +- `POST /workflows/execute` (inline assembly execution: post `workflow.steps` + `request` payload) +- `POST /workflows/validate` (validate discovered/inline workflow + request compatibility without execution) + +### Workflow Discovery Endpoint + +- `GET /workflows` (discovered workflow catalog from `data/workflows/*.yaml` with `workflow_id`, `version`, and component chain) + +### Component Discovery/Execution Endpoints + +- `GET /components` (public catalog; hides internal orchestration-only config schema) +- `GET /components?include_internal=true` (internal/debug catalog including component config schema) +- `POST /components/feature-source` +- `POST /components/download-dataset` +- `POST /components/temporal-aggregation` +- `POST /components/spatial-aggregation` +- `POST /components/build-datavalue-set` + +`/components/*` endpoints are for reusable task-level execution. The workflow endpoint remains the single end-to-end API for generating DHIS2 DataValueSet output. + +--- + +## Public Workflow Request Contract + +The workflow endpoint accepts one wrapped payload shape: + +```json +{ + "request": { + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-05-31", + "org_unit_level": 2, + "data_element": "DE_UID", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "include_component_run_details": false + } +} +``` + +Important fields: + +1. `dataset_id` (required) +2. `workflow_id` (optional, default `dhis2_datavalue_set_v1`, must exist in discovered workflow YAMLs) +3. Time window (required as one of): + - `start_date` + `end_date`, or + - `start_year` + `end_year` +4. Spatial scope (required as one of): + - `org_unit_level`, or + - `org_unit_ids` +5. `data_element` (required) +6. 
`include_component_run_details` (optional, default `false`) + +Notes: + +1. `feature_id_property` defaults to `"id"` and controls which feature property maps to DHIS2 org unit ID in spatial aggregation/DataValueSet construction. +2. `country_code` is accepted in request and passed to dataset downloaders (instead of forcing `.env` only). + +--- + +## Normalization and Mapping Approach + +File: `src/eo_api/workflows/services/simple_mapper.py` + +Public wrapped payload (`request`) is normalized to internal `WorkflowExecuteRequest` with component-ready nested configs: + +1. `feature_source` config: + - `org_unit_level` -> `source_type=dhis2_level` + - `org_unit_ids` -> `source_type=dhis2_ids` +2. `temporal_aggregation` config: + - `target_period_type` from `temporal_resolution` + - `method` from `temporal_reducer` +3. `spatial_aggregation` config: + - `method` from `spatial_reducer` +4. `dhis2` config: + - `data_element_uid` from `data_element` + +Time normalization depends on dataset registry metadata (`period_type`): + +1. Yearly datasets: + - normalize to `YYYY` +2. Hourly/Daily/Monthly datasets: + - normalize to month windows (`YYYY-MM`) for downloader compatibility +3. Fallback: + - pass date strings as provided + +This mapping keeps the public contract simple while preserving an extensible internal orchestration model. + +--- + +## Architecture + +### API Routing Layer + +Files: + +1. `src/eo_api/workflows/routes.py` +2. `src/eo_api/components/routes.py` +3. `src/eo_api/main.py` + +Responsibilities: + +1. Expose one workflow endpoint and reusable component endpoints. +2. Keep payload and response models explicit with Pydantic. +3. Delegate execution logic to service layers. + +### Workflow Engine Layer + +File: `src/eo_api/workflows/services/engine.py` + +Responsibilities: + +1. Validate dataset existence via registry. +2. Execute the 5 components in fixed order. +3. Collect runtime telemetry for each component. +4. 
Persist run logs on both success and error paths. +5. Return workflow result with optional component-run detail inclusion. + +### Workflow Definition Layer + +Files: + +1. `src/eo_api/workflows/services/definitions.py` +2. `data/workflows/dhis2_datavalue_set.yaml` + +Responsibilities: + +1. Discover, load, and validate declarative workflow definitions from `data/workflows/*.yaml`. +2. Enforce supported component names. +3. Enforce supported component versions (currently `v1`) and validate per-step `config`. +4. Enforce terminal `build_datavalueset` step for this end-to-end workflow. +5. Enforce output-to-input compatibility across the full accumulated context (not just adjacent steps). +6. Drive runtime execution order from YAML through a registry-dispatch model. + +### Reusable Component Service Layer + +File: `src/eo_api/components/services.py` + +Responsibilities: + +1. Provide discoverable component catalog metadata. +2. Implement component functions used by: + - workflow engine, and + - `/components/*` task endpoints. +3. Reuse existing EO API capabilities (`downloader`, `accessor`, temporal/spatial aggregators, DataValueSet builder). + +--- + +## Layering Rationale + +The repository uses three layers with different responsibilities: + +1. `data_xxx` services (`data_manager`, `data_accessor`, `data_registry`) + - Core domain capabilities (download, load/subset, dataset metadata). + - No workflow-specific orchestration state required. +2. `components/` + - Thin reusable wrappers around core capabilities. + - Standardized component contracts for discovery (`GET /components`) and direct task execution. + - Runtime-friendly boundaries for future orchestrators (Prefect/Airflow). +3. `workflows/` + - End-to-end orchestration, request normalization, workflow selection, runtime tracing, and run-log persistence. + - Declarative assembly from `data/workflows/*.yaml`. + +Example: + +1. 
`download_dataset` workflow/component step delegates actual download work to `src/eo_api/data_manager/services/downloader.py`. +2. The wrapper adds orchestration-level concerns (preflight, context wiring, component runtime metadata) without duplicating downloader logic. + +This separation keeps core data services reusable and prevents workflow-specific concerns from leaking into the low-level data modules. + +--- + +## Component Chain (Exact Runtime Order) + +The workflow engine executes these components, no more and no less: + +1. `feature_source` +2. `download_dataset` +3. `temporal_aggregation` +4. `spatial_aggregation` +5. `build_datavalueset` + +Details: + +1. `feature_source` + - Resolves features from DHIS2 org unit level/ids or GeoJSON source config. + - Returns `FeatureCollection` and `bbox`. +2. `download_dataset` + - Runs connectivity preflight and downloads source data using `data_manager/services/downloader.py`. + - Supports request-supplied `country_code` where needed (e.g., WorldPop). +3. `temporal_aggregation` + - Loads/subsets data and performs period aggregation with selected reducer. +4. `spatial_aggregation` + - Aggregates gridded data over feature geometries. + - Produces normalized record rows (`org_unit`, `time`, `value`). +5. `build_datavalueset` + - Builds valid DHIS2 DataValueSet JSON from records. + - Serializes output to file and returns both payload and output path. + +Execution order and step metadata are currently defined in: + +- `data/workflows/dhis2_datavalue_set.yaml` + +Workflow step schema now supports: + +1. `component` +2. `version` (default `v1`) +3. `config` (default `{}`) + +### Remote Component Execution + +All five components support either local (default) or remote API execution. + +Common step config options: + +1. `execution_mode`: `local` or `remote` (default `local`) +2. `remote_url`: required when `execution_mode=remote` (expects component-compatible POST endpoint) +3. 
`remote_timeout_sec`: request timeout (default `30`) +4. `remote_retries`: number of attempts (default `1`) +5. `remote_retry_delay_sec`: delay between attempts in seconds (default `1`) +6. Component-specific options remain available (for example `overwrite`, `country_code`, `method`, `period_type`) + +Example: + +```yaml +steps: + - component: feature_source + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/feature-source" + - component: download_dataset + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/download-dataset" + - component: temporal_aggregation + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/temporal-aggregation" + - component: spatial_aggregation + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/spatial-aggregation" + - component: build_datavalueset + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/build-datavalue-set" +``` + +The default YAML remains the same 5-step sequence, but the engine reads it declaratively and dispatches components through a registry map. + +--- + +## Runtime Observability and Housekeeping + +File: `src/eo_api/workflows/services/runtime.py` + +For each component run, runtime captures: + +1. `component` +2. `status` +3. `started_at` +4. `ended_at` +5. `duration_ms` +6. `inputs` (summarized) +7. `outputs` (summarized) +8. `error` (on failure) + +Each workflow execution gets a unique `run_id`. + +### Response-Level Control of Run Details + +`include_component_run_details` controls response verbosity: + +1. If `false`: + - `component_runs: []` + - `component_run_details_included: false` + - `component_run_details_available: true` +2. 
If `true`: + - `component_runs` contains full per-component run records + - `component_run_details_included: true` + - `component_run_details_available: true` + +This keeps default responses clean while preserving debuggability when explicitly requested. + +--- + +## Run Logs + +File: `src/eo_api/workflows/services/run_logs.py` + +Workflow run logs are persisted under: + +- `/workflow_runs/` + +Persisted fields include: + +1. `run_id` +2. `status` (`completed` or `failed`) +3. normalized request payload +4. `component_runs` +5. output file path (when completed) +6. error details (when failed) +7. `error_code` (when failed) +8. `failed_component` (when failed) +9. `failed_component_version` (when failed) + +--- + +## Error Handling Strategy + +1. `422` for request/definition/config validation failures. +2. `404` when `dataset_id` does not exist in registry. +3. `503` for upstream connectivity failures: + - `error: "upstream_unreachable"` + - `error_code: "UPSTREAM_UNREACHABLE"` +4. `500` for other execution failures: + - `error: "workflow_execution_failed"` + - `error_code: "EXECUTION_FAILED"` (or other stable mapped codes) + +Failure responses include: + +1. `error_code` +2. `failed_component` +3. `failed_component_version` +4. `run_id` + +--- + +## Testing and Quality Gates + +Primary tests: + +- `tests/test_workflows.py` + +Coverage includes: + +1. Single workflow endpoint behavior. +2. Payload validation and normalization paths. +3. Exact 5-component orchestration order. +4. Component detail include/exclude behavior. +5. Upstream connectivity error mapping. +6. Component catalog endpoint expectations. +7. Declarative workflow definition loading and default step validation. +8. Engine execution follows the definition-provided step order. + +Quality gates: + +1. `make lint` (ruff, mypy, pyright) +2. `uv run pytest -q` + +--- + +## Why This Approach + +This design intentionally balances: + +1. 
Simplicity for clients: + - one end-to-end endpoint and one public payload. +2. Generic dataset support: + - dataset-specific behavior comes from registry metadata and downloader wiring, not endpoint proliferation. +3. Reusability: + - component services are discoverable and executable independently. +4. Future orchestration readiness: + - component boundaries and run metadata are explicit, making Prefect/Airflow integration straightforward. + +--- + +## Sequence Diagram + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant R as /workflows route + participant M as simple_mapper + participant E as engine + participant RT as WorkflowRuntime + participant CS as components.services + participant RL as run_logs + + C->>R: POST /workflows/dhis2-datavalue-set (flat payload) + R->>M: normalize_simple_request(payload) + M-->>R: WorkflowExecuteRequest + R->>E: execute_workflow(request, include_component_run_details) + + E->>RT: run(feature_source) + RT->>CS: feature_source_component(...) + CS-->>RT: features, bbox + + E->>RT: run(download_dataset) + RT->>CS: download_dataset_component(...) + CS-->>RT: status + + E->>RT: run(temporal_aggregation) + RT->>CS: temporal_aggregation_component(...) + CS-->>RT: aggregated dataset + + E->>RT: run(spatial_aggregation) + RT->>CS: spatial_aggregation_component(...) + CS-->>RT: records + + E->>RT: run(build_datavalueset) + RT->>CS: build_datavalueset_component(...) + CS-->>RT: data_value_set, output_file + + E->>RL: persist_run_log(completed|failed) + RL-->>E: run_log_file + E-->>R: WorkflowExecuteResponse + R-->>C: 200 response (trimmed or detailed component runs) +``` + +Failure path: + +1. Any component exception is captured by runtime on the failing step. +2. Engine persists failed run log with `run_id` and `failed_component`. +3. Engine returns structured error: + - `503` with `error=upstream_unreachable` for connectivity failures. + - `500` with `error=workflow_execution_failed` for all other failures. 
+ +--- + +## Manual E2E Testing + +Use the following commands to validate discovery, execution, and error behavior end-to-end. + +1. Start API: + +```bash +uvicorn eo_api.main:app --reload +``` + +2. Verify discovered workflows: + +```bash +curl -s http://127.0.0.1:8000/workflows | jq +``` + +3. Run default 5-step workflow: + +```bash +curl -s -X POST "http://127.0.0.1:8000/workflows/dhis2-datavalue-set" \ + -H "Content-Type: application/json" \ + -d '{ + "request": { + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-02-29", + "org_unit_level": 2, + "data_element": "DE_UID", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "include_component_run_details": true + } + }' | jq +``` + +Expected component order: + +1. `feature_source` +2. `download_dataset` +3. `temporal_aggregation` +4. `spatial_aggregation` +5. `build_datavalueset` + +4. Run 4-step workflow (without temporal aggregation): + +```bash +curl -s -X POST "http://127.0.0.1:8000/workflows/dhis2-datavalue-set" \ + -H "Content-Type: application/json" \ + -d '{ + "request": { + "workflow_id": "dhis2_datavalue_set_without_temporal_aggregation_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-02-29", + "org_unit_level": 2, + "data_element": "DE_UID", + "spatial_reducer": "mean", + "include_component_run_details": true + } + }' | jq +``` + +Expected component order: + +1. `feature_source` +2. `download_dataset` +3. `spatial_aggregation` +4. `build_datavalueset` + +5. 
Negative test for unknown workflow: + +```bash +curl -s -X POST "http://127.0.0.1:8000/workflows/dhis2-datavalue-set" \ + -H "Content-Type: application/json" \ + -d '{ + "request": { + "workflow_id": "does_not_exist", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 2, + "data_element": "DE_UID" + } + }' | jq +``` + +Expected result: `422` with allowed/discovered `workflow_id` values in error detail. + +6. Validate inline assembly (no execution): + +```bash +curl -s -X POST "http://127.0.0.1:8000/workflows/validate" \ + -H "Content-Type: application/json" \ + -d '{ + "workflow": { + "workflow_id": "adhoc_validate_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1", "config": {}}, + {"component": "download_dataset", "version": "v1", "config": {}}, + {"component": "spatial_aggregation", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}} + ] + }, + "request": { + "workflow_id": "adhoc_validate_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 2, + "data_element": "DE_UID" + } + }' | jq +``` + +Expected result: `200` with `valid: true`, resolved step configs, and no execution side effects. + +--- + +## Next Technical Step + +Prioritize orchestration-tool readiness (Prefect/Airflow wrappers over the current workflow service) before any OGC-first migration. + +Rationale: + +1. Delivers immediate operational value (scheduling, retries, long-running reliability) with minimal API churn. +2. Reuses existing componentization, dispatcher, and run metadata. +3. Avoids a high-risk architecture pivot while the current workflow contract is stabilizing. 
+ +For detailed option synthesis and implementation scope, see: + +- `docs/internal/roadmap_v2.md` (Post-V2 Decision Synthesis) diff --git a/pygeoapi-config.yml b/pygeoapi-config.yml new file mode 100644 index 0000000..893a817 --- /dev/null +++ b/pygeoapi-config.yml @@ -0,0 +1,40 @@ +server: + bind: + host: 0.0.0.0 + port: 5000 + url: http://127.0.0.1:8000/pygeoapi + mimetype: application/json; charset=UTF-8 + encoding: utf-8 + languages: + - en-US + limits: + default_items: 20 + max_items: 50 + map: + url: https://tile.openstreetmap.org/{z}/{x}/{y}.png + attribution: OpenStreetMap + +metadata: + identification: + title: + en: DHIS2 EO API + description: + en: OGC API facade for EO services + keywords: + en: + - EO + - DHIS2 + terms_of_service: https://dhis2.org + url: https://dhis2.org + license: + name: CC-BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + provider: + name: DHIS2 EO API + url: https://dhis2.org + contact: + name: DHIS2 + position: Team + email: climate@dhis2.org + +resources: {} diff --git a/pygeoapi-openapi.yml b/pygeoapi-openapi.yml new file mode 100644 index 0000000..d70eca1 --- /dev/null +++ b/pygeoapi-openapi.yml @@ -0,0 +1,362 @@ +components: + parameters: + bbox: + description: Only features that have a geometry that intersects the bounding + box are selected.The bounding box is provided as four or six numbers, depending + on whether the coordinate reference system includes a vertical axis (height + or depth). + explode: false + in: query + name: bbox + required: false + schema: + items: + type: number + maxItems: 6 + minItems: 4 + type: array + style: form + bbox-crs: + description: Indicates the coordinate reference system for the given bbox coordinates. + explode: false + in: query + name: bbox-crs + required: false + schema: + format: uri + type: string + style: form + bbox-crs-epsg: + description: Indicates the EPSG for the given bbox coordinates. 
+ explode: false + in: query + name: bbox-crs + required: false + schema: + default: 4326 + type: integer + style: form + crs: + description: Indicates the coordinate reference system for the results. + explode: false + in: query + name: crs + required: false + schema: + format: uri + type: string + style: form + f: + description: The optional f parameter indicates the output format which the + server shall provide as part of the response document. The default format + is GeoJSON. + explode: false + in: query + name: f + required: false + schema: + default: json + enum: + - json + - html + - jsonld + type: string + style: form + lang: + description: The optional lang parameter instructs the server return a response + in a certain language, if supported. If the language is not among the available + values, the Accept-Language header language will be used if it is supported. + If the header is missing, the default server language is used. Note that providers + may only support a single language (or often no language at all), that can + be different from the server language. Language strings can be written in + a complex (e.g. "fr-CA,fr;q=0.9,en-US;q=0.8,en;q=0.7"), simple (e.g. "de") + or locale-like (e.g. "de-CH" or "fr_BE") fashion. + in: query + name: lang + required: false + schema: + default: en-US + enum: + - en-US + type: string + offset: + description: The optional offset parameter indicates the index within the result + set from which the server shall begin presenting results in the response document. The + first element has an index of 0 (default). + explode: false + in: query + name: offset + required: false + schema: + default: 0 + minimum: 0 + type: integer + style: form + resourceId: + description: Configuration resource identifier + in: path + name: resourceId + required: true + schema: + type: string + skipGeometry: + description: This option can be used to skip response geometries for each feature. 
+ explode: false + in: query + name: skipGeometry + required: false + schema: + default: false + type: boolean + style: form + vendorSpecificParameters: + description: Additional "free-form" parameters that are not explicitly defined + in: query + name: vendorSpecificParameters + schema: + additionalProperties: true + type: object + style: form + responses: + '200': + description: successful operation + '204': + description: no content + Queryables: + content: + application/json: + schema: + $ref: '#/components/schemas/queryables' + description: successful queryables operation + default: + content: + application/json: + schema: + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/schemas/exception.yaml + description: Unexpected error + schemas: + queryable: + properties: + description: + description: a human-readable narrative describing the queryable + type: string + language: + default: en + description: the language used for the title and description + type: string + queryable: + description: the token that may be used in a CQL predicate + type: string + title: + description: a human readable title for the queryable + type: string + type: + description: the data type of the queryable + type: string + type-ref: + description: a reference to the formal definition of the type + format: url + type: string + required: + - queryable + - type + type: object + queryables: + properties: + queryables: + items: + $ref: '#/components/schemas/queryable' + type: array + required: + - queryables + type: object +info: + contact: + name: DHIS2 EO API + url: https://dhis2.org + x-ogc-serviceContact: + addresses: [] + emails: + - value: climate@dhis2.org + name: DHIS2 + position: Team + description: OGC API facade for EO services + license: + name: CC-BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + termsOfService: https://dhis2.org + title: DHIS2 EO API + version: 0.22.0 + x-keywords: + - EO + - DHIS2 +openapi: 3.0.2 +paths: + /: + get: + 
description: Landing page + operationId: getLandingPage + parameters: + - $ref: '#/components/parameters/f' + - $ref: '#/components/parameters/lang' + responses: + '200': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/LandingPage + '400': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/InvalidParameter + '500': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/ServerError + summary: Landing page + tags: + - server + /collections: + get: + description: Collections + operationId: getCollections + parameters: + - $ref: '#/components/parameters/f' + - $ref: '#/components/parameters/lang' + responses: + '200': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/LandingPage + '400': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/InvalidParameter + '500': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/ServerError + summary: Collections + tags: + - server + /conformance: + get: + description: API conformance definition + operationId: getConformanceDeclaration + parameters: + - $ref: '#/components/parameters/f' + - $ref: '#/components/parameters/lang' + responses: + '200': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/LandingPage + '400': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/InvalidParameter + '500': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/ServerError + summary: API conformance definition + tags: + - server + /jobs: + get: + description: Retrieve a list of jobs + 
operationId: getJobs + responses: + '200': + $ref: '#/components/responses/200' + '404': + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/responses/NotFound.yaml + default: + $ref: '#/components/responses/default' + summary: Retrieve jobs list + tags: + - jobs + /jobs/{jobId}: + delete: + description: Cancel / delete job + operationId: deleteJob + parameters: + - &id001 + description: job identifier + in: path + name: jobId + required: true + schema: + type: string + responses: + '204': + $ref: '#/components/responses/204' + '404': + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/responses/NotFound.yaml + default: + $ref: '#/components/responses/default' + summary: Cancel / delete job + tags: + - jobs + get: + description: Retrieve job details + operationId: getJob + parameters: + - *id001 + - $ref: '#/components/parameters/f' + responses: + '200': + $ref: '#/components/responses/200' + '404': + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/responses/NotFound.yaml + default: + $ref: '#/components/responses/default' + summary: Retrieve job details + tags: + - jobs + /jobs/{jobId}/results: + get: + description: Retrieve job results + operationId: getJobResults + parameters: + - *id001 + - $ref: '#/components/parameters/f' + responses: + '200': + $ref: '#/components/responses/200' + '404': + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/responses/NotFound.yaml + default: + $ref: '#/components/responses/default' + summary: Retrieve job results + tags: + - jobs + /openapi: + get: + description: This document + operationId: getOpenapi + parameters: + - $ref: '#/components/parameters/f' + - $ref: '#/components/parameters/lang' + - description: UI to render the OpenAPI document + explode: false + in: query + name: ui + required: false + schema: + default: swagger + enum: + - swagger + - redoc + type: string + style: form + responses: + '200': + $ref: '#/components/responses/200' + 
'400': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/InvalidParameter + default: + $ref: '#/components/responses/default' + summary: This document + tags: + - server +servers: +- description: OGC API facade for EO services + url: http://127.0.0.1:8000/ogcapi +tags: +- description: OGC API facade for EO services + externalDocs: + description: information + url: https://dhis2.org + name: server +- name: coverages +- name: edr +- name: records +- name: features +- name: maps +- name: processes +- name: jobs +- name: tiles +- name: stac + diff --git a/pyproject.toml b/pyproject.toml index d319dfd..7486d0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "earthkit-transforms==0.5.*", "metpy>=1.7,<2", "zarr==3.1.5", + "titiler-xarray>=1.2.0", ] [tool.ruff] diff --git a/src/eo_api/analytics_viewer/__init__.py b/src/eo_api/analytics_viewer/__init__.py new file mode 100644 index 0000000..f3546a5 --- /dev/null +++ b/src/eo_api/analytics_viewer/__init__.py @@ -0,0 +1,5 @@ +"""Time-aware analytics viewer module.""" + +from . 
import routes as routes + +__all__ = ["routes"] diff --git a/src/eo_api/analytics_viewer/routes.py b/src/eo_api/analytics_viewer/routes.py new file mode 100644 index 0000000..ec74515 --- /dev/null +++ b/src/eo_api/analytics_viewer/routes.py @@ -0,0 +1,539 @@ +"""Pluggable time-aware analytics viewer routes.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +from fastapi import APIRouter, HTTPException +from fastapi.responses import HTMLResponse + +from ..publications import services as publication_services +from ..publications.schemas import PublishedResourceKind +from ..shared.api_errors import api_error + +router = APIRouter() + + +@router.get("/publications/{resource_id}") +def get_publication_analytics_config(resource_id: str) -> dict[str, Any]: + """Return viewer configuration for one published resource.""" + resource = publication_services.get_published_resource(resource_id) + if resource is None: + raise HTTPException( + status_code=404, + detail=api_error( + error="published_resource_not_found", + error_code="PUBLISHED_RESOURCE_NOT_FOUND", + message=f"Unknown resource_id '{resource_id}'", + resource_id=resource_id, + ), + ) + if resource.kind != PublishedResourceKind.FEATURE_COLLECTION or resource.path is None: + raise HTTPException( + status_code=409, + detail=api_error( + error="analytics_target_invalid", + error_code="ANALYTICS_TARGET_INVALID", + message=f"Resource '{resource_id}' is not a feature collection viewer target", + resource_id=resource_id, + ), + ) + + data_url = _data_url_for_path(resource.path) + return { + "resource_id": resource.resource_id, + "title": resource.title, + "description": resource.description, + "dataset_id": resource.dataset_id, + "workflow_id": resource.workflow_id, + "job_id": resource.job_id, + "data_url": data_url, + "ogc_items_url": f"/pygeoapi/collections/{resource.resource_id}/items", + "links": { + "ogc_home": "/ogcapi", + "publication": 
f"/publications/{resource.resource_id}", + "collection": f"/pygeoapi/collections/{resource.resource_id}", + "items": f"/pygeoapi/collections/{resource.resource_id}/items", + }, + } + + +@router.get("/publications/{resource_id}/viewer", response_class=HTMLResponse) +def get_publication_analytics_viewer(resource_id: str, embed: bool = False) -> HTMLResponse: + """Return an interactive time-aware analytics viewer for one published resource.""" + config = get_publication_analytics_config(resource_id) + return HTMLResponse(_render_viewer_html(config, embed=embed)) + + +def _data_url_for_path(path_value: str) -> str: + path = Path(path_value).resolve() + downloads_root = publication_services.DOWNLOAD_DIR.resolve() + if downloads_root not in path.parents: + raise HTTPException( + status_code=409, + detail=api_error( + error="published_asset_invalid", + error_code="PUBLISHED_ASSET_INVALID", + message="Published resource path is outside mounted download storage", + ), + ) + relative_path = path.relative_to(downloads_root).as_posix() + return f"/data/{relative_path}" + + +def _render_viewer_html(config: dict[str, Any], *, embed: bool = False) -> str: + config_json = json.dumps(config) + shell_padding = "0" if embed else "28px 24px 40px" + shell_max_width = "100%" if embed else "1440px" + shell_margin = "0" if embed else "0 auto" + body_background = ( + "transparent" + if embed + else """ + radial-gradient(circle at top left, rgba(221, 141, 85, 0.18), transparent 32%), + radial-gradient(circle at right, rgba(65, 130, 180, 0.16), transparent 28%), + linear-gradient(180deg, #f8f4ee 0%, var(--bg) 100%) + """ + ) + hero_html = ( + "" + if embed + else f""" + +
Analytics Viewer
+

{config["title"]}

+

+ Time-aware choropleth view over the published workflow output. This viewer is intentionally isolated from the + OGC/publication core so it can be swapped or removed without changing the publication contract. +

+ """ + ) + return f""" + + + + + {config["title"]} Analytics Viewer + + + + + +
+ {hero_html} +
+
+
+ + + +
+
+
+
+ +
+
+ + +""" diff --git a/src/eo_api/components/__init__.py b/src/eo_api/components/__init__.py new file mode 100644 index 0000000..dc357fd --- /dev/null +++ b/src/eo_api/components/__init__.py @@ -0,0 +1,6 @@ +"""Reusable workflow components exposed as API capabilities.""" + +from . import routes as routes +from . import services as services + +__all__ = ["routes", "services"] diff --git a/src/eo_api/components/routes.py b/src/eo_api/components/routes.py new file mode 100644 index 0000000..feab9c4 --- /dev/null +++ b/src/eo_api/components/routes.py @@ -0,0 +1,136 @@ +"""Component discovery and execution endpoints.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +from fastapi import APIRouter, Query + +from ..data_manager.services.constants import BBOX +from . import services +from .schemas import ( + BuildDataValueSetRunRequest, + BuildDataValueSetRunResponse, + ComponentCatalogResponse, + DownloadDatasetRunRequest, + DownloadDatasetRunResponse, + FeatureSourceRunRequest, + FeatureSourceRunResponse, + SpatialAggregationRunRequest, + SpatialAggregationRunResponse, + TemporalAggregationRunRequest, + TemporalAggregationRunResponse, +) + +router = APIRouter() + + +def _to_jsonable_scalar(value: Any) -> Any: + """Convert numpy scalars/datetimes to JSON-safe native values.""" + if isinstance(value, np.datetime64): + return np.datetime_as_string(value, unit="s") + if isinstance(value, np.generic): + return value.item() + return value + + +def _json_safe_records(records: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Ensure record rows are JSON-serializable.""" + return [{key: _to_jsonable_scalar(value) for key, value in record.items()} for record in records] + + +@router.get("", response_model=ComponentCatalogResponse, response_model_exclude_none=True) +def list_components(include_internal: bool = Query(default=False)) -> ComponentCatalogResponse: + """List all discoverable reusable components.""" + return 
ComponentCatalogResponse(components=services.component_catalog(include_internal=include_internal)) + + +@router.post("/feature-source", response_model=FeatureSourceRunResponse) +def run_feature_source(payload: FeatureSourceRunRequest) -> FeatureSourceRunResponse: + """Resolve feature source to features and bbox.""" + features, bbox = services.feature_source_component(payload.feature_source) + return FeatureSourceRunResponse( + bbox=bbox, + feature_count=len(features["features"]), + features=features if payload.include_features else None, + ) + + +@router.post("/download-dataset", response_model=DownloadDatasetRunResponse) +def run_download_dataset(payload: DownloadDatasetRunRequest) -> DownloadDatasetRunResponse: + """Download dataset files for the selected period/scope.""" + dataset = services.require_dataset(payload.dataset_id) + bbox = payload.bbox or BBOX + services.download_dataset_component( + dataset=dataset, + start=payload.start, + end=payload.end, + overwrite=payload.overwrite, + country_code=payload.country_code, + bbox=bbox, + ) + return DownloadDatasetRunResponse( + status="completed", + dataset_id=payload.dataset_id, + start=payload.start, + end=payload.end, + ) + + +@router.post("/temporal-aggregation", response_model=TemporalAggregationRunResponse) +def run_temporal_aggregation(payload: TemporalAggregationRunRequest) -> TemporalAggregationRunResponse: + """Aggregate a dataset temporally.""" + dataset = services.require_dataset(payload.dataset_id) + ds = services.temporal_aggregation_component( + dataset=dataset, + start=payload.start, + end=payload.end, + bbox=payload.bbox, + target_period_type=payload.target_period_type, + method=payload.method, + ) + return TemporalAggregationRunResponse( + dataset_id=payload.dataset_id, + sizes={str(k): int(v) for k, v in ds.sizes.items()}, + dims=[str(d) for d in ds.dims], + ) + + +@router.post("/spatial-aggregation", response_model=SpatialAggregationRunResponse) +def run_spatial_aggregation(payload: 
SpatialAggregationRunRequest) -> SpatialAggregationRunResponse: + """Aggregate a dataset spatially to features.""" + dataset = services.require_dataset(payload.dataset_id) + features, bbox = services.feature_source_component(payload.feature_source) + records = services.spatial_aggregation_component( + dataset=dataset, + start=payload.start, + end=payload.end, + bbox=payload.bbox or bbox, + features=features, + method=payload.method, + feature_id_property=payload.feature_id_property, + ) + json_records = _json_safe_records(records) + return SpatialAggregationRunResponse( + dataset_id=payload.dataset_id, + record_count=len(json_records), + preview=json_records[: payload.max_preview_rows], + records=json_records if payload.include_records else None, + ) + + +@router.post("/build-datavalue-set", response_model=BuildDataValueSetRunResponse) +def run_build_datavalueset(payload: BuildDataValueSetRunRequest) -> BuildDataValueSetRunResponse: + """Build and serialize a DHIS2 DataValueSet from records.""" + data_value_set, output_file = services.build_datavalueset_component( + dataset_id=payload.dataset_id, + period_type=payload.period_type, + records=payload.records, + dhis2=payload.dhis2, + ) + return BuildDataValueSetRunResponse( + value_count=len(data_value_set.get("dataValues", [])), + output_file=output_file, + data_value_set=data_value_set, + ) diff --git a/src/eo_api/components/schemas.py b/src/eo_api/components/schemas.py new file mode 100644 index 0000000..5064b9c --- /dev/null +++ b/src/eo_api/components/schemas.py @@ -0,0 +1,184 @@ +"""Schemas for component discovery, manifests, and execution endpoints.""" + +from __future__ import annotations + +from typing import Any, Literal + +from pydantic import BaseModel, Field + +from ..workflows.schemas import ( + AggregationMethod, + Dhis2DataValueSetConfig, + FeatureSourceConfig, + PeriodType, +) + + +class ComponentEndpoint(BaseModel): + """HTTP endpoint metadata for a component.""" + + path: str + method: str + + 
+class ComponentDefinition(BaseModel): + """Component metadata for discovery.""" + + name: str + version: str = "v1" + description: str + inputs: list[str] + outputs: list[str] + workflow_inputs_required: list[str] = Field(default_factory=list) + workflow_inputs_optional: list[str] = Field(default_factory=list) + input_schema: dict[str, Any] = Field(default_factory=dict) + config_schema: dict[str, Any] | None = None + output_schema: dict[str, Any] = Field(default_factory=dict) + error_codes: list[str] = Field(default_factory=list) + endpoint: ComponentEndpoint + + +class ComponentRuntimeManifest(BaseModel): + """Runtime metadata for one registered component.""" + + type: Literal["python"] = "python" + supported_execution_modes: list[str] = Field(default_factory=lambda: ["local"]) + local_handler: str | None = None + remote_handler: str | None = None + remote_request_bindings: dict[str, Any] = Field(default_factory=dict) + remote_response_bindings: dict[str, str] = Field(default_factory=dict) + + +class ComponentManifest(BaseModel): + """Internal manifest used to register a component.""" + + name: str + version: str = "v1" + description: str + inputs: list[str] + outputs: list[str] + workflow_inputs_required: list[str] = Field(default_factory=list) + workflow_inputs_optional: list[str] = Field(default_factory=list) + input_schema: dict[str, Any] = Field(default_factory=dict) + config_schema: dict[str, Any] | None = None + output_schema: dict[str, Any] = Field(default_factory=dict) + error_codes: list[str] = Field(default_factory=list) + endpoint: ComponentEndpoint + runtime: ComponentRuntimeManifest + + def to_definition(self) -> ComponentDefinition: + """Project internal manifest to public discovery metadata.""" + return ComponentDefinition( + name=self.name, + version=self.version, + description=self.description, + inputs=self.inputs, + outputs=self.outputs, + workflow_inputs_required=self.workflow_inputs_required, + 
workflow_inputs_optional=self.workflow_inputs_optional, + input_schema=self.input_schema, + config_schema=self.config_schema, + output_schema=self.output_schema, + error_codes=self.error_codes, + endpoint=self.endpoint, + ) + + +class ComponentCatalogResponse(BaseModel): + """List of discoverable components.""" + + components: list[ComponentDefinition] + + +class FeatureSourceRunRequest(BaseModel): + """Execute feature source component.""" + + feature_source: FeatureSourceConfig + include_features: bool = False + + +class FeatureSourceRunResponse(BaseModel): + """Feature source component result.""" + + bbox: list[float] + feature_count: int + features: dict[str, Any] | None = None + + +class DownloadDatasetRunRequest(BaseModel): + """Execute dataset download component.""" + + dataset_id: str + start: str + end: str + overwrite: bool = False + country_code: str | None = None + bbox: list[float] | None = None + + +class DownloadDatasetRunResponse(BaseModel): + """Download component result.""" + + status: str + dataset_id: str + start: str + end: str + + +class TemporalAggregationRunRequest(BaseModel): + """Execute temporal aggregation component from cached dataset.""" + + dataset_id: str + start: str + end: str + target_period_type: PeriodType + method: AggregationMethod = AggregationMethod.SUM + bbox: list[float] | None = None + + +class TemporalAggregationRunResponse(BaseModel): + """Temporal aggregation result summary.""" + + dataset_id: str + sizes: dict[str, int] + dims: list[str] + + +class SpatialAggregationRunRequest(BaseModel): + """Execute spatial aggregation component from cached dataset.""" + + dataset_id: str + start: str + end: str + feature_source: FeatureSourceConfig + method: AggregationMethod = AggregationMethod.MEAN + bbox: list[float] | None = None + feature_id_property: str = "id" + include_records: bool = False + max_preview_rows: int = 20 + + +class SpatialAggregationRunResponse(BaseModel): + """Spatial aggregation result with sample rows.""" + 
+ dataset_id: str + record_count: int + preview: list[dict[str, Any]] + records: list[dict[str, Any]] | None = None + + +class BuildDataValueSetRunRequest(BaseModel): + """Execute build_datavalueset component directly from records.""" + + dataset_id: str + period_type: PeriodType + records: list[dict[str, Any]] = Field(default_factory=list) + dhis2: Dhis2DataValueSetConfig + + +class BuildDataValueSetRunResponse(BaseModel): + """Build_datavalueset component output.""" + + value_count: int + output_file: str + data_value_set: dict[str, Any] diff --git a/src/eo_api/components/services.py b/src/eo_api/components/services.py new file mode 100644 index 0000000..b9c803d --- /dev/null +++ b/src/eo_api/components/services.py @@ -0,0 +1,972 @@ +"""Component service implementations and discovery metadata.""" + +from __future__ import annotations + +import time +from collections.abc import Mapping +from dataclasses import dataclass +from typing import Any, Final + +import httpx +import xarray as xr +from fastapi import HTTPException +from pydantic import BaseModel, ConfigDict, ValidationError + +from ..data_accessor.services.accessor import get_data +from ..data_manager.services import downloader +from ..data_registry.services.datasets import get_dataset +from ..workflows.schemas import ( + AggregationMethod, + Dhis2DataValueSetConfig, + FeatureSourceConfig, + PeriodType, +) +from ..workflows.services.datavalueset import build_data_value_set +from ..workflows.services.features import resolve_features +from ..workflows.services.preflight import check_upstream_connectivity +from ..workflows.services.spatial import aggregate_to_features +from ..workflows.services.temporal import aggregate_temporal +from .schemas import ComponentDefinition, ComponentEndpoint, ComponentManifest, ComponentRuntimeManifest + +type WorkflowStepExecutor = Any + + +@dataclass(frozen=True) +class ComponentRuntimeDefinition: + """Runtime binding for one workflow-executable component version.""" + + 
component: str + version: str + executor: WorkflowStepExecutor + config_model: type[BaseModel] + + +_ERROR_CODES_V1: Final[list[str]] = [ + "INPUT_VALIDATION_FAILED", + "CONFIG_VALIDATION_FAILED", + "OUTPUT_VALIDATION_FAILED", + "UPSTREAM_UNREACHABLE", + "EXECUTION_FAILED", +] + +_COMPONENT_REGISTRY: Final[dict[str, ComponentManifest]] = { + "feature_source@v1": ComponentManifest( + name="feature_source", + version="v1", + description="Resolve feature source and compute bbox.", + inputs=["feature_source"], + outputs=["features", "bbox"], + workflow_inputs_required=[], + workflow_inputs_optional=[], + input_schema={ + "type": "object", + "properties": {"feature_source": {"type": "object"}}, + "required": ["feature_source"], + }, + config_schema={ + "type": "object", + "properties": { + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + "remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, + }, + "additionalProperties": False, + }, + output_schema={ + "type": "object", + "properties": { + "features": {"type": "object"}, + "bbox": {"type": "array", "items": {"type": "number"}, "minItems": 4, "maxItems": 4}, + }, + "required": ["features", "bbox"], + }, + error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/feature-source", method="POST"), + runtime=ComponentRuntimeManifest( + supported_execution_modes=["local", "remote"], + local_handler="workflow.feature_source", + remote_handler="workflow.feature_source", + remote_request_bindings={ + "feature_source": "$request.feature_source", + "include_features": True, + }, + remote_response_bindings={ + "features": "features", + "bbox": "bbox", + }, + ), + ), + "download_dataset@v1": ComponentManifest( + name="download_dataset", + version="v1", + description="Download dataset files for period and bbox.", + inputs=["dataset_id", "start", "end", "overwrite", 
"country_code", "bbox"], + outputs=["status"], + workflow_inputs_required=["bbox"], + workflow_inputs_optional=[], + input_schema={ + "type": "object", + "properties": { + "dataset_id": {"type": "string"}, + "start": {"type": "string"}, + "end": {"type": "string"}, + "overwrite": {"type": "boolean"}, + "country_code": {"type": ["string", "null"]}, + "bbox": {"type": "array", "items": {"type": "number"}, "minItems": 4, "maxItems": 4}, + }, + "required": ["dataset_id", "start", "end", "overwrite", "bbox"], + }, + config_schema={ + "type": "object", + "properties": { + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + "remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, + }, + "additionalProperties": False, + }, + output_schema={"type": "object", "properties": {"status": {"type": "string"}}}, + error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/download-dataset", method="POST"), + runtime=ComponentRuntimeManifest( + supported_execution_modes=["local", "remote"], + local_handler="workflow.download_dataset", + remote_handler="workflow.download_dataset", + remote_request_bindings={ + "dataset_id": "$request.dataset_id", + "start": "$request.start", + "end": "$request.end", + "overwrite": "$request.overwrite", + "country_code": "$request.country_code", + "bbox": "$resolved.bbox", + }, + remote_response_bindings={"status": "status"}, + ), + ), + "temporal_aggregation@v1": ComponentManifest( + name="temporal_aggregation", + version="v1", + description="Aggregate dataset over time dimension.", + inputs=["dataset_id", "start", "end", "target_period_type", "method", "bbox"], + outputs=["temporal_dataset"], + workflow_inputs_required=["bbox"], + workflow_inputs_optional=[], + input_schema={ + "type": "object", + "properties": { + "dataset_id": {"type": "string"}, + "start": {"type": "string"}, + "end": {"type": 
"string"}, + "target_period_type": {"type": "string"}, + "method": {"type": "string"}, + "bbox": {"type": ["array", "null"], "items": {"type": "number"}}, + }, + "required": ["dataset_id", "start", "end", "target_period_type", "method"], + }, + config_schema={ + "type": "object", + "properties": { + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + "remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, + }, + "additionalProperties": False, + }, + output_schema={"type": "object", "properties": {"temporal_dataset": {"type": "object"}}}, + error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/temporal-aggregation", method="POST"), + runtime=ComponentRuntimeManifest( + supported_execution_modes=["local"], + local_handler="workflow.temporal_aggregation", + remote_handler=None, + ), + ), + "spatial_aggregation@v1": ComponentManifest( + name="spatial_aggregation", + version="v1", + description="Aggregate gridded dataset to features.", + inputs=["dataset_id", "start", "end", "feature_source", "method"], + outputs=["records"], + workflow_inputs_required=["bbox", "features"], + workflow_inputs_optional=["temporal_dataset"], + input_schema={ + "type": "object", + "properties": { + "dataset_id": {"type": "string"}, + "start": {"type": "string"}, + "end": {"type": "string"}, + "feature_source": {"type": "object"}, + "method": {"type": "string"}, + }, + "required": ["dataset_id", "start", "end", "feature_source", "method"], + }, + config_schema={ + "type": "object", + "properties": { + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + "remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, + }, + "additionalProperties": False, + }, + output_schema={"type": "object", "properties": 
{"records": {"type": "array"}}}, + error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/spatial-aggregation", method="POST"), + runtime=ComponentRuntimeManifest( + supported_execution_modes=["local", "remote"], + local_handler="workflow.spatial_aggregation", + remote_handler="workflow.spatial_aggregation", + remote_request_bindings={ + "dataset_id": "$request.dataset_id", + "start": "$request.start", + "end": "$request.end", + "feature_source": "$request.feature_source", + "method": "$request.spatial_aggregation.method", + "bbox": "$resolved.bbox", + "feature_id_property": "$request.dhis2.org_unit_property", + "include_records": True, + }, + remote_response_bindings={"records": "records"}, + ), + ), + "build_datavalueset@v1": ComponentManifest( + name="build_datavalueset", + version="v1", + description="Build and serialize DHIS2 DataValueSet JSON.", + inputs=["dataset_id", "period_type", "records", "dhis2"], + outputs=["data_value_set", "output_file"], + workflow_inputs_required=["records"], + workflow_inputs_optional=[], + input_schema={ + "type": "object", + "properties": { + "dataset_id": {"type": "string"}, + "period_type": {"type": "string"}, + "records": {"type": "array"}, + "dhis2": {"type": "object"}, + }, + "required": ["dataset_id", "period_type", "records", "dhis2"], + }, + config_schema={ + "type": "object", + "properties": { + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + "remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, + }, + "additionalProperties": False, + }, + output_schema={ + "type": "object", + "properties": {"data_value_set": {"type": "object"}, "output_file": {"type": "string"}}, + "required": ["data_value_set", "output_file"], + }, + error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/build-datavalue-set", method="POST"), + 
runtime=ComponentRuntimeManifest( + supported_execution_modes=["local", "remote"], + local_handler="workflow.build_datavalueset", + remote_handler="workflow.build_datavalueset", + remote_request_bindings={ + "dataset_id": "$request.dataset_id", + "period_type": "$request.temporal_aggregation.target_period_type", + "records": "$resolved.records", + "dhis2": "$request.dhis2", + }, + remote_response_bindings={ + "data_value_set": "data_value_set", + "output_file": "output_file", + }, + ), + ), +} + + +def component_catalog(*, include_internal: bool = False) -> list[ComponentDefinition]: + """Return discoverable component definitions. + + By default, internal orchestration-only metadata (config_schema) is hidden. + """ + components = [manifest.to_definition() for manifest in _COMPONENT_REGISTRY.values()] + if include_internal: + return components + return [component.model_copy(update={"config_schema": None}) for component in components] + + +def component_registry() -> dict[str, ComponentManifest]: + """Return manifest registry entries keyed by component@version.""" + return dict(_COMPONENT_REGISTRY) + + +class _RemoteCapableStepConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + execution_mode: str = "local" + remote_url: str | None = None + remote_timeout_sec: float = 30.0 + remote_retries: int = 1 + remote_retry_delay_sec: float = 1.0 + + +def feature_source_component(config: FeatureSourceConfig) -> tuple[dict[str, Any], list[float]]: + """Run feature source component.""" + return resolve_features(config) + + +def download_dataset_component( + *, + dataset: dict[str, Any], + start: str, + end: str, + overwrite: bool, + country_code: str | None, + bbox: list[float], +) -> None: + """Run connectivity preflight and download dataset files.""" + check_upstream_connectivity(dataset) + downloader.download_dataset( + dataset=dataset, + start=start, + end=end, + overwrite=overwrite, + background_tasks=None, + country_code=country_code, + bbox=bbox, + ) + + 
def temporal_aggregation_component(
    *,
    dataset: dict[str, Any],
    start: str,
    end: str,
    bbox: list[float] | None,
    target_period_type: PeriodType,
    method: AggregationMethod,
) -> xr.Dataset:
    """Load the dataset for the period/bbox and aggregate it over time.

    When the dataset's declared period type already matches the requested
    target, the loaded dataset is returned untouched (no re-aggregation).
    """
    loaded = get_data(dataset=dataset, start=start, end=end, bbox=bbox)
    native_period = _dataset_period_type(dataset)
    if native_period != target_period_type:
        return aggregate_temporal(ds=loaded, period_type=target_period_type, method=method)
    return loaded


def spatial_aggregation_component(
    *,
    dataset: dict[str, Any],
    start: str,
    end: str,
    bbox: list[float] | None,
    features: dict[str, Any],
    method: AggregationMethod,
    feature_id_property: str,
    aggregated_dataset: xr.Dataset | None = None,
) -> list[dict[str, Any]]:
    """Aggregate a gridded dataset spatially onto the provided features.

    A temporally pre-aggregated dataset, when supplied, is used directly;
    otherwise the raw dataset is loaded for the requested period and bbox.
    """
    if aggregated_dataset is None:
        source_ds = get_data(dataset=dataset, start=start, end=end, bbox=bbox)
    else:
        source_ds = aggregated_dataset
    return aggregate_to_features(
        ds=source_ds,
        variable=dataset["variable"],
        features=features,
        method=method.value,
        feature_id_property=feature_id_property,
    )


def build_datavalueset_component(
    *,
    dataset_id: str,
    period_type: PeriodType,
    records: list[dict[str, Any]],
    dhis2: Dhis2DataValueSetConfig,
) -> tuple[dict[str, Any], str]:
    """Build and serialize a DHIS2 DataValueSet from aggregated records.

    Returns the DataValueSet payload and the path of the serialized file.
    """
    return build_data_value_set(
        records=records,
        dataset_id=dataset_id,
        period_type=period_type,
        config=dhis2,
    )
component_key="feature_source@v1", + remote_url=str(step_config["remote_url"]), + request=request, + resolved_inputs=resolved_inputs, + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + features = outputs["features"] + bbox = outputs["bbox"] + else: + features, bbox = runtime.run( + "feature_source", + feature_source_component, + config=request.feature_source, + ) + return {"features": features, "bbox": bbox} + + +def run_download_dataset_step( + *, + step: Any, + runtime: Any, + request: Any, + dataset: dict[str, Any], + resolved_inputs: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + """Workflow runtime adapter for download_dataset.""" + del step + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode not in {"local", "remote"}: + raise ValueError("download_dataset.execution_mode must be 'local' or 'remote'") + bbox = resolved_inputs["bbox"] + if execution_mode == "remote": + remote_url = step_config.get("remote_url") + if not isinstance(remote_url, str) or not remote_url: + raise ValueError("download_dataset remote mode requires non-empty 'remote_url'") + outputs = runtime.run( + "download_dataset", + _invoke_registered_remote_component, + component_key="download_dataset@v1", + remote_url=remote_url, + request=request, + resolved_inputs=resolved_inputs, + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + if not isinstance(outputs, dict): + raise RuntimeError("download_dataset remote mode must return a mapping of outputs") + return outputs + else: + runtime.run( + "download_dataset", + download_dataset_component, + dataset=dataset, + start=request.start, + end=request.end, + overwrite=request.overwrite, + 
country_code=request.country_code, + bbox=bbox, + ) + return {"status": "downloaded"} + + +def run_temporal_aggregation_step( + *, + step: Any, + runtime: Any, + request: Any, + dataset: dict[str, Any], + resolved_inputs: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + """Workflow runtime adapter for temporal_aggregation.""" + del step + target_period_type = request.temporal_aggregation.target_period_type + method = request.temporal_aggregation.method + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode == "remote": + raise ValueError("temporal_aggregation does not declare a remote HTTP contract") + else: + temporal_ds = runtime.run( + "temporal_aggregation", + temporal_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + bbox=resolved_inputs["bbox"], + target_period_type=target_period_type, + method=method, + ) + return {"temporal_dataset": temporal_ds} + + +def run_spatial_aggregation_step( + *, + step: Any, + runtime: Any, + request: Any, + dataset: dict[str, Any], + resolved_inputs: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + """Workflow runtime adapter for spatial_aggregation.""" + del step + method = request.spatial_aggregation.method + feature_id_property = request.dhis2.org_unit_property + execution_mode = str(step_config.get("execution_mode", "local")).lower() + temporal_dataset = resolved_inputs.get("temporal_dataset") + if execution_mode == "remote": + if temporal_dataset is not None: + raise ValueError( + "remote spatial_aggregation does not yet support workflow temporal_aggregation output; " + "use local spatial_aggregation for temporally aggregated workflows" + ) + outputs = runtime.run( + "spatial_aggregation", + _invoke_registered_remote_component, + component_key="spatial_aggregation@v1", + remote_url=str(step_config["remote_url"]), + request=request, + resolved_inputs=resolved_inputs, + 
timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + records = outputs["records"] + else: + records = runtime.run( + "spatial_aggregation", + spatial_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + bbox=resolved_inputs["bbox"], + features=resolved_inputs["features"], + method=method, + feature_id_property=feature_id_property, + aggregated_dataset=temporal_dataset, + ) + return {"records": records} + + +def run_build_datavalueset_step( + *, + step: Any, + runtime: Any, + request: Any, + dataset: dict[str, Any], + resolved_inputs: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + """Workflow runtime adapter for build_datavalueset.""" + del dataset, step + period_type = request.temporal_aggregation.target_period_type + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode == "remote": + outputs = runtime.run( + "build_datavalueset", + _invoke_registered_remote_component, + component_key="build_datavalueset@v1", + remote_url=str(step_config["remote_url"]), + request=request, + resolved_inputs=resolved_inputs, + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + data_value_set = outputs["data_value_set"] + output_file = outputs["output_file"] + else: + data_value_set, output_file = runtime.run( + "build_datavalueset", + build_datavalueset_component, + records=resolved_inputs["records"], + dataset_id=request.dataset_id, + period_type=period_type, + dhis2=request.dhis2, + ) + return {"data_value_set": data_value_set, "output_file": output_file} + + +def require_dataset(dataset_id: str) -> dict[str, Any]: + """Resolve dataset or raise 404.""" + dataset = get_dataset(dataset_id) + if dataset is 
def workflow_runtime_registry() -> dict[str, ComponentRuntimeDefinition]:
    """Workflow runtime bindings keyed by component@version.

    Only components whose manifest declares a local handler are executable by
    the workflow runtime; manifests without one are skipped. A declared
    handler name that is not registered is a deployment bug and raises.
    """
    handlers = _workflow_runtime_handler_registry()
    bindings: dict[str, ComponentRuntimeDefinition] = {}
    for key, manifest in _COMPONENT_REGISTRY.items():
        handler_name = manifest.runtime.local_handler
        if handler_name is None:
            continue
        executor = handlers.get(handler_name)
        if executor is None:
            raise RuntimeError(f"Unknown local runtime handler '{handler_name}' for component '{key}'")
        bindings[key] = ComponentRuntimeDefinition(
            component=manifest.name,
            version=manifest.version,
            executor=executor,
            config_model=_RemoteCapableStepConfig,
        )
    return bindings


def _workflow_runtime_handler_registry() -> dict[str, WorkflowStepExecutor]:
    """Map manifest handler identifiers to local step-runner callables."""
    return {
        "workflow.feature_source": run_feature_source_step,
        "workflow.download_dataset": run_download_dataset_step,
        "workflow.temporal_aggregation": run_temporal_aggregation_step,
        "workflow.spatial_aggregation": run_spatial_aggregation_step,
        "workflow.build_datavalueset": run_build_datavalueset_step,
    }
ValidationError as exc: + raise ValueError(f"Invalid config for component '{component}@{version}': {exc}") from exc + mode = str(getattr(validated, "execution_mode", "local")).lower() + if mode not in {"local", "remote"}: + raise ValueError( + f"Invalid config for component '{component}@{version}': execution_mode must be local or remote" + ) + if mode not in set(manifest.runtime.supported_execution_modes): + allowed = ", ".join(manifest.runtime.supported_execution_modes) + raise ValueError( + f"Invalid config for component '{component}@{version}': execution_mode '{mode}' not supported; " + f"allowed values: {allowed}" + ) + remote_url = getattr(validated, "remote_url", None) + remote_timeout_sec = getattr(validated, "remote_timeout_sec", 30.0) + remote_retries = getattr(validated, "remote_retries", 1) + remote_retry_delay_sec = getattr(validated, "remote_retry_delay_sec", 1.0) + + has_remote_config = bool( + (isinstance(remote_url, str) and remote_url.strip()) + or float(remote_timeout_sec) != 30.0 + or int(remote_retries) != 1 + or float(remote_retry_delay_sec) != 1.0 + ) + + if mode == "local" and has_remote_config: + raise ValueError( + f"Invalid config for component '{component}@{version}': " + "remote_url/remote_timeout_sec/remote_retries/remote_retry_delay_sec are only allowed in remote mode" + ) + if mode == "remote" and (not isinstance(remote_url, str) or not remote_url.strip()): + raise ValueError( + f"Invalid config for component '{component}@{version}': remote_url is required for remote mode" + ) + + +def _dataset_period_type(dataset: Mapping[str, Any]) -> PeriodType | None: + raw_value = dataset.get("period_type") + if not isinstance(raw_value, str): + return None + normalized = raw_value.strip().lower() + try: + return PeriodType(normalized) + except ValueError: + return None + + +def _invoke_remote_download_component( + *, + remote_url: str, + dataset_id: str, + start: str, + end: str, + overwrite: bool, + country_code: str | None, + bbox: 
list[float], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> None: + payload = { + "dataset_id": dataset_id, + "start": start, + "end": end, + "overwrite": overwrite, + "country_code": country_code, + "bbox": bbox, + } + attempts = max(1, retries) + last_exc: Exception | None = None + for attempt in range(1, attempts + 1): + try: + with httpx.Client(timeout=timeout_sec) as client: + response = client.post(remote_url, json=payload) + response.raise_for_status() + return + except Exception as exc: + last_exc = exc + if attempt < attempts: + time.sleep(max(0.0, retry_delay_sec)) + if last_exc is None: + raise RuntimeError("Remote download invocation failed without exception context") + raise last_exc + + +def _invoke_registered_remote_component( + *, + component_key: str, + remote_url: str, + request: Any, + resolved_inputs: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> dict[str, Any]: + """Invoke a manifest-registered HTTP component as a black box.""" + manifest = _COMPONENT_REGISTRY.get(component_key) + if manifest is None: + raise RuntimeError(f"Unknown component manifest '{component_key}'") + payload = _resolve_runtime_bindings(manifest.runtime.remote_request_bindings, request, resolved_inputs) + response = _post_remote_json( + remote_url=remote_url, + payload=payload, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + return _extract_remote_outputs(manifest=manifest, response=response) + + +def _invoke_remote_feature_source_component( + *, + remote_url: str, + feature_source: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> tuple[dict[str, Any], list[float]]: + result = _post_remote_json( + remote_url=remote_url, + payload={"feature_source": feature_source, "include_features": True}, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + features = result.get("features") + bbox = result.get("bbox") 
+ if not isinstance(features, dict) or not isinstance(bbox, list): + raise RuntimeError("Remote feature_source response missing features/bbox") + return features, [float(x) for x in bbox] + + +def _invoke_remote_temporal_aggregation_component( + *, + remote_url: str, + dataset_id: str, + start: str, + end: str, + bbox: list[float], + target_period_type: str, + method: str, + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> dict[str, Any]: + return _post_remote_json( + remote_url=remote_url, + payload={ + "dataset_id": dataset_id, + "start": start, + "end": end, + "bbox": bbox, + "target_period_type": target_period_type, + "method": method, + }, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + + +def _invoke_remote_spatial_aggregation_component( + *, + remote_url: str, + dataset_id: str, + start: str, + end: str, + bbox: list[float], + feature_source: dict[str, Any], + method: str, + feature_id_property: str, + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> list[dict[str, Any]]: + result = _post_remote_json( + remote_url=remote_url, + payload={ + "dataset_id": dataset_id, + "start": start, + "end": end, + "feature_source": feature_source, + "method": method, + "bbox": bbox, + "feature_id_property": feature_id_property, + "include_records": True, + }, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + records = result.get("records") + if not isinstance(records, list): + raise RuntimeError("Remote spatial_aggregation response missing records") + return records + + +def _invoke_remote_build_datavalueset_component( + *, + remote_url: str, + dataset_id: str, + period_type: str, + records: list[dict[str, Any]], + dhis2: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> tuple[dict[str, Any], str]: + result = _post_remote_json( + remote_url=remote_url, + payload={ + "dataset_id": dataset_id, + "period_type": period_type, + 
"records": records, + "dhis2": dhis2, + }, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + data_value_set = result.get("data_value_set") + output_file = result.get("output_file") + if not isinstance(data_value_set, dict) or not isinstance(output_file, str): + raise RuntimeError("Remote build_datavalueset response missing data_value_set/output_file") + return data_value_set, output_file + + +def _post_remote_json( + *, + remote_url: str, + payload: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> dict[str, Any]: + attempts = max(1, retries) + last_exc: Exception | None = None + for attempt in range(1, attempts + 1): + try: + with httpx.Client(timeout=timeout_sec) as client: + response = client.post(remote_url, json=payload) + response.raise_for_status() + body = response.json() + if not isinstance(body, dict): + raise RuntimeError("Remote component returned non-object JSON response") + return body + except Exception as exc: + last_exc = exc + if attempt < attempts: + time.sleep(max(0.0, retry_delay_sec)) + if last_exc is None: + raise RuntimeError("Remote component invocation failed without exception context") + raise last_exc + + +def _resolve_runtime_bindings( + bindings: dict[str, Any], + request: Any, + resolved_inputs: dict[str, Any], +) -> dict[str, Any]: + """Resolve manifest-declared HTTP payload bindings.""" + return { + key: _resolve_runtime_value(value, request=request, resolved_inputs=resolved_inputs) + for key, value in bindings.items() + } + + +def _resolve_runtime_value(value: Any, *, request: Any, resolved_inputs: dict[str, Any]) -> Any: + """Resolve one runtime binding value.""" + if isinstance(value, str) and value.startswith("$request."): + return _dump_runtime_value(_lookup_object_path(request, value.removeprefix("$request."))) + if isinstance(value, str) and value.startswith("$resolved."): + return _dump_runtime_value(_lookup_mapping_path(resolved_inputs, 
value.removeprefix("$resolved."))) + if isinstance(value, dict): + return { + key: _resolve_runtime_value(item, request=request, resolved_inputs=resolved_inputs) + for key, item in value.items() + } + if isinstance(value, list): + return [_resolve_runtime_value(item, request=request, resolved_inputs=resolved_inputs) for item in value] + return value + + +def _lookup_object_path(obj: Any, path: str) -> Any: + """Resolve dotted attribute path from object or mapping.""" + current = obj + for part in path.split("."): + if isinstance(current, Mapping): + current = current[part] + else: + current = getattr(current, part) + return current + + +def _lookup_mapping_path(mapping: Mapping[str, Any], path: str) -> Any: + """Resolve dotted path from mapping.""" + current: Any = mapping + for part in path.split("."): + if not isinstance(current, Mapping): + raise KeyError(path) + current = current[part] + return current + + +def _dump_runtime_value(value: Any) -> Any: + """Convert pydantic/enums to JSON-friendly values for HTTP payloads.""" + if hasattr(value, "model_dump"): + return value.model_dump(mode="json") + if hasattr(value, "value"): + return value.value + return value + + +def _extract_remote_outputs(*, manifest: ComponentManifest, response: dict[str, Any]) -> dict[str, Any]: + """Project HTTP response into declared workflow outputs.""" + bindings = manifest.runtime.remote_response_bindings + if not bindings: + return {output_name: response[output_name] for output_name in manifest.outputs} + extracted: dict[str, Any] = {} + for output_name, response_key in bindings.items(): + extracted[output_name] = response.get(response_key) + return extracted diff --git a/src/eo_api/data_accessor/__init__.py b/src/eo_api/data_accessor/__init__.py index 0fbcaa5..ee5067d 100644 --- a/src/eo_api/data_accessor/__init__.py +++ b/src/eo_api/data_accessor/__init__.py @@ -1 +1,4 @@ -from . import routes, services \ No newline at end of file +from . import routes as routes +from . 
import services as services + +__all__ = ["routes", "services"] diff --git a/src/eo_api/data_accessor/routes.py b/src/eo_api/data_accessor/routes.py index 8e3a6f5..6afe16d 100644 --- a/src/eo_api/data_accessor/routes.py +++ b/src/eo_api/data_accessor/routes.py @@ -1,45 +1,54 @@ -"""FastAPI router exposing dataset endpoints.""" +"""FastAPI router exposing dataset retrieval endpoints.""" from typing import Any -import xarray as xr -from fastapi import APIRouter, BackgroundTasks, HTTPException, Response +from fastapi import APIRouter, HTTPException from fastapi.responses import FileResponse from starlette.background import BackgroundTask -from .services.accessor import cleanup_file, get_data, xarray_to_temporary_netcdf from ..data_registry.routes import _get_dataset_or_404 +from ..shared.api_errors import api_error +from .services.accessor import ( + cleanup_file, + get_coverage_summary, + get_data, + get_point_values, + get_preview_summary, + xarray_to_temporary_netcdf, +) router = APIRouter() + @router.get("/{dataset_id}") def get_file( dataset_id: str, start: str, end: str, - xmin: float = None, - ymin: float = None, - xmax: float = None, - ymax: float = None, - format: str = 'netcdf', + xmin: float | None = None, + ymin: float | None = None, + xmax: float | None = None, + ymax: float | None = None, + format: str = "netcdf", ) -> FileResponse: """Get a dataset filtered to a timeperiod and bbox as a downloadable raster file.""" dataset = _get_dataset_or_404(dataset_id) # get filtered data - if all([xmin, ymin, xmax, ymax]): + bbox: list[float] | None + if xmin is not None and ymin is not None and xmax is not None and ymax is not None: bbox = [xmin, ymin, xmax, ymax] else: bbox = None ds = get_data(dataset, start, end, bbox) # save to temporary file - if format.lower() == 'netcdf': + if format.lower() == "netcdf": # convert to netcdf file_path = xarray_to_temporary_netcdf(ds) else: - raise ValueError(f'Unsupported output format: {format}') + raise 
ValueError(f"Unsupported output format: {format}") # return as file return FileResponse( @@ -47,4 +56,116 @@ def get_file( media_type="application/x-netcdf", filename="eo-api-raster-download.nc", background=BackgroundTask(cleanup_file, file_path), - ) \ No newline at end of file + ) + + +@router.get("/{dataset_id}/point") +def get_point_value( + dataset_id: str, + lon: float, + lat: float, + start: str | None = None, + end: str | None = None, +) -> dict[str, Any]: + """Return one dataset's value series at a requested lon/lat point.""" + dataset = _get_dataset_or_404(dataset_id) + try: + return get_point_values(dataset, lon=lon, lat=lat, start=start, end=end) + except ValueError as exc: + raise HTTPException( + status_code=422, + detail=api_error( + error="point_query_invalid", + error_code="POINT_QUERY_INVALID", + message=str(exc), + resource_id=dataset_id, + ), + ) from exc + + +@router.get("/{dataset_id}/preview") +def get_dataset_preview( + dataset_id: str, + start: str | None = None, + end: str | None = None, + xmin: float | None = None, + ymin: float | None = None, + xmax: float | None = None, + ymax: float | None = None, + max_cells: int = 25, +) -> dict[str, Any]: + """Return summary stats and a small raster sample for preview workflows.""" + dataset = _get_dataset_or_404(dataset_id) + bbox: list[float] | None + if any(value is not None for value in (xmin, ymin, xmax, ymax)): + if not all(value is not None for value in (xmin, ymin, xmax, ymax)): + raise HTTPException( + status_code=422, + detail=api_error( + error="preview_invalid", + error_code="PREVIEW_INVALID", + message="Provide all of xmin, ymin, xmax, ymax together", + resource_id=dataset_id, + ), + ) + assert xmin is not None and ymin is not None and xmax is not None and ymax is not None + bbox = [float(xmin), float(ymin), float(xmax), float(ymax)] + else: + bbox = None + + try: + return get_preview_summary(dataset, start=start, end=end, bbox=bbox, max_cells=max_cells) + except ValueError as exc: + 
raise HTTPException( + status_code=422, + detail=api_error( + error="preview_invalid", + error_code="PREVIEW_INVALID", + message=str(exc), + resource_id=dataset_id, + ), + ) from exc + + +@router.get("/{dataset_id}/coverage") +def get_dataset_coverage_summary( + dataset_id: str, + start: str | None = None, + end: str | None = None, + xmin: float | None = None, + ymin: float | None = None, + xmax: float | None = None, + ymax: float | None = None, + max_cells: int = 25, +) -> dict[str, Any]: + """Return a lightweight coverage-style response for a raster subset.""" + dataset = _get_dataset_or_404(dataset_id) + bbox: list[float] | None + if any(value is not None for value in (xmin, ymin, xmax, ymax)): + if not all(value is not None for value in (xmin, ymin, xmax, ymax)): + raise HTTPException( + status_code=422, + detail=api_error( + error="coverage_invalid", + error_code="COVERAGE_INVALID", + message="Provide all of xmin, ymin, xmax, ymax together", + resource_id=dataset_id, + ), + ) + assert xmin is not None and ymin is not None and xmax is not None and ymax is not None + bbox = [float(xmin), float(ymin), float(xmax), float(ymax)] + else: + bbox = None + + try: + return get_coverage_summary(dataset, start=start, end=end, bbox=bbox, max_cells=max_cells) + except ValueError as exc: + raise HTTPException( + status_code=422, + detail=api_error( + error="coverage_invalid", + error_code="COVERAGE_INVALID", + message=str(exc), + resource_id=dataset_id, + ), + ) from exc diff --git a/src/eo_api/data_accessor/services/__init__.py b/src/eo_api/data_accessor/services/__init__.py index 512ee18..209f276 100644 --- a/src/eo_api/data_accessor/services/__init__.py +++ b/src/eo_api/data_accessor/services/__init__.py @@ -1 +1,3 @@ -from . import accessor \ No newline at end of file +from . 
import accessor as accessor + +__all__ = ["accessor"] diff --git a/src/eo_api/data_accessor/services/accessor.py b/src/eo_api/data_accessor/services/accessor.py index d254a94..81cdc65 100644 --- a/src/eo_api/data_accessor/services/accessor.py +++ b/src/eo_api/data_accessor/services/accessor.py @@ -1,25 +1,31 @@ """Loading raster data from downloaded files into xarray.""" -import os -import json import logging +import os import tempfile from typing import Any +import numpy as np import xarray as xr from ...data_manager.services.downloader import get_cache_files, get_zarr_path -from ...data_manager.services.utils import get_time_dim, get_lon_lat_dims +from ...data_manager.services.utils import get_lon_lat_dims, get_time_dim from ...shared.time import numpy_datetime_to_period_string logger = logging.getLogger(__name__) -def get_data(dataset: dict[str, Any], start: str = None, end: str = None, bbox: list = None) -> xr.Dataset: + +def get_data( + dataset: dict[str, Any], + start: str | None = None, + end: str | None = None, + bbox: list[float] | None = None, +) -> xr.Dataset: """Load an xarray raster dataset for a given time range and bbox.""" logger.info("Opening dataset") zarr_path = get_zarr_path(dataset) if zarr_path: - logger.info(f'Using optimized zarr file: {zarr_path}') + logger.info(f"Using optimized zarr file: {zarr_path}") ds = xr.open_zarr(zarr_path, consolidated=True) else: logger.warning( @@ -40,8 +46,8 @@ def get_data(dataset: dict[str, Any], start: str = None, end: str = None, bbox: if bbox is not None: logger.info(f"Subsetting xy to {bbox}") - xmin,ymin,xmax,ymax = list(map(float, bbox)) - lon_dim,lat_dim = get_lon_lat_dims(ds) + xmin, ymin, xmax, ymax = list(map(float, bbox)) + lon_dim, lat_dim = get_lon_lat_dims(ds) # TODO: this assumes y axis increases towards north and is not very stable # ...and also does not consider partial pixels at the edges # ...should probably switch to rioxarray.clip instead @@ -49,30 +55,173 @@ def get_data(dataset: 
dict[str, Any], start: str = None, end: str = None, bbox: return ds # type: ignore[no-any-return] + def get_data_coverage(dataset: dict[str, Any]) -> dict[str, Any]: """Return temporal and spatial coverage metadata for downloaded data.""" - ds = get_data(dataset) + try: + if not ds: + return {"temporal_coverage": None, "spatial_coverage": None} + + time_dim = get_time_dim(ds) + lon_dim, lat_dim = get_lon_lat_dims(ds) + + start = numpy_datetime_to_period_string(ds[time_dim].min(), dataset["period_type"]) # type: ignore[arg-type] + end = numpy_datetime_to_period_string(ds[time_dim].max(), dataset["period_type"]) # type: ignore[arg-type] + + xmin, xmax = ds[lon_dim].min().item(), ds[lon_dim].max().item() + ymin, ymax = ds[lat_dim].min().item(), ds[lat_dim].max().item() + + return { + "coverage": { + "temporal": {"start": start, "end": end}, + "spatial": {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax}, + } + } + finally: + ds.close() + + +def get_point_values( + dataset: dict[str, Any], + *, + lon: float, + lat: float, + start: str | None = None, + end: str | None = None, +) -> dict[str, Any]: + """Return dataset values at one point across the requested time range.""" + ds = get_data(dataset, start=start, end=end, bbox=None) + try: + if not ds.data_vars: + raise ValueError(f"Dataset '{dataset['id']}' has no data variables available") + + lon_dim, lat_dim = get_lon_lat_dims(ds) + time_dim = get_time_dim(ds) + lon_values = ds[lon_dim] + lat_values = ds[lat_dim] + + xmin, xmax = float(lon_values.min().item()), float(lon_values.max().item()) + ymin, ymax = float(lat_values.min().item()), float(lat_values.max().item()) + if lon < xmin or lon > xmax or lat < ymin or lat > ymax: + raise ValueError( + f"Requested point ({lon}, {lat}) is outside dataset coverage ([{xmin}, {ymin}] to [{xmax}, {ymax}])" + ) + + variable_name = str(dataset.get("variable") or str(next(iter(ds.data_vars)))) + if variable_name not in ds.data_vars: + variable_name = 
str(next(iter(ds.data_vars))) + data_array = ds[variable_name] + point = data_array.sel({lon_dim: lon, lat_dim: lat}, method="nearest") + + actual_lon = float(point.coords[lon_dim].item()) + actual_lat = float(point.coords[lat_dim].item()) + series: list[dict[str, Any]] = [] + for raw_time, raw_value in zip(point[time_dim].values, point.values.tolist(), strict=False): + value = _to_float(raw_value) + series.append( + { + "period": str(numpy_datetime_to_period_string(np.asarray(raw_time), dataset["period_type"])), + "value": value, + } + ) - if not ds: - return {"temporal_coverage": None, "spatial_coverage": None} + if not series: + raise ValueError(f"Dataset '{dataset['id']}' returned no values for the requested time range") + + return { + "dataset_id": dataset["id"], + "variable": variable_name, + "requested": {"lon": lon, "lat": lat, "start": start, "end": end}, + "resolved_point": {"lon": actual_lon, "lat": actual_lat}, + "value_count": len(series), + "values": series, + } + finally: + ds.close() - time_dim = get_time_dim(ds) - lon_dim, lat_dim = get_lon_lat_dims(ds) - start = numpy_datetime_to_period_string(ds[time_dim].min(), dataset['period_type']) # type: ignore[arg-type] - end = numpy_datetime_to_period_string(ds[time_dim].max(), dataset['period_type']) # type: ignore[arg-type] +def get_preview_summary( + dataset: dict[str, Any], + *, + start: str | None = None, + end: str | None = None, + bbox: list[float] | None = None, + max_cells: int = 25, +) -> dict[str, Any]: + """Return summary statistics and a small sample for preview-oriented clients.""" + ds = get_data(dataset, start=start, end=end, bbox=bbox) + try: + if not ds.data_vars: + raise ValueError(f"Dataset '{dataset['id']}' has no data variables available") - xmin, xmax = ds[lon_dim].min().item(), ds[lon_dim].max().item() - ymin, ymax = ds[lat_dim].min().item(), ds[lat_dim].max().item() + variable_name = str(dataset.get("variable") or str(next(iter(ds.data_vars)))) + if variable_name not in 
ds.data_vars: + variable_name = str(next(iter(ds.data_vars))) + data_array = ds[variable_name] + lon_dim, lat_dim = get_lon_lat_dims(data_array) + time_dim = get_time_dim(data_array) + valid = data_array.where(~xr.apply_ufunc(np.isnan, data_array)) + sample = _build_preview_sample( + valid, + dataset=dataset, + lon_dim=lon_dim, + lat_dim=lat_dim, + time_dim=time_dim, + max_cells=max_cells, + ) + + return { + "dataset_id": dataset["id"], + "variable": variable_name, + "requested": {"start": start, "end": end, "bbox": bbox}, + "dims": {str(k): int(v) for k, v in valid.sizes.items()}, + "stats": { + "min": _to_float(valid.min(skipna=True).item()), + "max": _to_float(valid.max(skipna=True).item()), + "mean": _to_float(valid.mean(skipna=True).item()), + "value_count": int(valid.count().item()), + }, + "sample": sample, + } + finally: + ds.close() + + +def get_coverage_summary( + dataset: dict[str, Any], + *, + start: str | None = None, + end: str | None = None, + bbox: list[float] | None = None, + max_cells: int = 25, +) -> dict[str, Any]: + """Return a lightweight coverage-style summary for a raster subset.""" + preview = get_preview_summary( + dataset, + start=start, + end=end, + bbox=bbox, + max_cells=max_cells, + ) + full_coverage = get_data_coverage(dataset).get("coverage", {}) return { + "dataset_id": preview["dataset_id"], + "variable": preview["variable"], + "requested": preview["requested"], "coverage": { - "temporal": {"start": start, "end": end}, - "spatial": {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax}, - } + "spatial": full_coverage.get("spatial"), + "temporal": full_coverage.get("temporal"), + }, + "subset": { + "dims": preview["dims"], + "stats": preview["stats"], + "sample": preview["sample"], + }, } + def xarray_to_temporary_netcdf(ds: xr.Dataset) -> str: """Write a dataset to a temporary NetCDF file and return the path.""" fd = tempfile.NamedTemporaryFile(suffix=".nc", delete=False) @@ -81,6 +230,62 @@ def xarray_to_temporary_netcdf(ds: 
xr.Dataset) -> str: ds.to_netcdf(path) return path + def cleanup_file(path: str) -> None: """Remove a file from disk.""" os.remove(path) + + +def _to_float(value: Any) -> float | None: + if value is None: + return None + scalar = np.asarray(value).item() + if np.isnan(scalar): + return None + return float(scalar) + + +def _build_preview_sample( + data_array: xr.DataArray, + *, + dataset: dict[str, Any], + lon_dim: str, + lat_dim: str, + time_dim: str, + max_cells: int, +) -> list[dict[str, Any]]: + """Build a small JSON-safe sample from a raster subset.""" + max_cells = max(1, max_cells) + sample_records: list[dict[str, Any]] = [] + + time_values = data_array[time_dim].values + lat_values = data_array[lat_dim].values + lon_values = data_array[lon_dim].values + + time_step = max(1, int(np.ceil(len(time_values) / max_cells))) + lat_step = max(1, int(np.ceil(len(lat_values) / max_cells))) + lon_step = max(1, int(np.ceil(len(lon_values) / max_cells))) + + for time_index in range(0, len(time_values), time_step): + for lat_index in range(0, len(lat_values), lat_step): + for lon_index in range(0, len(lon_values), lon_step): + value = data_array.isel({time_dim: time_index, lat_dim: lat_index, lon_dim: lon_index}).item() + numeric_value = _to_float(value) + if numeric_value is None: + continue + sample_records.append( + { + "period": str( + numpy_datetime_to_period_string( + np.asarray(time_values[time_index]), + dataset["period_type"], + ) + ), + "lat": float(lat_values[lat_index]), + "lon": float(lon_values[lon_index]), + "value": numeric_value, + } + ) + if len(sample_records) >= max_cells: + return sample_records + return sample_records diff --git a/src/eo_api/data_manager/__init__.py b/src/eo_api/data_manager/__init__.py index 0fbcaa5..ee5067d 100644 --- a/src/eo_api/data_manager/__init__.py +++ b/src/eo_api/data_manager/__init__.py @@ -1 +1,4 @@ -from . import routes, services \ No newline at end of file +from . import routes as routes +from . 
import services as services + +__all__ = ["routes", "services"] diff --git a/src/eo_api/data_manager/routes.py b/src/eo_api/data_manager/routes.py index c6c5679..f369dbe 100644 --- a/src/eo_api/data_manager/routes.py +++ b/src/eo_api/data_manager/routes.py @@ -1,14 +1,9 @@ """FastAPI router exposing dataset endpoints.""" -from typing import Any +from fastapi import APIRouter, BackgroundTasks -import xarray as xr -from fastapi import APIRouter, BackgroundTasks, HTTPException, Response -from fastapi.responses import FileResponse -from starlette.background import BackgroundTask - -from .services import constants, downloader from ..data_registry.routes import _get_dataset_or_404 +from .services import downloader router = APIRouter() @@ -17,9 +12,9 @@ def download_dataset( dataset_id: str, start: str, + background_tasks: BackgroundTasks, end: str | None = None, overwrite: bool = False, - background_tasks: BackgroundTasks = None, ) -> dict[str, str]: """Download dataset as local netcdf files direct from the source.""" dataset = _get_dataset_or_404(dataset_id) @@ -30,10 +25,9 @@ def download_dataset( @router.get("/{dataset_id}/build_zarr", response_model=dict) def build_dataset_zarr( dataset_id: str, - background_tasks: BackgroundTasks = None, + background_tasks: BackgroundTasks, ) -> dict[str, str]: """Optimize dataset downloads by collecting all files to a single zarr archive.""" dataset = _get_dataset_or_404(dataset_id) - if background_tasks is not None: - background_tasks.add_task(downloader.build_dataset_zarr, dataset) + background_tasks.add_task(downloader.build_dataset_zarr, dataset) return {"status": "Building zarr file from dataset downloads"} diff --git a/src/eo_api/data_manager/services/__init__.py b/src/eo_api/data_manager/services/__init__.py index 2ba6614..a20a096 100644 --- a/src/eo_api/data_manager/services/__init__.py +++ b/src/eo_api/data_manager/services/__init__.py @@ -1 +1,5 @@ -from . 
import constants, downloader, utils \ No newline at end of file +from . import constants as constants +from . import downloader as downloader +from . import utils as utils + +__all__ = ["constants", "downloader", "utils"] diff --git a/src/eo_api/data_manager/services/constants.py b/src/eo_api/data_manager/services/constants.py index 32bd880..f9b6a86 100644 --- a/src/eo_api/data_manager/services/constants.py +++ b/src/eo_api/data_manager/services/constants.py @@ -1,17 +1,61 @@ -"""Module-level constants loaded at import time (DHIS2 org units, bbox, env config).""" +"""Module-level constants for downloader defaults. + +This module must stay import-safe. DHIS2-backed defaults are best-effort only, +so startup should not fail when DHIS2 is temporarily unavailable. +""" import json +import logging import os import geopandas as gpd from ...shared.dhis2_adapter import create_client, get_org_units_geojson -# load geojson from dhis2 at startup and keep in-memory -# TODO: should probably save to file instead -client = create_client() -ORG_UNITS_GEOJSON = get_org_units_geojson(client, level=2) -BBOX = list(map(float, gpd.read_file(json.dumps(ORG_UNITS_GEOJSON)).total_bounds)) +LOGGER = logging.getLogger(__name__) +_DEFAULT_BBOX = [-180.0, -90.0, 180.0, 90.0] + + +def _bbox_from_env() -> list[float] | None: + raw_bbox = os.getenv("EO_API_DEFAULT_BBOX") + if not raw_bbox: + return None + parts = [part.strip() for part in raw_bbox.split(",")] + if len(parts) != 4: + LOGGER.warning("Ignoring EO_API_DEFAULT_BBOX with invalid value: %s", raw_bbox) + return None + try: + return [float(part) for part in parts] + except ValueError: + LOGGER.warning("Ignoring EO_API_DEFAULT_BBOX with non-numeric values: %s", raw_bbox) + return None + + +def _load_org_unit_defaults() -> tuple[dict[str, object], list[float]]: + try: + client = create_client() + org_units_geojson = get_org_units_geojson(client, level=2) + bbox = list(map(float, gpd.read_file(json.dumps(org_units_geojson)).total_bounds)) + 
return org_units_geojson, bbox + except Exception as exc: + fallback_bbox = _bbox_from_env() or _DEFAULT_BBOX + dhis2_base_url = os.getenv("DHIS2_BASE_URL", "") + LOGGER.warning( + ( + "Failed to load DHIS2 org-unit defaults at startup from DHIS2_BASE_URL=%s. " + "The server will continue using fallback bbox %s and an empty org-unit GeoJSON cache. " + "This usually means the DHIS2 server is down, unreachable, or the credentials are invalid. " + "Original error: %s" + ), + dhis2_base_url, + fallback_bbox, + exc, + ) + return {"type": "FeatureCollection", "features": []}, fallback_bbox + + +# Best-effort startup defaults. Runtime flows can still provide explicit bbox. +ORG_UNITS_GEOJSON, BBOX = _load_org_unit_defaults() # env variables we need from .env # TODO: should probably centralize to shared config module diff --git a/src/eo_api/data_manager/services/downloader.py b/src/eo_api/data_manager/services/downloader.py index 93c8a27..c74c06c 100644 --- a/src/eo_api/data_manager/services/downloader.py +++ b/src/eo_api/data_manager/services/downloader.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) SCRIPT_DIR = Path(__file__).parent.resolve() -_download_dir = SCRIPT_DIR.parent.parent.parent.parent / 'data' / 'downloads' +_download_dir = SCRIPT_DIR.parent.parent.parent.parent / "data" / "downloads" if CACHE_OVERRIDE: _download_dir = Path(CACHE_OVERRIDE) DOWNLOAD_DIR = _download_dir @@ -29,6 +29,8 @@ def download_dataset( end: str | None, overwrite: bool, background_tasks: BackgroundTasks | None, + country_code: str | None = None, + bbox: list[float] | None = None, ) -> None: """Download dataset from source and store as local NetCDF cache files.""" cache_info = dataset["cache_info"] @@ -48,15 +50,22 @@ def download_dataset( sig = inspect.signature(eo_download_func) if "bbox" in sig.parameters: - params["bbox"] = BBOX + params["bbox"] = bbox or BBOX elif "country_code" in sig.parameters: - if COUNTRY_CODE: - params["country_code"] = COUNTRY_CODE + 
resolved_country_code = country_code or COUNTRY_CODE + if resolved_country_code: + params["country_code"] = resolved_country_code else: - raise Exception('Downloading WorldPop data requires COUNTRY_CODE environment variable') + raise Exception( + "Downloading WorldPop data requires country_code input (or COUNTRY_CODE environment variable)" + ) + + DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True) if background_tasks is not None: background_tasks.add_task(eo_download_func, **params) + else: + eo_download_func(**params) def build_dataset_zarr(dataset: dict[str, Any]) -> None: diff --git a/src/eo_api/data_manager/services/utils.py b/src/eo_api/data_manager/services/utils.py index aa797cc..7e3bc0c 100644 --- a/src/eo_api/data_manager/services/utils.py +++ b/src/eo_api/data_manager/services/utils.py @@ -2,9 +2,6 @@ from typing import Any -import numpy as np -import pandas as pd - def get_time_dim(ds: Any) -> str: """Return the name of the time dimension in a dataset or dataframe.""" diff --git a/src/eo_api/data_registry/__init__.py b/src/eo_api/data_registry/__init__.py index 0fbcaa5..ee5067d 100644 --- a/src/eo_api/data_registry/__init__.py +++ b/src/eo_api/data_registry/__init__.py @@ -1 +1,4 @@ -from . import routes, services \ No newline at end of file +from . import routes as routes +from . 
import services as services + +__all__ = ["routes", "services"] diff --git a/src/eo_api/data_registry/routes.py b/src/eo_api/data_registry/routes.py index ffa306d..029b921 100644 --- a/src/eo_api/data_registry/routes.py +++ b/src/eo_api/data_registry/routes.py @@ -2,15 +2,14 @@ from typing import Any -import xarray as xr -from fastapi import APIRouter, BackgroundTasks, HTTPException, Response -from fastapi.responses import FileResponse -from starlette.background import BackgroundTask +from fastapi import APIRouter, HTTPException +from ..shared.api_errors import api_error from .services import datasets router = APIRouter() + @router.get("/") def list_datasets() -> list[dict[str, Any]]: """Return list of available datasets from registry.""" @@ -21,7 +20,15 @@ def _get_dataset_or_404(dataset_id: str) -> dict[str, Any]: """Look up a dataset by ID or raise 404.""" dataset = datasets.get_dataset(dataset_id) if not dataset: - raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found") + raise HTTPException( + status_code=404, + detail=api_error( + error="dataset_not_found", + error_code="DATASET_NOT_FOUND", + message=f"Dataset '{dataset_id}' not found", + resource_id=dataset_id, + ), + ) return dataset @@ -30,7 +37,8 @@ def get_dataset(dataset_id: str) -> dict[str, Any]: """Get a single dataset by ID.""" # Note: have to import inside function to avoid circular import from ..data_accessor.services.accessor import get_data_coverage + dataset = _get_dataset_or_404(dataset_id) coverage = get_data_coverage(dataset) dataset.update(coverage) - return dataset \ No newline at end of file + return dataset diff --git a/src/eo_api/data_registry/services/__init__.py b/src/eo_api/data_registry/services/__init__.py index 08014f6..9d0231a 100644 --- a/src/eo_api/data_registry/services/__init__.py +++ b/src/eo_api/data_registry/services/__init__.py @@ -1 +1,3 @@ -from . import datasets \ No newline at end of file +from . 
import datasets as datasets + +__all__ = ["datasets"] diff --git a/src/eo_api/data_registry/services/datasets.py b/src/eo_api/data_registry/services/datasets.py index 371e8fc..9bcc5de 100644 --- a/src/eo_api/data_registry/services/datasets.py +++ b/src/eo_api/data_registry/services/datasets.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) SCRIPT_DIR = Path(__file__).parent.resolve() -CONFIGS_DIR = SCRIPT_DIR.parent.parent.parent.parent / 'data' / 'datasets' +CONFIGS_DIR = SCRIPT_DIR.parent.parent.parent.parent / "data" / "datasets" def list_datasets() -> list[dict[str, Any]]: diff --git a/src/eo_api/main.py b/src/eo_api/main.py index e12ab58..06fcbc1 100644 --- a/src/eo_api/main.py +++ b/src/eo_api/main.py @@ -1,10 +1,18 @@ """DHIS2 EO API -- Earth observation data API for DHIS2.""" -from fastapi import FastAPI +from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, Response +from fastapi.staticfiles import StaticFiles +from rio_tiler.errors import TileOutsideBounds import eo_api.startup # noqa: F401 # pyright: ignore[reportUnusedImport] -from eo_api import data_accessor, data_manager, data_registry, system +from eo_api import analytics_viewer, components, data_accessor, data_manager, data_registry, raster, system, workflows +from eo_api.ogc import routes as ogc_routes +from eo_api.ogc_api import ogc_api_app +from eo_api.publications import generated_routes as publication_generated_routes +from eo_api.publications import routes as publication_routes +from eo_api.shared.api_errors import api_error app = FastAPI() @@ -16,7 +24,32 @@ allow_headers=["*"], ) -app.include_router(system.routes.router, tags=['System']) -app.include_router(data_registry.routes.router, prefix='/registry', tags=['Data registry']) -app.include_router(data_manager.routes.router, prefix='/manage', tags=['Data manager']) -app.include_router(data_accessor.routes.router, prefix='/retrieve', tags=['Data retrieval']) 
+ +@app.exception_handler(TileOutsideBounds) +async def tile_outside_bounds_handler(request: Request, exc: TileOutsideBounds) -> Response: + """Return a normal 404 when a requested tile lies outside dataset coverage.""" + if "/tiles/" in request.url.path: + return Response(status_code=404) + return JSONResponse( + status_code=404, + content=api_error( + error="tile_outside_bounds", + error_code="TILE_OUTSIDE_BOUNDS", + message=str(exc), + ), + ) + + +app.include_router(system.routes.router, tags=["System"]) +app.include_router(data_registry.routes.router, prefix="/registry", tags=["Data registry"]) +app.include_router(data_manager.routes.router, prefix="/manage", tags=["Data manager"]) +app.include_router(data_accessor.routes.router, prefix="/retrieve", tags=["Data retrieval"]) +app.include_router(raster.routes.router, prefix="/raster", tags=["Raster"]) +app.include_router(workflows.routes.router, prefix="/workflows", tags=["Workflows"]) +app.include_router(publication_routes.router, prefix="/publications", tags=["Publications"]) +app.include_router(publication_generated_routes.router, prefix="/publications", tags=["Publications"]) +app.include_router(analytics_viewer.routes.router, prefix="/analytics", tags=["Analytics"]) +app.include_router(components.routes.router, prefix="/components", tags=["Components"]) +app.include_router(ogc_routes.router, prefix="/ogcapi", tags=["OGC API"]) +app.mount("/data", StaticFiles(directory="data/downloads"), name="Data") +app.mount("/pygeoapi", ogc_api_app) diff --git a/src/eo_api/ogc/__init__.py b/src/eo_api/ogc/__init__.py new file mode 100644 index 0000000..a3635b0 --- /dev/null +++ b/src/eo_api/ogc/__init__.py @@ -0,0 +1 @@ +"""OGC adapter routes package.""" diff --git a/src/eo_api/ogc/routes.py b/src/eo_api/ogc/routes.py new file mode 100644 index 0000000..5261b21 --- /dev/null +++ b/src/eo_api/ogc/routes.py @@ -0,0 +1,940 @@ +"""Thin OGC API adapter routes over the native workflow engine.""" + +from __future__ import 
annotations + +import uuid +from html import escape +from pathlib import Path +from typing import Any + +from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request, Response +from fastapi.responses import FileResponse, HTMLResponse + +from ..data_manager.services.downloader import DOWNLOAD_DIR +from ..publications.schemas import PublishedResourceExposure +from ..publications.services import collection_id_for_resource, get_published_resource +from ..shared.api_errors import api_error +from ..workflows.schemas import WorkflowExecuteEnvelopeRequest, WorkflowJobStatus, WorkflowRequest +from ..workflows.services.definitions import load_workflow_definition +from ..workflows.services.engine import execute_workflow +from ..workflows.services.job_store import get_job, get_job_result, initialize_job, list_jobs +from ..workflows.services.simple_mapper import normalize_simple_request +from .schemas import ( + OGCJobResultsExtended, + OGCJobResultsResponse, + OGCOutputFormatInfo, + OGCOutputReference, + OGCOutputValue, +) + +router = APIRouter() + +_PROCESS_ID = "generic-dhis2-workflow" +_PROCESS_TITLE = "Generic DHIS2 workflow" +_OGC_PROCESSES_CONFORMANCE = [ + "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/core", + "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/oas30", + "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/json", + "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/job-list", +] + + +@router.get("", response_model=None) +def get_ogc_root(request: Request, f: str | None = None) -> dict[str, Any] | HTMLResponse: + """Return a native OGC landing page for processes and jobs.""" + base_url = str(request.base_url).rstrip("/") + body = { + "title": "DHIS2 EO API", + "description": ( + "Native OGC API landing page for workflow processes and jobs. " + "Collections and items are served by the mounted geospatial publication layer." 
+ ), + "links": [ + {"rel": "self", "type": "application/json", "href": _request_href(request, f="json")}, + {"rel": "alternate", "type": "text/html", "href": _request_href(request, f="html")}, + {"rel": "service-desc", "type": "application/vnd.oai.openapi+json;version=3.0", "href": "/ogcapi/openapi"}, + {"rel": "conformance", "type": "application/json", "href": f"{base_url}/ogcapi/conformance"}, + {"rel": "data", "type": "application/json", "href": f"{base_url}/pygeoapi/collections"}, + {"rel": "processes", "type": "application/json", "href": f"{base_url}/ogcapi/processes"}, + {"rel": "jobs", "type": "application/json", "href": f"{base_url}/ogcapi/jobs"}, + ], + "navigation": [ + { + "title": "Browse Collections", + "description": "Open the OGC publication surface for collections and items.", + "href": f"{base_url}/pygeoapi/collections?f=html", + }, + { + "title": "List Processes", + "description": "View the exposed OGC process catalog backed by the native workflow engine.", + "href": f"{base_url}/ogcapi/processes", + }, + { + "title": "List Jobs", + "description": "Inspect OGC job records backed by the native job store.", + "href": f"{base_url}/ogcapi/jobs", + }, + { + "title": "Conformance", + "description": "See the native OGC API - Processes conformance declarations.", + "href": f"{base_url}/ogcapi/conformance", + }, + ], + } + if _wants_html(request, f): + return HTMLResponse(_render_ogc_root_html(body)) + return body + + +@router.get("/conformance") +def get_ogc_conformance(request: Request) -> dict[str, Any]: + """Return native OGC API - Processes conformance declarations.""" + return { + "conformsTo": _OGC_PROCESSES_CONFORMANCE, + "links": [ + {"rel": "self", "type": "application/json", "href": str(request.url)}, + { + "rel": "service-desc", + "type": "application/vnd.oai.openapi+json;version=3.0", + "href": str(request.base_url).rstrip("/") + "/ogcapi/openapi", + }, + ], + } + + +@router.get("/openapi") +def get_ogc_openapi(request: Request) -> dict[str, 
Any]: + """Return a minimal native service description for the process/job surface.""" + base_url = str(request.base_url).rstrip("/") + return { + "openapi": "3.0.2", + "info": { + "title": "DHIS2 EO API - Native OGC Processes", + "version": "0.1.0", + "description": ( + "Native OGC API - Processes service description for the FastAPI-owned process and job surface." + ), + }, + "servers": [{"url": f"{base_url}/ogcapi"}], + "paths": { + "/": {"get": {"summary": "Landing page", "responses": {"200": {"description": "Landing page"}}}}, + "/conformance": { + "get": {"summary": "Conformance", "responses": {"200": {"description": "Conformance classes"}}} + }, + "/processes": {"get": {"summary": "List processes", "responses": {"200": {"description": "Process list"}}}}, + "/processes/{process_id}": { + "get": { + "summary": "Describe process", + "parameters": [ + { + "name": "process_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + } + ], + "responses": {"200": {"description": "Process description"}}, + } + }, + "/processes/{process_id}/execution": { + "post": { + "summary": "Execute process", + "parameters": [ + { + "name": "process_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + }, + { + "name": "Prefer", + "in": "header", + "required": False, + "schema": {"type": "string"}, + }, + ], + "requestBody": { + "required": True, + "content": { + "application/json": { + "schema": {"$ref": "#/components/schemas/WorkflowExecuteEnvelopeRequest"} + } + }, + }, + "responses": { + "200": {"description": "Synchronous execution result"}, + "202": {"description": "Accepted asynchronous execution"}, + }, + } + }, + "/jobs": {"get": {"summary": "List jobs", "responses": {"200": {"description": "Job list"}}}}, + "/jobs/{job_id}": { + "get": { + "summary": "Get job", + "parameters": [ + { + "name": "job_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + } + ], + "responses": {"200": {"description": "Job status"}}, + 
} + }, + "/jobs/{job_id}/results": { + "get": { + "summary": "Get job results", + "parameters": [ + { + "name": "job_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + }, + { + "name": "extended", + "in": "query", + "required": False, + "schema": {"type": "boolean", "default": False}, + }, + ], + "responses": {"200": {"description": "OGC-compliant process results"}}, + } + }, + "/jobs/{job_id}/download": { + "get": { + "summary": "Download output artifact", + "parameters": [ + { + "name": "job_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + } + ], + "responses": {"200": {"description": "Artifact download"}}, + } + }, + }, + "components": { + "schemas": { + "WorkflowRequest": WorkflowRequest.model_json_schema(ref_template="#/components/schemas/{model}"), + "WorkflowExecuteEnvelopeRequest": WorkflowExecuteEnvelopeRequest.model_json_schema( + ref_template="#/components/schemas/{model}" + ), + "OGCJobResultsResponse": OGCJobResultsResponse.model_json_schema( + ref_template="#/components/schemas/{model}" + ), + "OGCJobResultsExtended": OGCJobResultsExtended.model_json_schema( + ref_template="#/components/schemas/{model}" + ), + } + }, + } + + +@router.get("/processes") +def list_processes(request: Request) -> dict[str, Any]: + """List exposed OGC processes.""" + return { + "processes": [ + { + "id": _PROCESS_ID, + "title": _PROCESS_TITLE, + "description": "Execute the generic DHIS2 EO workflow and persist a native job record.", + "jobControlOptions": ["sync-execute", "async-execute"], + "outputTransmission": ["value", "reference"], + "links": [ + { + "rel": "self", + "type": "application/json", + "href": str(request.url_for("describe_ogc_process", process_id=_PROCESS_ID)), + } + ], + } + ] + } + + +@router.get("/processes/{process_id}", name="describe_ogc_process") +def describe_process(process_id: str, request: Request) -> dict[str, Any]: + """Describe the single exposed generic workflow process.""" + 
_require_process(process_id) + request_schema = WorkflowRequest.model_json_schema() + return { + "id": _PROCESS_ID, + "title": _PROCESS_TITLE, + "description": "OGC-facing adapter over the reusable native workflow engine.", + "jobControlOptions": ["sync-execute", "async-execute"], + "outputTransmission": ["value", "reference"], + "inputs": { + "request": { + "title": "Workflow Request", + "description": "Flat request contract normalized by the native workflow mapper.", + "schema": request_schema, + "minOccurs": 1, + "maxOccurs": 1, + } + }, + "outputs": { + "outputs": { + "title": "Workflow Outputs", + "description": "Declared workflow outputs returned as OGC output objects on the job results endpoint.", + "schema": OGCJobResultsResponse.model_json_schema(), + } + }, + "links": [ + { + "rel": "execute", + "type": "application/json", + "href": str(request.url_for("execute_ogc_process", process_id=_PROCESS_ID)), + } + ], + } + + +@router.post("/processes/{process_id}/execution", name="execute_ogc_process") +def execute_process( + process_id: str, + payload: WorkflowExecuteEnvelopeRequest, + request: Request, + response: Response, + background_tasks: BackgroundTasks, + prefer: str | None = Header(default=None), +) -> dict[str, Any]: + """Execute the generic workflow synchronously or submit it asynchronously.""" + _require_process(process_id) + normalized, _warnings = normalize_simple_request(payload.request) + + if prefer is not None and "respond-async" in prefer.lower(): + job_id = str(uuid.uuid4()) + workflow = load_workflow_definition(payload.request.workflow_id) + initialize_job( + job_id=job_id, + request=normalized, + request_payload=payload.request.model_dump(exclude_none=True), + workflow=workflow, + workflow_definition_source="catalog", + workflow_id=payload.request.workflow_id, + workflow_version=workflow.version, + status=WorkflowJobStatus.ACCEPTED, + process_id=_PROCESS_ID, + ) + background_tasks.add_task( + _run_async_workflow_job, + job_id, + normalized, 
+ payload.request.workflow_id, + payload.request.model_dump(exclude_none=True), + payload.request.include_component_run_details, + ) + job_url = str(request.url_for("get_ogc_job", job_id=job_id)) + results_url = str(request.url_for("get_ogc_job_results", job_id=job_id)) + response.status_code = 202 + response.headers["Location"] = job_url + return { + "jobID": job_id, + "status": WorkflowJobStatus.ACCEPTED, + "location": job_url, + "jobUrl": job_url, + "resultsUrl": results_url, + } + + result = execute_workflow( + normalized, + workflow_id=payload.request.workflow_id, + request_params=payload.request.model_dump(exclude_none=True), + include_component_run_details=payload.request.include_component_run_details, + workflow_definition_source="catalog", + ) + job_url = str(request.url_for("get_ogc_job", job_id=result.run_id)) + results_url = str(request.url_for("get_ogc_job_results", job_id=result.run_id)) + publication = get_published_resource(f"workflow-output-{result.run_id}") + links: list[dict[str, Any]] = [ + {"rel": "monitor", "type": "application/json", "href": job_url}, + {"rel": "results", "type": "application/json", "href": results_url}, + ] + if publication is not None and publication.exposure == PublishedResourceExposure.OGC: + links.append( + { + "rel": "collection", + "type": "application/json", + "href": _collection_href(request, collection_id_for_resource(publication)), + } + ) + return { + "jobID": result.run_id, + "processID": _PROCESS_ID, + "status": WorkflowJobStatus.SUCCESSFUL, + "outputs": result.model_dump(mode="json"), + "links": links, + } + + +@router.get("/jobs") +def list_ogc_jobs(process_id: str | None = None) -> dict[str, Any]: + """List OGC-visible jobs backed by the native job store.""" + jobs = list_jobs(process_id=process_id, status=None) + return {"jobs": [job.model_dump(mode="json") for job in jobs]} + + +@router.get("/jobs/{job_id}", name="get_ogc_job") +def get_ogc_job(job_id: str, request: Request) -> dict[str, Any]: + """Fetch 
one OGC job view from the native job store.""" + job = get_job(job_id) + if job is None: + raise HTTPException( + status_code=404, + detail=api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) + publication = get_published_resource(f"workflow-output-{job.job_id}") + links: list[dict[str, Any]] = [ + { + "rel": "self", + "type": "application/json", + "href": str(request.url_for("get_ogc_job", job_id=job.job_id)), + }, + { + "rel": "results", + "type": "application/json", + "href": str(request.url_for("get_ogc_job_results", job_id=job.job_id)), + }, + ] + if publication is not None and publication.exposure == PublishedResourceExposure.OGC: + links.append( + { + "rel": "collection", + "type": "application/json", + "href": _collection_href(request, collection_id_for_resource(publication)), + } + ) + return { + "jobID": job.job_id, + "processID": job.process_id, + "status": job.status, + "created": job.created_at, + "updated": job.updated_at, + "links": links, + } + + +@router.get("/jobs/{job_id}/results", name="get_ogc_job_results") +def get_ogc_job_results(job_id: str, request: Request, extended: bool = False) -> dict[str, Any]: + """Return OGC API - Processes compliant results for a completed job.""" + job = get_job(job_id) + if job is None: + raise HTTPException( + status_code=404, + detail=api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) + result = get_job_result(job_id) + if result is None: + raise HTTPException( + status_code=409, + detail=api_error( + error="job_result_unavailable", + error_code="JOB_RESULT_UNAVAILABLE", + message=f"Result is not available for job '{job_id}'", + job_id=job_id, + status=str(job.status), + ), + ) + return _to_ogc_results(result=result, job_id=job_id, request=request, include_extended=extended) + + +@router.get("/jobs/{job_id}/download", name="download_ogc_job_output") 
+def download_ogc_job_output(job_id: str) -> FileResponse: + """Download the native artifact for a completed OGC job when available.""" + job = get_job(job_id) + if job is None: + raise HTTPException( + status_code=404, + detail=api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) + result = get_job_result(job_id) + if result is None: + raise HTTPException( + status_code=409, + detail=api_error( + error="job_result_unavailable", + error_code="JOB_RESULT_UNAVAILABLE", + message=f"Result is not available for job '{job_id}'", + job_id=job_id, + status=str(job.status), + ), + ) + output_file = result.get("output_file") + if not isinstance(output_file, str) or not output_file: + raise HTTPException( + status_code=404, + detail=api_error( + error="job_output_unavailable", + error_code="JOB_OUTPUT_UNAVAILABLE", + message=f"No downloadable output artifact is available for job '{job_id}'", + job_id=job_id, + ), + ) + output_path = Path(output_file).resolve() + downloads_root = DOWNLOAD_DIR.resolve() + if downloads_root not in output_path.parents or not output_path.exists(): + raise HTTPException( + status_code=404, + detail=api_error( + error="job_output_unavailable", + error_code="JOB_OUTPUT_UNAVAILABLE", + message=f"Output artifact for job '{job_id}' is not available for download", + job_id=job_id, + ), + ) + return FileResponse(output_path) + + +def _require_process(process_id: str) -> None: + if process_id != _PROCESS_ID: + raise HTTPException( + status_code=404, + detail=api_error( + error="process_not_found", + error_code="PROCESS_NOT_FOUND", + message=f"Unknown process_id '{process_id}'", + process_id=process_id, + ), + ) + + +def _run_async_workflow_job( + job_id: str, + normalized_request: Any, + workflow_id: str, + request_params: dict[str, Any], + include_component_run_details: bool, +) -> None: + try: + execute_workflow( + normalized_request, + workflow_id=workflow_id, + 
request_params=request_params, + include_component_run_details=include_component_run_details, + run_id=job_id, + ) + except HTTPException: + return + + +def _collection_href(request: Request, collection_id: str) -> str: + return str(request.base_url).rstrip("/") + f"/pygeoapi/collections/{collection_id}" + + +def _request_href(request: Request, **updates: Any) -> str: + params = dict(request.query_params) + for key, value in updates.items(): + if value is None: + params.pop(key, None) + else: + params[key] = str(value) + query = "&".join(f"{key}={value}" for key, value in params.items()) + suffix = f"?{query}" if query else "" + return f"{request.url.path}{suffix}" + + +def _wants_html(request: Request, f: str | None) -> bool: + if f is not None: + return f.lower() == "html" + accept = request.headers.get("accept", "") + return "text/html" in accept and "application/json" not in accept + + +def _to_ogc_results( + *, + result: dict[str, Any], + job_id: str, + request: Request, + include_extended: bool = False, +) -> dict[str, Any]: + """Transform native workflow results to an OGC API - Processes results envelope.""" + outputs: list[OGCOutputValue | OGCOutputReference] = [] + + native_outputs = result.get("outputs") + if isinstance(native_outputs, dict): + for output_id, output_value in native_outputs.items(): + if output_id in {"output_file", "data_value_set"}: + continue + outputs.append( + OGCOutputValue( + id=output_id, + value=output_value, + format=OGCOutputFormatInfo(media_type=_media_type_for_output_value(output_value)), + title=output_id.replace("_", " ").title(), + description=f"Process output: {output_id}", + ) + ) + + data_value_set = result.get("data_value_set") + if isinstance(data_value_set, dict): + outputs.append( + OGCOutputValue( + id="data_value_set", + value=data_value_set, + format=OGCOutputFormatInfo( + media_type="application/vnd.dhis2+json", + schema_url="https://dhis2.github.io/dhis2-api-specification/schemas/dataValueSet.json", + ), + 
title="DHIS2 DataValueSet", + description="Import-ready DHIS2 DataValueSet payload", + ) + ) + + download_href = _job_output_download_href(result=result, job_id=job_id, request=request) + if download_href is not None: + outputs.append( + OGCOutputReference( + id="output_file", + href=download_href, + format=OGCOutputFormatInfo(media_type=_media_type_for_path(str(result["output_file"]))), + title="Output File", + description="Downloadable native workflow artifact", + rel="related", + ) + ) + + if include_extended: + return OGCJobResultsExtended( + outputs=outputs, + metadata={ + "job_id": job_id, + "status": result.get("status"), + "run_id": result.get("run_id"), + "workflow_id": result.get("workflow_id"), + "workflow_version": result.get("workflow_version"), + "dataset_id": result.get("dataset_id"), + "bbox": result.get("bbox"), + "feature_count": result.get("feature_count"), + "value_count": result.get("value_count"), + "run_log_file": result.get("run_log_file"), + "component_runs": result.get("component_runs", []), + }, + ).model_dump(mode="json") + + return OGCJobResultsResponse(outputs=outputs).model_dump(mode="json") + + +def _job_output_download_href(*, result: dict[str, Any], job_id: str, request: Request) -> str | None: + output_file = result.get("output_file") + if not isinstance(output_file, str) or not output_file: + return None + output_path = Path(output_file).resolve() + downloads_root = DOWNLOAD_DIR.resolve() + if downloads_root not in output_path.parents or not output_path.exists(): + return None + return str(request.url_for("download_ogc_job_output", job_id=job_id)) + + +def _media_type_for_output_value(value: Any) -> str: + if isinstance(value, (dict, list, bool, int, float)) or value is None: + return "application/json" + return "text/plain" + + +def _media_type_for_path(path_value: str) -> str: + suffix = Path(path_value).suffix.lower() + if suffix == ".json": + return "application/json" + if suffix == ".geojson": + return "application/geo+json" 
+ if suffix in {".tif", ".tiff"}: + return "image/tiff" + if suffix == ".zarr": + return "application/vnd+zarr" + return "application/octet-stream" + + +def _render_ogc_root_html(body: dict[str, Any]) -> str: + # Map icon SVGs to navigation items by title # noqa: E501 + icons_map = { # noqa: E501 + "Browse Collections": ( # noqa: E501 + '' + ), + "List Processes": ( # noqa: E501 + '' + "" + ), + "List Jobs": ( # noqa: E501 + '' + '' + ), + "Conformance": ( # noqa: E501 + '' + ), + } + + nav_cards = "".join( + ( + '' + '
{icon}
' + '{title}' + '{description}' + '
→
' + "
" + ).format( + href=escape(item["href"]), + title=escape(item["title"]), + description=escape(item["description"]), + icon=icons_map.get( + item["title"], + ( # noqa: E501 + '' + '' + "" + ), + ), + ) + for item in body.get("navigation", []) + ) + return f""" + + + + + {escape(body["title"])} + + + +
+
OGC API
+

{escape(body["title"])}

+

{escape(body["description"])}

+ + + +
+

🌍 DHIS2 Earth Observation API • GitHub

+
+
+ +""" diff --git a/src/eo_api/ogc/schemas.py b/src/eo_api/ogc/schemas.py new file mode 100644 index 0000000..754b9b3 --- /dev/null +++ b/src/eo_api/ogc/schemas.py @@ -0,0 +1,48 @@ +"""Native OGC API - Processes schemas.""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + + +class OGCOutputFormatInfo(BaseModel): + """Format descriptor for one OGC process output.""" + + media_type: str = Field(description="IANA media type for the output payload") + schema_url: str | None = Field(default=None, description="Optional schema or specification URL") + encoding: str | None = Field(default="UTF-8", description="Character encoding when applicable") + + +class OGCOutputValue(BaseModel): + """Inline OGC process output.""" + + id: str = Field(description="Output identifier") + value: Any = Field(description="Inline output value") + format: OGCOutputFormatInfo = Field(description="Format metadata") + title: str | None = Field(default=None) + description: str | None = Field(default=None) + + +class OGCOutputReference(BaseModel): + """Referenced OGC process output.""" + + id: str = Field(description="Output identifier") + href: str = Field(description="Absolute URL to the referenced output") + format: OGCOutputFormatInfo = Field(description="Format metadata") + title: str | None = Field(default=None) + description: str | None = Field(default=None) + rel: str = Field(default="related", description="Relationship type") + + +class OGCJobResultsResponse(BaseModel): + """Strict OGC API - Processes results envelope.""" + + outputs: list[OGCOutputValue | OGCOutputReference] = Field(default_factory=list) + + +class OGCJobResultsExtended(OGCJobResultsResponse): + """Extended OGC results with native metadata.""" + + metadata: dict[str, Any] | None = Field(default=None) diff --git a/src/eo_api/ogc_api/__init__.py b/src/eo_api/ogc_api/__init__.py new file mode 100644 index 0000000..1433c7d --- /dev/null +++ 
b/src/eo_api/ogc_api/__init__.py @@ -0,0 +1,50 @@ +"""Mounted pygeoapi application with publication-aware runtime refresh.""" + +from __future__ import annotations + +import asyncio +import importlib +import os +from types import ModuleType +from typing import Any + +from starlette.types import Receive, Scope, Send + +from ..publications.pygeoapi import write_generated_pygeoapi_documents + +_STARLETTE_APP_MODULE = "pygeoapi.starlette_app" + + +class DynamicPygeoapiApp: + """Refresh pygeoapi runtime documents before serving mounted requests. + + This keeps the mounted publication surface aligned with live publication + truth without requiring an application restart after each publication + change. + """ + + def __init__(self) -> None: + self._module: ModuleType | None = None + # pygeoapi keeps request handlers and config as module globals. + # Serialize mounted requests so reloads cannot race with in-flight + # requests and produce mixed old/new publication state. + self._lock = asyncio.Lock() + + async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: + async with self._lock: + config_path, openapi_path = write_generated_pygeoapi_documents() + os.environ["PYGEOAPI_CONFIG"] = str(config_path) + os.environ["PYGEOAPI_OPENAPI"] = str(openapi_path) + + if self._module is None: + self._module = importlib.import_module(_STARLETTE_APP_MODULE) + else: + self._module = importlib.reload(self._module) + + app = getattr(self._module, "APP") + await app(scope, receive, send) + + +ogc_api_app: Any = DynamicPygeoapiApp() + +__all__ = ["ogc_api_app"] diff --git a/src/eo_api/publications/__init__.py b/src/eo_api/publications/__init__.py new file mode 100644 index 0000000..392d955 --- /dev/null +++ b/src/eo_api/publications/__init__.py @@ -0,0 +1 @@ +"""Publication registry package.""" diff --git a/src/eo_api/publications/capabilities.py b/src/eo_api/publications/capabilities.py new file mode 100644 index 0000000..61987dc --- /dev/null +++ 
b/src/eo_api/publications/capabilities.py @@ -0,0 +1,82 @@ +"""Publication serving capability policy.""" + +from __future__ import annotations + +from dataclasses import dataclass + +from .schemas import PublishedResourceExposure, PublishedResourceKind + + +@dataclass(frozen=True) +class PublicationServingCapability: + """Serving support for one publication contract.""" + + supported: bool + asset_format: str + served_by: tuple[str, ...] + ogc_collection: bool = False + error: str | None = None + + +def default_asset_format_for_kind(kind: PublishedResourceKind) -> str: + """Default asset format for a publication kind.""" + defaults = { + PublishedResourceKind.FEATURE_COLLECTION: "geojson", + PublishedResourceKind.COVERAGE: "zarr", + PublishedResourceKind.TILESET: "tiles", + PublishedResourceKind.COLLECTION: "json", + } + return defaults.get(kind, "file") + + +def evaluate_publication_serving( + *, + kind: PublishedResourceKind, + exposure: PublishedResourceExposure, + asset_format: str | None, +) -> PublicationServingCapability: + """Evaluate whether the server can expose a publication contract.""" + normalized_format = (asset_format or default_asset_format_for_kind(kind)).strip().lower() + + if exposure == PublishedResourceExposure.REGISTRY_ONLY: + return PublicationServingCapability( + supported=True, + asset_format=normalized_format, + served_by=("registry",), + ogc_collection=False, + ) + + supported_matrix: dict[tuple[PublishedResourceKind, str], PublicationServingCapability] = { + ( + PublishedResourceKind.FEATURE_COLLECTION, + "geojson", + ): PublicationServingCapability( + supported=True, + asset_format="geojson", + served_by=("pygeoapi", "analytics"), + ogc_collection=True, + ), + ( + PublishedResourceKind.COVERAGE, + "zarr", + ): PublicationServingCapability( + supported=True, + asset_format="zarr", + served_by=("pygeoapi", "raster"), + ogc_collection=True, + ), + } + capability = supported_matrix.get((kind, normalized_format)) + if capability is not 
None: + return capability + + return PublicationServingCapability( + supported=False, + asset_format=normalized_format, + served_by=(), + ogc_collection=False, + error=( + "Unsupported publication serving contract: " + f"kind='{kind}', asset_format='{normalized_format}', exposure='{exposure}'" + ), + ) diff --git a/src/eo_api/publications/generated_routes.py b/src/eo_api/publications/generated_routes.py new file mode 100644 index 0000000..dbd9533 --- /dev/null +++ b/src/eo_api/publications/generated_routes.py @@ -0,0 +1,29 @@ +"""Routes exposing generated pygeoapi documents from publication truth.""" + +from fastapi import APIRouter + +from .pygeoapi import build_pygeoapi_config, build_pygeoapi_openapi, write_generated_pygeoapi_documents + +router = APIRouter() + + +@router.get("/pygeoapi/config") +def get_generated_pygeoapi_config() -> dict[str, object]: + """Return generated pygeoapi config from backend publication truth.""" + return build_pygeoapi_config() + + +@router.get("/pygeoapi/openapi") +def get_generated_pygeoapi_openapi() -> dict[str, object]: + """Return generated pygeoapi OpenAPI projection from backend publication truth.""" + return build_pygeoapi_openapi() + + +@router.post("/pygeoapi/materialize") +def materialize_generated_pygeoapi_documents() -> dict[str, str]: + """Write generated pygeoapi documents to disk for runtime wiring.""" + config_path, openapi_path = write_generated_pygeoapi_documents() + return { + "config_path": str(config_path), + "openapi_path": str(openapi_path), + } diff --git a/src/eo_api/publications/pygeoapi.py b/src/eo_api/publications/pygeoapi.py new file mode 100644 index 0000000..4cfa650 --- /dev/null +++ b/src/eo_api/publications/pygeoapi.py @@ -0,0 +1,286 @@ +"""Generate pygeoapi-facing documents from backend publication state.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any +from urllib.parse import urlsplit + +import yaml + +from ..data_manager.services.downloader import 
def build_pygeoapi_openapi(*, server_url: str = _DEFAULT_SERVER_URL) -> dict[str, Any]:
    """Project the generated pygeoapi config into a minimal OpenAPI document.

    Only the collection listing and one GET path per generated collection are
    described; response schemas are intentionally left loose.
    """
    generated = build_pygeoapi_config(server_url=server_url)
    collection_ids = generated["resources"]

    paths: dict[str, Any] = {
        "/collections": {
            "get": {
                "summary": "Collections",
                "operationId": "getCollections",
                "responses": {"200": {"description": "successful operation"}},
            }
        }
    }
    for collection_id in collection_ids:
        paths[f"/collections/{collection_id}"] = {
            "get": {
                "summary": f"Collection {collection_id}",
                "operationId": f"getCollection_{collection_id.replace('-', '_')}",
                "responses": {"200": {"description": "successful operation"}},
            }
        }

    return {
        "openapi": "3.0.2",
        "info": {
            "title": "DHIS2 EO API",
            "description": "Generated pygeoapi OpenAPI projection from backend publication truth",
            "version": "0.1.0",
        },
        "servers": [{"url": server_url}],
        "paths": paths,
        "x-generated-resources": list(collection_ids.keys()),
    }
_temporal_extent_for_resource(resource), + }, + "providers": [provider], + "metadata": { + "resource_id": resource.resource_id, + "resource_class": str(resource.resource_class), + "dataset_id": resource.dataset_id, + "workflow_id": resource.workflow_id, + "job_id": resource.job_id, + "kind": str(resource.kind), + **resource.metadata, + }, + } + + +def _bbox_for_resource(resource: PublishedResource) -> list[list[float]]: + bbox = resource.metadata.get("bbox") + if isinstance(bbox, list) and bbox: + return [bbox] + return [[-180.0, -90.0, 180.0, 90.0]] + + +def _temporal_extent_for_resource(resource: PublishedResource) -> dict[str, str | None]: + metadata = resource.metadata + start = metadata.get("time_start") + end = metadata.get("time_end") + if start is not None or end is not None: + return { + "begin": str(start) if start is not None else None, + "end": str(end) if end is not None else None, + } + period_type = metadata.get("period_type") + if period_type is not None: + value = str(period_type) + return {"begin": value, "end": value} + return {"begin": None, "end": None} + + +def _keywords_for_resource(resource: PublishedResource) -> list[str]: + keywords = ["EO", "DHIS2", str(resource.resource_class), str(resource.kind)] + if resource.dataset_id is not None: + keywords.append(resource.dataset_id) + if resource.workflow_id is not None: + keywords.append(resource.workflow_id) + return keywords + + +def _description_for_resource(resource: PublishedResource) -> str: + metadata = resource.metadata + + if resource.resource_class == PublishedResourceClass.SOURCE: + source = metadata.get("source") + variable = metadata.get("variable") + period_type = metadata.get("period_type") + parts = ["Source dataset"] + if source: + parts.append(f"from {source}") + if variable: + parts.append(f"for {variable}") + if period_type: + parts.append(f"with {period_type} cadence") + return " ".join(parts) + "." 
def _build_provider(resource: PublishedResource) -> dict[str, Any]:
    """Map one published resource onto a pygeoapi provider definition.

    Raises:
        ValueError: for resources no provider can serve yet.
    """
    if resource.kind == PublishedResourceKind.COVERAGE:
        return _coverage_provider(resource)
    if resource.kind == PublishedResourceKind.FEATURE_COLLECTION and resource.path is not None:
        if Path(resource.path).suffix.lower() == ".geojson":
            return {
                "name": "GeoJSON",
                "type": "feature",
                "data": resource.path,
                "id_field": "id",
                "default": True,
            }
    raise ValueError(f"Resource '{resource.resource_id}' is not yet mappable to a pygeoapi provider")


def _coverage_provider(resource: PublishedResource) -> dict[str, Any]:
    """Provider for a coverage resource; resolves the backing Zarr store."""
    if resource.path is not None:
        store = Path(resource.path)
    else:
        # No explicit path: fall back to the registry's cached Zarr archive.
        dataset = get_dataset(str(resource.dataset_id))
        if dataset is None:
            raise ValueError(f"Unknown dataset_id '{resource.dataset_id}' for resource '{resource.resource_id}'")
        store = get_zarr_path(dataset)
        if store is None:
            raise ValueError(f"No zarr cache available for dataset '{resource.dataset_id}'")
    return {
        "name": "xarray",
        "type": "coverage",
        "data": str(store),
        "default": True,
    }
_build_provider(resource) + except ValueError: + continue + resources.append(resource) + return resources + + +def _pygeoapi_links(resource: PublishedResource) -> list[dict[str, str]]: + links: list[dict[str, str]] = [] + for link in resource.links: + href = str(link.get("href", "")) + rel = str(link.get("rel", "related")) + if href == "": + continue + link_type = "text/html" if rel == "analytics" else "application/json" + if rel == "analytics": + title = "Analytics Viewer" + elif rel == "raster-capabilities": + title = "Raster Rendering Capabilities" + else: + title = rel.replace("-", " ").title() + links.append( + { + "type": link_type, + "rel": rel, + "title": title, + "href": _absolute_ogc_href(href), + } + ) + return links + + +def _absolute_ogc_href(href: str) -> str: + if href.startswith("http://") or href.startswith("https://"): + return href + parsed = urlsplit(_DEFAULT_SERVER_URL) + origin = f"{parsed.scheme}://{parsed.netloc}" + return f"{origin}{href}" diff --git a/src/eo_api/publications/pygeoapi_templates/collections/collection.html b/src/eo_api/publications/pygeoapi_templates/collections/collection.html new file mode 100644 index 0000000..79a5c31 --- /dev/null +++ b/src/eo_api/publications/pygeoapi_templates/collections/collection.html @@ -0,0 +1,641 @@ +{% extends "_base.html" %} +{% block title %}{{ super() }} {{ data['title'] }} {% endblock %} +{% block desc %}{{ data.get('description','') | truncate(250) }}{% endblock %} +{% block tags %}{{ data.get('keywords',[]) | join(',') }}{% endblock %} +{% block crumbs %}{{ super() }} +/ {% trans %}Collections{% endtrans %} +/ {{ data['title'] | truncate( 25 ) }} +{% endblock %} + +{% block extrahead %} + + + + +{% endblock %} + +{% block body %} +
+ {% set resource_config = config.get('resources', {}).get(data['id'], {}) %} + {% set resource_metadata = resource_config.get('metadata', {}) %} + {% set providers = resource_config.get('providers', []) %} + {% set raster_links = data['links'] | selectattr('rel', 'equalto', 'raster-capabilities') | list %} + {% set variable_name = resource_metadata.get('variable', '...') %} + {% set prefers_aggregated_view = data['id'].startswith('chirps') or variable_name == 'precip' %} + {% set single_preview_href = '/raster/' ~ data['id'] ~ '/preview.png?variable=' ~ variable_name ~ '&datetime=2024-01-01' %} + {% set aggregate_preview_href = '/raster/' ~ data['id'] ~ '/preview.png?variable=' ~ variable_name ~ '&aggregation=sum&start=2024-01-01&end=2024-01-31' %} + {% set tilejson_href = '/raster/' ~ data['id'] ~ '/WebMercatorQuad/tilejson.json?variable=' ~ variable_name ~ '&datetime=2024-01-01' %} +
+
+

{{ data['title'] }}

+

{{ data['description'] }}

+

+ {% for kw in data['keywords'] %} + {{ kw }} + {% endfor %} +

+
+
+
+
+ {% if providers and providers[0]['type'] == 'coverage' %} +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
Loading raster layer…
+
+ 0 +
+ 50 +
+ {% endif %} +
+ {% if providers and providers[0]['type'] == 'coverage' %} +
+ Click the map to inspect the rendered raster value for the active selection. +
+ {% endif %} +
+
+ + {% set ns = namespace(header_printed=false) %} + {% for link in data['links'] %} + {% if link['rel'] == 'license' %} + {% if not ns.header_printed %} +

{% trans %}License{% endtrans %}

+ {% set ns.header_printed = true %} + {% endif %} + + {% endif %} + {% endfor %} + + {% if data['itemType'] == 'feature' or data['itemType'] == 'record' %} + {% set analytics_links = data['links'] | selectattr('rel', 'equalto', 'analytics') | list %} +
+
+

{% trans %}Browse{% endtrans %}

+

{% trans %}Open the collection items view with map and attribute table.{% endtrans %}

+
+ {% trans %}Open items{% endtrans %} + +
+ {% if analytics_links %} +
+

{% trans %}Analytics{% endtrans %}

+

{% trans %}Open the separate analytics viewer for this published resource.{% endtrans %}

+ {% for link in analytics_links %} + + {{ link['title'] or 'Analytics Viewer' }} + + {% endfor %} +
+ {% endif %} +
+

{% trans %}Queryables{% endtrans %}

+

{% trans %}Inspect the fields that can be used for filtering, such as period.{% endtrans %}

+ + {% trans %}Open queryables{% endtrans %} + +
+
+

{% trans %}Schema{% endtrans %}

+

{% trans %}View the collection schema exposed by the publication layer.{% endtrans %}

+ + {% trans %}Open schema{% endtrans %} + +
+ {% for provider in providers %} + {% if 'tile' in provider['type'] %} +
+

{% trans %}Tiles{% endtrans %}

+

{% trans %}Open tile endpoints for map-oriented publication access.{% endtrans %}

+ + {% trans %}Open tiles{% endtrans %} + +
+ {% endif %} + {% endfor %} +
+ {% endif %} + + {% if providers and providers[0]['type'] == 'coverage' %} +
+
+

{% trans %}Raster Rendering{% endtrans %}

+

{% trans %}This collection exposes raster rendering via the backend-owned raster publication surface.{% endtrans %}

+ {% for link in raster_links %} + + {{ link['title'] or 'Raster Capabilities' }} + + {% endfor %} +
+
+

{% trans %}Temporal Contract{% endtrans %}

+

{% trans %}Temporal raster rendering requires either a single date or an aggregation window.{% endtrans %}

+ ?datetime=YYYY-MM-DD
+ ?aggregation=sum&start=YYYY-MM-DD&end=YYYY-MM-DD + +
+
+

{% trans %}Variable{% endtrans %}

+

{% trans %}Use the dataset variable when requesting preview, tilejson, or tile rendering.{% endtrans %}

+ variable={{ variable_name }} + +
+
+

{% trans %}Default Styling{% endtrans %}

+

{% trans %}Rendering uses dataset-aware defaults so previews are readable without manual rescaling.{% endtrans %}

+
CHIRPS defaults: colormap ylorrd, single date 0,50, monthly sum 0,300
+
+
+ {% endif %} + + {% if 'parameter_names' in data %} +

Parameters

+ + + + + + + {% for parameter in data['parameter_names'].values() %} + + + + + + {% endfor %} +
idnameunits
{{ parameter['id'] }}{{ parameter['name'] }}{{ parameter['unit']['symbol']['value'] }}
+ {% endif %} + +

{% trans %}Links{% endtrans %}

+

{% trans %}Raw protocol and related links for this collection.{% endtrans %}

+ + {% if data['itemType'] == 'feature' %} +

{% trans %}Reference Systems{% endtrans %}

+
    + {% for crs in data['crs'] %} +
  • + {{ crs }} +
  • + {% endfor %} +
+

{% trans %}Storage CRS{% endtrans %}

+ + {% endif %} + +
router = APIRouter()


@router.get("", response_model=PublishedResourceListResponse)
def list_publications(
    resource_class: PublishedResourceClass | None = None,
    dataset_id: str | None = None,
    workflow_id: str | None = None,
    exposure: PublishedResourceExposure | None = None,
) -> PublishedResourceListResponse:
    """List backend-owned published resources, optionally filtered."""
    # Source datasets are seeded lazily so the registry is never empty.
    ensure_source_dataset_publications()
    matching = list_published_resources(
        resource_class=resource_class,
        dataset_id=dataset_id,
        workflow_id=workflow_id,
        exposure=exposure,
    )
    return PublishedResourceListResponse(resources=matching)


@router.get("/{resource_id}", response_model=PublishedResource)
def get_publication(resource_id: str) -> PublishedResource:
    """Return one published resource, or a structured 404 when unknown."""
    ensure_source_dataset_publications()
    resource = get_published_resource(resource_id)
    if resource is not None:
        return resource
    raise HTTPException(
        status_code=404,
        detail=api_error(
            error="published_resource_not_found",
            error_code="PUBLISHED_RESOURCE_NOT_FOUND",
            message=f"Unknown resource_id '{resource_id}'",
            resource_id=resource_id,
        ),
    )
published resources.""" + +from __future__ import annotations + +from enum import StrEnum +from typing import Any + +from pydantic import BaseModel, Field + + +class PublishedResourceClass(StrEnum): + """High-level publication origin.""" + + SOURCE = "source" + DERIVED = "derived" + + +class PublishedResourceKind(StrEnum): + """Supported OGC-facing resource kinds.""" + + COLLECTION = "collection" + COVERAGE = "coverage" + FEATURE_COLLECTION = "feature_collection" + TILESET = "tileset" + + +class PublishedResourceExposure(StrEnum): + """Whether a registered resource should be surfaced via OGC.""" + + REGISTRY_ONLY = "registry_only" + OGC = "ogc" + + +class PublishedResource(BaseModel): + """Backend-owned publication state for one discoverable resource.""" + + resource_id: str + resource_class: PublishedResourceClass + kind: PublishedResourceKind + title: str + description: str + dataset_id: str | None = None + workflow_id: str | None = None + job_id: str | None = None + run_id: str | None = None + path: str | None = None + ogc_path: str | None = None + asset_format: str | None = None + exposure: PublishedResourceExposure = PublishedResourceExposure.REGISTRY_ONLY + created_at: str + updated_at: str + metadata: dict[str, Any] = Field(default_factory=dict) + links: list[dict[str, Any]] = Field(default_factory=list) + + +class PublishedResourceListResponse(BaseModel): + """List of published resources.""" + + resources: list[PublishedResource] diff --git a/src/eo_api/publications/services.py b/src/eo_api/publications/services.py new file mode 100644 index 0000000..ee2392f --- /dev/null +++ b/src/eo_api/publications/services.py @@ -0,0 +1,352 @@ +"""Disk-backed published resource registry.""" + +from __future__ import annotations + +import datetime as dt +import json +import logging +from pathlib import Path +from typing import TYPE_CHECKING + +from ..data_accessor.services.accessor import get_data_coverage +from ..data_manager.services.downloader import DOWNLOAD_DIR, 
get_zarr_path +from ..data_registry.services.datasets import list_datasets +from .capabilities import ( + PublicationServingCapability, + default_asset_format_for_kind, + evaluate_publication_serving, +) +from .schemas import PublishedResource, PublishedResourceClass, PublishedResourceExposure, PublishedResourceKind + +if TYPE_CHECKING: + from ..workflows.schemas import WorkflowExecuteResponse + +logger = logging.getLogger(__name__) + +_LEGACY_PYGEOAPI_PREFIX = "/ogcapi" +_PYGEOAPI_PREFIX = "/pygeoapi" + + +def ensure_source_dataset_publications() -> list[PublishedResource]: + """Seed published source dataset resources from the dataset registry.""" + resources: list[PublishedResource] = [] + for dataset in list_datasets(): + resource_id = f"dataset-{dataset['id']}" + existing = get_published_resource(resource_id) + timestamp = _utc_now() + coverage_metadata = _coverage_metadata_for_dataset(dataset) + record = PublishedResource( + resource_id=resource_id, + resource_class=PublishedResourceClass.SOURCE, + kind=PublishedResourceKind.COVERAGE, + title=str(dataset.get("name") or dataset["id"]), + description=( + f"Source dataset from {dataset.get('source') or dataset['id']}" + f" for {dataset.get('variable') or dataset['id']}" + f" with {dataset.get('period_type') or 'native'} cadence." 
def _coverage_metadata_for_dataset(dataset: dict[str, object]) -> dict[str, object]:
    """Best-effort spatial/temporal metadata for one source dataset.

    Returns an empty dict whenever the dataset has no Zarr cache or its
    coverage cannot be derived; callers treat this metadata as optional.
    """
    zarr_path = get_zarr_path(dataset)
    if zarr_path is None:
        logger.info(
            "Skipping coverage metadata for dataset '%s': no zarr archive available",
            dataset.get("id"),
        )
        return {}

    try:
        coverage = get_data_coverage(dataset).get("coverage")
    except (OSError, RuntimeError, ValueError) as exc:
        # Expected failure modes (missing files, bad stores) log quietly.
        logger.warning(
            "Skipping coverage metadata for dataset '%s': %s",
            dataset.get("id"),
            exc,
        )
        return {}
    except Exception:
        # Unexpected errors are recorded with a traceback but remain non-fatal.
        logger.exception("Could not derive coverage metadata for dataset '%s'", dataset.get("id"))
        return {}

    if not isinstance(coverage, dict):
        return {}

    metadata: dict[str, object] = {}

    spatial = coverage.get("spatial")
    if isinstance(spatial, dict):
        bounds = [spatial.get(key) for key in ("xmin", "ymin", "xmax", "ymax")]
        # Record a bbox only when every bound is present. The previous
        # runtime `assert` used for type narrowing was removed: asserts are
        # stripped under `python -O`, so they must not carry logic.
        if all(value is not None for value in bounds):
            metadata["bbox"] = [float(value) for value in bounds]  # type: ignore[arg-type]

    temporal = coverage.get("temporal")
    if isinstance(temporal, dict):
        start = temporal.get("start")
        end = temporal.get("end")
        if start is not None:
            metadata["time_start"] = str(start)
        if end is not None:
            metadata["time_end"] = str(end)

    return metadata
f"/analytics/publications/{resource_id}/viewer"}) + record = PublishedResource( + resource_id=resource_id, + resource_class=PublishedResourceClass.DERIVED, + kind=kind, + title=f"{response.workflow_id} output for {response.dataset_id}", + description=( + f"Derived workflow output from {response.workflow_id} for {response.dataset_id} " + f"({response.feature_count or 0} features, {response.value_count or 0} values)." + ), + dataset_id=response.dataset_id, + workflow_id=response.workflow_id, + job_id=response.run_id, + run_id=response.run_id, + path=publication_path, + ogc_path=ogc_path, + asset_format=resolved_asset_format, + exposure=exposure, + created_at=existing.created_at if existing is not None else timestamp, + updated_at=timestamp, + metadata={ + "workflow_id": response.workflow_id, + "workflow_version": response.workflow_version, + "dataset_id": response.dataset_id, + "feature_count": response.feature_count, + "value_count": response.value_count, + "bbox": response.bbox, + "native_output_file": response.output_file, + "period_count": analytics_metadata["period_count"], + "has_period_field": analytics_metadata["has_period_field"], + "analytics_eligible": analytics_metadata["eligible"], + }, + links=links, + ) + _write_resource(record) + return record + + +def _derived_resource_ogc_path(*, resource_id: str, capability: PublicationServingCapability) -> str | None: + if capability.ogc_collection: + return f"/pygeoapi/collections/{resource_id}" + return None + + +def list_published_resources( + *, + resource_class: PublishedResourceClass | None = None, + dataset_id: str | None = None, + workflow_id: str | None = None, + exposure: PublishedResourceExposure | None = None, +) -> list[PublishedResource]: + """List persisted published resources.""" + resources: list[PublishedResource] = [] + for path in _resources_dir().glob("*.json"): + resource = PublishedResource.model_validate_json(path.read_text(encoding="utf-8")) + 
resources.append(_normalize_pygeoapi_resource_links(resource)) + resources.sort(key=lambda item: item.created_at, reverse=True) + if resource_class is not None: + resources = [item for item in resources if item.resource_class == resource_class] + if dataset_id is not None: + resources = [item for item in resources if item.dataset_id == dataset_id] + if workflow_id is not None: + resources = [item for item in resources if item.workflow_id == workflow_id] + if exposure is not None: + resources = [item for item in resources if item.exposure == exposure] + return resources + + +def get_published_resource(resource_id: str) -> PublishedResource | None: + """Fetch a single published resource.""" + path = _resource_path(resource_id) + if not path.exists(): + return None + resource = PublishedResource.model_validate_json(path.read_text(encoding="utf-8")) + return _normalize_pygeoapi_resource_links(resource) + + +def delete_published_resource(resource_id: str) -> PublishedResource | None: + """Delete one persisted published resource if it exists.""" + resource = get_published_resource(resource_id) + if resource is None: + return None + path = _resource_path(resource_id) + if path.exists(): + path.unlink() + return resource + + +def get_published_resource_by_collection_id(collection_id: str) -> PublishedResource | None: + """Resolve an OGC collection identifier to a published resource.""" + ensure_source_dataset_publications() + for resource in list_published_resources(exposure=PublishedResourceExposure.OGC): + if _collection_id_for_resource(resource) == collection_id: + return resource + return None + + +def collection_id_for_resource(resource: PublishedResource) -> str: + """Return the OGC collection identifier for a published resource.""" + return _collection_id_for_resource(resource) + + +def _write_resource(resource: PublishedResource) -> None: + _resources_dir().mkdir(parents=True, exist_ok=True) + 
_resource_path(resource.resource_id).write_text(resource.model_dump_json(indent=2), encoding="utf-8") + + +def _resource_path(resource_id: str) -> Path: + return _resources_dir() / f"{resource_id}.json" + + +def _resources_dir() -> Path: + return DOWNLOAD_DIR / "published_resources" + + +def _utc_now() -> str: + return dt.datetime.now(dt.timezone.utc).isoformat() + + +def _collection_id_for_resource(resource: PublishedResource) -> str: + if resource.resource_class == PublishedResourceClass.SOURCE and resource.dataset_id is not None: + return resource.dataset_id + return resource.resource_id + + +def _normalize_pygeoapi_resource_links(resource: PublishedResource) -> PublishedResource: + updates: dict[str, object] = {} + + if resource.ogc_path and resource.ogc_path.startswith(_LEGACY_PYGEOAPI_PREFIX): + updates["ogc_path"] = resource.ogc_path.replace(_LEGACY_PYGEOAPI_PREFIX, _PYGEOAPI_PREFIX, 1) + + normalized_links: list[dict[str, object]] = [] + links_changed = False + for link in resource.links: + normalized_link = dict(link) + href = normalized_link.get("href") + if isinstance(href, str) and href.startswith(_LEGACY_PYGEOAPI_PREFIX): + normalized_link["href"] = href.replace(_LEGACY_PYGEOAPI_PREFIX, _PYGEOAPI_PREFIX, 1) + links_changed = True + normalized_links.append(normalized_link) + + if links_changed: + updates["links"] = normalized_links + + if not updates: + return resource + + return resource.model_copy(update=updates) + + +def _analytics_metadata_for_published_asset(path_value: str | None) -> dict[str, bool | int]: + if path_value is None: + return {"eligible": False, "period_count": 0, "has_period_field": False} + + path = Path(path_value) + if path.suffix.lower() != ".geojson" or not path.exists(): + return {"eligible": False, "period_count": 0, "has_period_field": False} + + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return {"eligible": False, "period_count": 0, "has_period_field": False} + 
+ features = payload.get("features") + if not isinstance(features, list): + return {"eligible": False, "period_count": 0, "has_period_field": False} + + periods: set[str] = set() + has_period_field = False + for feature in features: + if not isinstance(feature, dict): + continue + properties = feature.get("properties", {}) + if not isinstance(properties, dict): + continue + if "period" in properties: + has_period_field = True + value = properties.get("period") + if value is not None: + periods.add(str(value)) + + return { + "eligible": has_period_field and len(periods) > 1, + "period_count": len(periods), + "has_period_field": has_period_field, + } diff --git a/src/eo_api/raster/__init__.py b/src/eo_api/raster/__init__.py new file mode 100644 index 0000000..fe28f23 --- /dev/null +++ b/src/eo_api/raster/__init__.py @@ -0,0 +1,3 @@ +from . import routes as routes + +__all__ = ["routes"] diff --git a/src/eo_api/raster/routes.py b/src/eo_api/raster/routes.py new file mode 100644 index 0000000..a105055 --- /dev/null +++ b/src/eo_api/raster/routes.py @@ -0,0 +1,517 @@ +"""Raster publication routes and Zarr-backed TiTiler integration.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, cast + +import attr +from fastapi import APIRouter, Depends, HTTPException, Query, Request +from fastapi import Path as FastAPIPath +from rio_tiler.colormap import cmap +from rio_tiler.io.xarray import XarrayReader +from titiler.core.dependencies import ImageRenderingParams +from titiler.core.routing import EndpointScope +from titiler.xarray.dependencies import XarrayParams +from titiler.xarray.extensions import VariablesExtension +from titiler.xarray.factory import TilerFactory +from titiler.xarray.io import Reader, get_variable + +from ..data_manager.services.downloader import get_zarr_path +from ..data_registry.services.datasets import get_dataset +from ..publications.schemas import PublishedResource, 
@router.get("/{resource_id}/capabilities")
def get_raster_capabilities(resource_id: str) -> dict[str, Any]:
    """Describe whether a published resource is TiTiler-eligible.

    Returns the identifying fields plus a 'titiler' capability report
    produced by the internal capability evaluation.
    """
    resource = _resolve_published_resource(resource_id)
    # Evaluate capabilities before assembling the payload (matches the
    # original evaluation order in case resolution raises).
    titiler_report = _titiler_capabilities(resource)
    payload: dict[str, Any] = {
        "resource_id": resource.resource_id,
        "collection_id": collection_id_for_resource(resource),
        "kind": str(resource.kind),
        "asset_format": resource.asset_format,
    }
    payload["titiler"] = titiler_report
    return payload
@dataclass
class RasterReaderParams(XarrayParams):
    """Reader params with a user-facing temporal selector.

    Adds temporal query parameters on top of TiTiler's ``XarrayParams``:
    ``datetime`` is rewritten into the base class's generic ``sel`` selector
    list in ``__post_init__``, while ``aggregation``/``start``/``end`` are
    passed through to the reader (``AggregatingReader`` declares matching
    attributes).
    """

    datetime: str | None = Query(
        default=None,
        description="Time slice to render for temporal datasets, for example `2024-01-01`.",
    )
    aggregation: str | None = Query(
        default=None,
        description="Temporal aggregation to apply before rendering, for example `sum` or `mean`.",
    )
    start: str | None = Query(
        default=None,
        description="Start date for temporal aggregation, for example `2024-01-01`.",
    )
    end: str | None = Query(
        default=None,
        description="End date for temporal aggregation, for example `2024-01-31`.",
    )

    def __post_init__(self) -> None:
        # Fold the user-facing `datetime` shortcut into the `sel` selector
        # list understood by the xarray reader machinery.
        selector_values = list(self.sel or [])
        if self.datetime is not None:
            selector_values.append(f"time={self.datetime}")
        self.sel = selector_values or None

    def as_dict(self, exclude_none: bool = True) -> dict[Any, Any]:
        # `datetime` was already folded into `sel`; drop it so downstream
        # consumers do not receive an unexpected keyword.
        values = super().as_dict(exclude_none=exclude_none)
        values.pop("datetime", None)
        return values
@attr.s
class AggregatingReader(Reader):
    """Xarray reader that can collapse a temporal dimension before rendering.

    Extends the TiTiler xarray ``Reader`` with optional ``aggregation`` /
    ``start`` / ``end`` attributes; when ``aggregation`` is set, the selected
    variable is reduced along its time dimension before tiles are produced.
    """

    # Optional temporal reduction (e.g. "sum", "mean") applied after variable
    # selection; None means render the raw selected slice.
    aggregation: str | None = attr.ib(default=None)
    # Date-range bounds for the reduction window (ISO date strings);
    # presumably inclusive via xarray label slicing — confirm.
    start: str | None = attr.ib(default=None)
    end: str | None = attr.ib(default=None)

    def __attrs_post_init__(self) -> None:
        # Open the dataset and select the requested variable, honouring any
        # `sel` selectors (including the time selector added by
        # RasterReaderParams).
        opener_options = {
            "group": self.group,
            "decode_times": self.decode_times,
            **self.opener_options,
        }

        self.ds = self.opener(self.src_path, **opener_options)
        self.input = get_variable(
            self.ds,
            self.variable,
            sel=self.sel,
        )

        # Collapse the time dimension before handing the array to the
        # rendering machinery, so tiles show the aggregated value.
        if self.aggregation is not None:
            self.input = _aggregate_temporal_dataarray(
                self.input,
                aggregation=self.aggregation,
                start=self.start,
                end=self.end,
            )

        # Finish initialisation at the XarrayReader level rather than via
        # Reader's own __attrs_post_init__, which this method replaces.
        XarrayReader.__attrs_post_init__(self)
error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message="Use either 'datetime' or 'aggregation' with a date range, not both.", + resource_id=resource.resource_id, + ), + ) + + if aggregation is not None: + if aggregation not in SUPPORTED_AGGREGATIONS: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message=( + f"Unsupported aggregation '{aggregation}'. " + f"Supported values: {', '.join(sorted(SUPPORTED_AGGREGATIONS))}." + ), + resource_id=resource.resource_id, + ), + ) + if not start or not end: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message="Temporal aggregation requires both 'start' and 'end' query parameters.", + resource_id=resource.resource_id, + ), + ) + if time_selectors: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message="Do not combine 'aggregation' with a direct 'sel=time=...' selector.", + resource_id=resource.resource_id, + ), + ) + return + + if start or end: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message="Use 'start' and 'end' only together with an 'aggregation' query parameter.", + resource_id=resource.resource_id, + ), + ) + + if datetime_value or time_selectors: + return + + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_datetime_required", + error_code="RASTER_DATETIME_REQUIRED", + message=( + f"Temporal raster rendering for dataset '{dataset['id']}' requires a time selector. " + "Use '?datetime=YYYY-MM-DD' or '?aggregation=sum&start=YYYY-MM-DD&end=YYYY-MM-DD'." 
def _colormap_dependency(
    resource_id: str,
    colormap_name: str | None = Query(default=None, description="Named colormap override."),
    colormap: str | None = Query(default=None, description="JSON encoded custom colormap override."),
    aggregation: str | None = Query(default=None),
) -> Any:
    """Resolve the colormap for a render request, preferring explicit overrides."""
    if colormap_name:
        return cmap.get(colormap_name)

    if colormap:
        # Delegate explicit custom colormap handling back to TiTiler callers.
        from titiler.core.dependencies import create_colormap_dependency

        return create_colormap_dependency(cmap)(colormap_name=None, colormap=colormap)

    # Neither override present: fall back to the dataset's style profile.
    styled = _style_profile_for_resource(resource_id)
    if styled is None:
        return None

    named_map = cmap.get(str(styled["colormap_name"]))
    if not isinstance(named_map, dict):
        return named_map
    defaulted = cast(dict[Any, Any], named_map.copy())
    if str(styled["colormap_name"]) in {"ylorrd", "blues", "viridis"}:
        # Map value 0 to fully transparent so zero/no-data pixels don't paint.
        defaulted[0] = (0, 0, 0, 0)
    return defaulted
def _resolve_published_resource(resource_id: str) -> PublishedResource:
    """Look up a published resource by resource id, then collection id; 404 otherwise."""
    ensure_source_dataset_publications()
    found = get_published_resource(resource_id)
    if found is None:
        # The path segment may also be a collection id alias.
        found = get_published_resource_by_collection_id(resource_id)
    if found is not None:
        return found
    raise HTTPException(
        status_code=404,
        detail=api_error(
            error="published_resource_not_found",
            error_code="PUBLISHED_RESOURCE_NOT_FOUND",
            message=f"Unknown published resource or collection '{resource_id}'",
            resource_id=resource_id,
        ),
    )
def _style_profile_for_dataset(dataset: dict[str, Any]) -> dict[str, Any] | None:
    """Pick a style profile: explicit registry entry first, then a heuristic fallback."""
    registered = RASTER_STYLE_PROFILES.get(str(dataset["id"]))
    if registered is not None:
        return registered

    # Heuristic: anything measured in millimetres, or whose variable name
    # mentions precipitation, gets the precipitation colour ramp.
    unit_text = str(dataset.get("units") or "").lower()
    variable_text = str(dataset.get("variable") or "").lower()
    looks_like_precipitation = "mm" in unit_text or "precip" in variable_text
    if not looks_like_precipitation:
        return None

    return {
        "colormap_name": "ylorrd",
        "rescale_by_mode": {
            "datetime": (0.0, 50.0),
            "sum": (0.0, 300.0),
            "mean": (0.0, 50.0),
            "max": (0.0, 100.0),
            "min": (0.0, 20.0),
        },
        "label": "Precipitation intensity",
    }
None, + *, + aggregation: str | None, +) -> tuple[float, float] | None: + if profile is None: + return None + rescale_by_mode = profile.get("rescale_by_mode", {}) + mode = aggregation or "datetime" + range_value = rescale_by_mode.get(mode) or rescale_by_mode.get("datetime") + if range_value is None: + return None + return cast(tuple[float, float], tuple(range_value)) + + +def _aggregate_temporal_dataarray( + data_array: Any, + *, + aggregation: str, + start: str | None, + end: str | None, +) -> Any: + if "time" not in data_array.dims: + raise ValueError("Temporal aggregation requires a 'time' dimension") + + time_window = data_array.sel(time=slice(start, end)) + if time_window.sizes.get("time", 0) == 0: + raise ValueError("Temporal aggregation produced no time slices for the requested date range") + + aggregate_fn = getattr(time_window, aggregation, None) + if aggregate_fn is None: + raise ValueError(f"Unsupported temporal aggregation '{aggregation}'") + return aggregate_fn(dim="time", skipna=True) + + +def _resolve_zarr_path(resource: PublishedResource, dataset: dict[str, Any]) -> Path | None: + if resource.path: + resource_path = Path(resource.path) + if resource_path.exists(): + return resource_path + + native_output = resource.metadata.get("native_output_file") + if isinstance(native_output, str): + native_path = Path(native_output) + if native_path.exists(): + return native_path + + return get_zarr_path(dataset) diff --git a/src/eo_api/shared/api_errors.py b/src/eo_api/shared/api_errors.py new file mode 100644 index 0000000..cde2d7d --- /dev/null +++ b/src/eo_api/shared/api_errors.py @@ -0,0 +1,86 @@ +"""Shared typed API error helpers.""" + +from __future__ import annotations + +from typing import NoReturn + +from fastapi import HTTPException +from pydantic import BaseModel + + +class ApiErrorResponse(BaseModel): + """Stable API error envelope.""" + + error: str + error_code: str + message: str + resource_id: str | None = None + process_id: str | None = None + 
def api_error(
    *,
    error: str,
    error_code: str,
    message: str,
    resource_id: str | None = None,
    process_id: str | None = None,
    job_id: str | None = None,
    run_id: str | None = None,
    schedule_id: str | None = None,
    status: str | None = None,
    failed_component: str | None = None,
    failed_component_version: str | None = None,
) -> dict[str, str]:
    """Build a stable API error envelope, omitting any unset optional fields."""
    envelope = ApiErrorResponse(
        error=error,
        error_code=error_code,
        message=message,
        resource_id=resource_id,
        process_id=process_id,
        job_id=job_id,
        run_id=run_id,
        schedule_id=schedule_id,
        status=status,
        failed_component=failed_component,
        failed_component_version=failed_component_version,
    )
    # exclude_none keeps the wire format minimal and stable for clients.
    return envelope.model_dump(exclude_none=True)
def get_org_unit_subtree_geojson(client: DHIS2Client, uid: str) -> dict[str, Any]:
    """Fetch a subtree of organisation units as GeoJSON via the DHIS2 client."""
    subtree = client.get_org_unit_subtree_geojson(uid)
    # The client is untyped here; cast pins the expected GeoJSON dict shape.
    return cast(dict[str, Any], subtree)
""" import logging +import os +import sys +from pathlib import Path + from dotenv import load_dotenv # noqa: E402 # -- Load .env (must happen before pygeoapi reads PYGEOAPI_CONFIG) ------------ @@ -19,3 +23,46 @@ handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s - %(message)s")) eo_logger.addHandler(handler) eo_logger.propagate = False + + +def _configure_proj_data() -> None: + """Point PROJ at the active environment's data files.""" + candidates: list[Path] = [] + for sys_path in sys.path: + if not sys_path: + continue + candidates.append(Path(sys_path) / "rasterio" / "proj_data") + + try: + from pyproj import datadir + + pyproj_data_dir = datadir.get_data_dir() + if pyproj_data_dir: + candidates.append(Path(pyproj_data_dir)) + except Exception: + pass + + for candidate in candidates: + if candidate.exists(): + proj_path = str(candidate) + os.environ["PROJ_LIB"] = proj_path + os.environ["PROJ_DATA"] = proj_path + eo_logger.info("Configured PROJ data directory: %s", proj_path) + return + + eo_logger.warning("Could not locate a compatible PROJ data directory in the active environment") + + +def _configure_generated_pygeoapi() -> None: + """Materialize publication-driven pygeoapi documents before pygeoapi import.""" + from eo_api.publications.pygeoapi import write_generated_pygeoapi_documents + + server_url = os.environ.get("PYGEOAPI_SERVER_URL", "http://127.0.0.1:8000/pygeoapi") + config_path, openapi_path = write_generated_pygeoapi_documents(server_url=server_url) + os.environ["PYGEOAPI_CONFIG"] = str(config_path) + os.environ["PYGEOAPI_OPENAPI"] = str(openapi_path) + eo_logger.info("Configured generated pygeoapi documents: %s %s", config_path, openapi_path) + + +_configure_proj_data() +_configure_generated_pygeoapi() diff --git a/src/eo_api/system/__init__.py b/src/eo_api/system/__init__.py index 00e7846..865a013 100644 --- a/src/eo_api/system/__init__.py +++ b/src/eo_api/system/__init__.py @@ -1 +1,4 @@ -from . 
import routes, schemas \ No newline at end of file +from . import routes as routes +from . import schemas as schemas + +__all__ = ["routes", "schemas"] diff --git a/src/eo_api/system/routes.py b/src/eo_api/system/routes.py index 2639ea3..1c41b1f 100644 --- a/src/eo_api/system/routes.py +++ b/src/eo_api/system/routes.py @@ -4,7 +4,6 @@ from importlib.metadata import version from fastapi import APIRouter, Request -from fastapi.responses import RedirectResponse from .schemas import AppInfo, HealthStatus, Link, RootResponse, Status diff --git a/src/eo_api/workflows/__init__.py b/src/eo_api/workflows/__init__.py new file mode 100644 index 0000000..23f4ad6 --- /dev/null +++ b/src/eo_api/workflows/__init__.py @@ -0,0 +1,6 @@ +"""Workflow APIs for generic gridded-data to DHIS2 pipelines.""" + +from . import routes as routes +from . import services as services + +__all__ = ["routes", "services"] diff --git a/src/eo_api/workflows/routes.py b/src/eo_api/workflows/routes.py new file mode 100644 index 0000000..450a017 --- /dev/null +++ b/src/eo_api/workflows/routes.py @@ -0,0 +1,398 @@ +"""API routes for workflow discovery, execution, and native job access.""" + +from typing import Any + +from fastapi import APIRouter, HTTPException, Request + +from ..publications.capabilities import evaluate_publication_serving +from ..publications.schemas import PublishedResourceExposure +from ..publications.services import collection_id_for_resource, get_published_resource +from ..shared.api_errors import api_error +from .schemas import ( + WorkflowAssemblyExecuteRequest, + WorkflowCatalogItem, + WorkflowCatalogResponse, + WorkflowExecuteEnvelopeRequest, + WorkflowExecuteResponse, + WorkflowJobCleanupResponse, + WorkflowJobListResponse, + WorkflowJobRecord, + WorkflowJobStatus, + WorkflowSchedule, + WorkflowScheduleCreateRequest, + WorkflowScheduleTriggerRequest, + WorkflowScheduleTriggerResponse, + WorkflowValidateRequest, + WorkflowValidateResponse, + WorkflowValidateStep, +) +from 
def _workflow_publication_summary(workflow: Any) -> dict[str, Any]:
    """Flatten a workflow's publication contract into catalog/validation fields."""
    publication = workflow.publication
    serving = evaluate_publication_serving(
        kind=publication.intent,
        exposure=publication.exposure,
        asset_format=publication.asset_format,
    )

    def _binding(ref: Any) -> dict[str, Any]:
        # Step-output references are serialised as plain dicts for JSON output.
        return {"from_step": ref.from_step, "output": ref.output}

    asset_binding = _binding(publication.asset) if publication.asset is not None else None
    bound_inputs = {name: _binding(ref) for name, ref in publication.inputs.items()}

    is_publishable = publication.publishable
    return {
        "publication_publishable": is_publishable,
        # Intent/exposure are only meaningful for publishable workflows.
        "publication_intent": str(publication.intent) if is_publishable else None,
        "publication_exposure": str(publication.exposure) if is_publishable else None,
        "publication_asset_format": publication.asset_format,
        "publication_asset_binding": asset_binding,
        "publication_inputs": bound_inputs,
        "serving_supported": serving.supported,
        "serving_asset_format": serving.asset_format,
        "serving_targets": list(serving.served_by),
        "serving_error": serving.error,
    }
@router.get("/jobs/{job_id}", response_model=WorkflowJobRecord)
def get_workflow_job(job_id: str, request: Request) -> WorkflowJobRecord:
    """Fetch one persisted workflow job, decorated with navigation links.

    Always links the job to its own resource plus its result and trace
    endpoints; when the job produced an OGC-exposed publication, collection
    (and, if present, analytics) links are appended as well.

    Raises:
        HTTPException: 404 with a typed error envelope for an unknown job id.
    """
    job = get_job(job_id)
    if job is None:
        raise HTTPException(
            status_code=404,
            detail=api_error(
                error="job_not_found",
                error_code="JOB_NOT_FOUND",
                message=f"Unknown job_id '{job_id}'",
                job_id=job_id,
            ),
        )
    # Core links derived from this router's own named routes.
    links: list[dict[str, str]] = [
        {"rel": "self", "href": str(request.url_for("get_workflow_job", job_id=job_id))},
        {"rel": "result", "href": str(request.url_for("get_workflow_job_result", job_id=job_id))},
        {"rel": "trace", "href": str(request.url_for("get_workflow_job_trace", job_id=job_id))},
    ]
    # Jobs publish derived outputs under the well-known id
    # 'workflow-output-<job_id>'; only OGC-exposed publications get
    # navigable collection links.
    publication = get_published_resource(f"workflow-output-{job_id}")
    if publication is not None and publication.exposure == PublishedResourceExposure.OGC:
        collection_id = collection_id_for_resource(publication)
        links.append(
            {
                "rel": "collection",
                "href": f"{str(request.base_url).rstrip('/')}/pygeoapi/collections/{collection_id}",
            }
        )
        analytics_link = next((link for link in publication.links if link.get("rel") == "analytics"), None)
        if analytics_link is not None:
            links.append(
                {
                    "rel": "analytics",
                    # Publication links are stored relative to the API root;
                    # prefix the request's base URL to make them absolute.
                    "href": f"{str(request.base_url).rstrip('/')}{analytics_link['href']}",
                }
            )
    # model_copy keeps the stored record immutable; links exist only in the response.
    return job.model_copy(update={"links": links})
@router.post("/schedules", response_model=WorkflowSchedule)
def create_workflow_schedule(payload: WorkflowScheduleCreateRequest) -> WorkflowSchedule:
    """Create a recurring workflow schedule contract.

    Invalid schedule definitions surface as a 422 with a typed error envelope.
    """
    try:
        schedule = create_schedule(payload)
    except ValueError as exc:
        detail = api_error(
            error="schedule_invalid",
            error_code="SCHEDULE_INVALID",
            message=str(exc),
        )
        raise HTTPException(status_code=422, detail=detail) from exc
    return schedule
@router.post("/schedules/{schedule_id}/trigger", response_model=WorkflowScheduleTriggerResponse)
def trigger_workflow_schedule(
    schedule_id: str,
    payload: WorkflowScheduleTriggerRequest | None = None,
) -> WorkflowScheduleTriggerResponse:
    """Trigger one persisted schedule immediately.

    Maps service-layer ValueErrors onto 404 (unknown schedule) or 422
    (invalid trigger request) typed error envelopes.
    """
    execution_time = payload.execution_time if payload is not None else None
    try:
        trigger_response, _result = trigger_schedule(
            schedule_id=schedule_id,
            execution_time=execution_time,
        )
    except ValueError as exc:
        message = str(exc)
        # The service layer signals "not found" only via its message text,
        # so sniff it to pick the HTTP status and error code.
        if "Unknown schedule_id" in message:
            status_code, error, error_code = 404, "schedule_not_found", "SCHEDULE_NOT_FOUND"
        else:
            status_code, error, error_code = 422, "schedule_trigger_failed", "SCHEDULE_TRIGGER_INVALID"
        raise HTTPException(
            status_code=status_code,
            detail=api_error(
                error=error,
                error_code=error_code,
                message=message,
                schedule_id=schedule_id,
            ),
        ) from exc
    return trigger_response
@router.post("/validate", response_model=WorkflowValidateResponse)
def validate_workflow_assembly(payload: WorkflowValidateRequest) -> WorkflowValidateResponse:
    """Validate workflow assembly without executing any component.

    Accepts either an inline workflow definition or a catalog workflow id,
    resolves the step graph against the (optional) flat request payload, and
    reports structural errors and mapping warnings in the response body
    instead of raising HTTP errors.
    """
    warnings: list[str] = []
    errors: list[str] = []

    try:
        if payload.workflow is not None:
            workflow = payload.workflow
        else:
            # Empty-string fallback lets the loader produce its own
            # "unknown workflow" error instead of failing on None.
            workflow = load_workflow_definition(payload.workflow_id or "")
    except ValueError as exc:
        # The definition could not be resolved at all; return a minimal
        # invalid response so clients always get the same response schema.
        return WorkflowValidateResponse(
            valid=False,
            workflow_id=payload.workflow_id or "unknown",
            workflow_version=0,
            step_count=0,
            components=[],
            publication_publishable=False,
            warnings=warnings,
            errors=[str(exc)],
        )

    request_params: dict[str, object] = {}
    if payload.request is not None:
        # Normalisation is used here only for its warnings; validation runs
        # off the raw flat payload.
        _request, map_warnings = normalize_simple_request(payload.request)
        warnings.extend(map_warnings)
        request_params = payload.request.model_dump(exclude_none=True)

    try:
        resolved_steps = [
            WorkflowValidateStep.model_validate(step)
            for step in validate_workflow_steps(workflow=workflow, request_params=request_params)
        ]
    except ValueError as exc:
        # Step-resolution failures are reported, not raised.
        errors.append(str(exc))
        resolved_steps = []

    return WorkflowValidateResponse(
        valid=not errors,
        workflow_id=workflow.workflow_id,
        workflow_version=workflow.version,
        step_count=len(workflow.steps),
        components=[step.component for step in workflow.steps],
        **_workflow_publication_summary(workflow),
        resolved_steps=resolved_steps,
        warnings=warnings,
        errors=errors,
    )
"""Schemas for generic DHIS2 workflow execution."""

from enum import StrEnum
from typing import Any

from pydantic import BaseModel, Field, model_validator

from .services.definitions import WorkflowDefinition


class FeatureSourceType(StrEnum):
    """Supported feature source backends."""

    GEOJSON_FILE = "geojson_file"  # local GeoJSON file on disk
    DHIS2_LEVEL = "dhis2_level"  # all org units at one DHIS2 hierarchy level
    DHIS2_IDS = "dhis2_ids"  # explicit list of DHIS2 org-unit UIDs


class AggregationMethod(StrEnum):
    """Supported numeric aggregation methods."""

    MEAN = "mean"
    SUM = "sum"
    MIN = "min"
    MAX = "max"


class PeriodType(StrEnum):
    """Supported temporal period types."""

    HOURLY = "hourly"
    DAILY = "daily"
    MONTHLY = "monthly"
    YEARLY = "yearly"


class FeatureSourceConfig(BaseModel):
    """How to fetch features for spatial aggregation.

    Exactly one backend-specific field is required depending on
    ``source_type``; see ``validate_by_source``.
    """

    source_type: FeatureSourceType
    geojson_path: str | None = None  # required for geojson_file
    dhis2_level: int | None = None  # required for dhis2_level
    dhis2_ids: list[str] | None = None  # required (non-empty) for dhis2_ids
    dhis2_parent: str | None = None  # optional; not validated here
    feature_id_property: str = "id"  # feature property used as the org-unit key

    @model_validator(mode="after")
    def validate_by_source(self) -> "FeatureSourceConfig":
        """Enforce required fields per source backend."""
        if self.source_type == FeatureSourceType.GEOJSON_FILE and not self.geojson_path:
            raise ValueError("geojson_path is required when source_type='geojson_file'")
        if self.source_type == FeatureSourceType.DHIS2_LEVEL and self.dhis2_level is None:
            raise ValueError("dhis2_level is required when source_type='dhis2_level'")
        if self.source_type == FeatureSourceType.DHIS2_IDS and not self.dhis2_ids:
            raise ValueError("dhis2_ids is required when source_type='dhis2_ids'")
        return self


class TemporalAggregationConfig(BaseModel):
    """Temporal rollup config."""

    target_period_type: PeriodType
    method: AggregationMethod = AggregationMethod.SUM


class SpatialAggregationConfig(BaseModel):
    """Spatial aggregation config."""

    method: AggregationMethod = AggregationMethod.MEAN
AggregationMethod.MEAN + + +class Dhis2DataValueSetConfig(BaseModel): + """Mapping from aggregate outputs to DHIS2 DataValueSet fields.""" + + data_element_uid: str + category_option_combo_uid: str = "HllvX50cXC0" + attribute_option_combo_uid: str = "HllvX50cXC0" + data_set_uid: str | None = None + org_unit_property: str = "id" + stored_by: str | None = None + + +class WorkflowExecuteRequest(BaseModel): + """End-to-end workflow request.""" + + dataset_id: str + start: str + end: str + publish: bool = False + overwrite: bool = False + country_code: str | None = None + feature_source: FeatureSourceConfig + temporal_aggregation: TemporalAggregationConfig + spatial_aggregation: SpatialAggregationConfig = Field(default_factory=SpatialAggregationConfig) + dhis2: Dhis2DataValueSetConfig + + +class ComponentRun(BaseModel): + """Execution metadata for one workflow component.""" + + component: str + status: str + started_at: str + ended_at: str + duration_ms: int + inputs: dict[str, Any] + outputs: dict[str, Any] | None = None + error: str | None = None + + +class WorkflowExecuteResponse(BaseModel): + """Workflow execution response.""" + + status: str + run_id: str + workflow_id: str + workflow_version: int + dataset_id: str + outputs: dict[str, Any] = Field(default_factory=dict) + primary_output_name: str | None = None + bbox: list[float] | None = None + feature_count: int | None = None + value_count: int | None = None + output_file: str | None = None + run_log_file: str + data_value_set: dict[str, Any] | None = None + component_runs: list[ComponentRun] + component_run_details_included: bool = False + component_run_details_available: bool = True + + +class WorkflowJobStatus(StrEnum): + """Native workflow job lifecycle states.""" + + ACCEPTED = "accepted" + RUNNING = "running" + SUCCESSFUL = "successful" + FAILED = "failed" + DISMISSED = "dismissed" + + +class WorkflowJobOrchestrationStep(BaseModel): + """Compact summary of one workflow step.""" + + id: str + component: str 
+ version: str + execution_mode: str | None = None + inputs: dict[str, dict[str, str]] = Field(default_factory=dict) + + +class WorkflowJobOrchestration(BaseModel): + """Compact summary of workflow orchestration.""" + + definition_source: str + step_count: int + components: list[str] + steps: list[WorkflowJobOrchestrationStep] + + +class WorkflowJobRecord(BaseModel): + """Persisted workflow job metadata.""" + + job_id: str + process_id: str + workflow_id: str + workflow_version: int + dataset_id: str + status: WorkflowJobStatus + created_at: str + updated_at: str + request: dict[str, Any] + orchestration: WorkflowJobOrchestration + run_log_file: str | None = None + output_file: str | None = None + error: str | None = None + error_code: str | None = None + failed_component: str | None = None + failed_component_version: str | None = None + trigger_type: str = "on_demand" + schedule_id: str | None = None + idempotency_key: str | None = None + links: list[dict[str, Any]] = Field(default_factory=list) + + +class WorkflowJobStoredRecord(WorkflowJobRecord): + """Persisted workflow job metadata including internal result payload.""" + + run_id: str + result: dict[str, Any] | None = None + + +class WorkflowJobListResponse(BaseModel): + """List of persisted workflow jobs.""" + + jobs: list[WorkflowJobRecord] + + +class WorkflowJobCleanupCandidate(BaseModel): + """One terminal job selected by retention policy.""" + + job_id: str + status: WorkflowJobStatus + created_at: str + workflow_id: str + dataset_id: str + + +class WorkflowJobCleanupResponse(BaseModel): + """Result of applying or previewing a workflow job retention policy.""" + + dry_run: bool + keep_latest: int | None = None + older_than_hours: int | None = None + candidate_count: int + deleted_count: int + candidates: list[WorkflowJobCleanupCandidate] + deleted_job_ids: list[str] + + +class WorkflowCatalogItem(BaseModel): + """Discoverable workflow definition summary.""" + + workflow_id: str + version: int + 
class WorkflowCatalogResponse(BaseModel):
    """List of allowlisted workflow definitions."""

    workflows: list[WorkflowCatalogItem]


class WorkflowRequest(BaseModel):
    """Public flat workflow request payload.

    Requires a time window (start_date/end_date or start_year/end_year) and an
    org-unit selection (org_unit_level or org_unit_ids); see the validator.
    """

    workflow_id: str = "dhis2_datavalue_set_v1"
    dataset_id: str
    start_date: str | None = None
    end_date: str | None = None
    start_year: int | None = None
    end_year: int | None = None
    org_unit_level: int | None = None
    org_unit_ids: list[str] | None = None
    data_element: str  # DHIS2 data element UID
    temporal_resolution: PeriodType = PeriodType.MONTHLY
    temporal_reducer: AggregationMethod = AggregationMethod.SUM
    spatial_reducer: AggregationMethod = AggregationMethod.MEAN
    publish: bool = False
    overwrite: bool = False
    dry_run: bool = True
    feature_id_property: str = "id"
    stage: str | None = None
    flavor: str | None = None
    country_code: str | None = None
    output_format: str | None = None
    include_component_run_details: bool = False

    @model_validator(mode="after")
    def validate_time_window(self) -> "WorkflowRequest":
        """Require a date range or year range, plus an org-unit selection."""
        # Both start and end must be present for a date window to count.
        has_dates = bool(self.start_date and self.end_date)
        has_years = self.start_year is not None and self.end_year is not None
        if not has_dates and not has_years:
            raise ValueError("Provide either start_date/end_date or start_year/end_year")
        # Org-unit selection is also mandatory, despite the validator's name.
        if self.org_unit_level is None and not self.org_unit_ids:
            raise ValueError("Provide org_unit_level or org_unit_ids")
        return self
WorkflowExecuteEnvelopeRequest(BaseModel): + """Envelope for workflow execution input payload.""" + + request: WorkflowRequest + + +class WorkflowAssemblyExecuteRequest(BaseModel): + """Inline workflow assembly + wrapped public workflow input.""" + + request: WorkflowRequest + workflow: WorkflowDefinition + + +class WorkflowValidateRequest(BaseModel): + """Validation request for discovered or inline workflow assembly.""" + + workflow_id: str | None = None + workflow: WorkflowDefinition | None = None + request: WorkflowRequest | None = None + + @model_validator(mode="after") + def validate_workflow_source(self) -> "WorkflowValidateRequest": + """Require exactly one workflow source.""" + if (self.workflow_id is None and self.workflow is None) or ( + self.workflow_id is not None and self.workflow is not None + ): + raise ValueError("Provide exactly one of workflow_id or workflow") + return self + + +class WorkflowValidateStep(BaseModel): + """Resolved workflow step metadata from validation.""" + + index: int + id: str | None = None + component: str + version: str + resolved_config: dict[str, Any] + resolved_inputs: dict[str, dict[str, str]] = Field(default_factory=dict) + + +class JobRetentionPolicy(BaseModel): + """Retention policy metadata for scheduled runs.""" + + keep_latest: int | None = Field(default=None, ge=0) + older_than_hours: int | None = Field(default=None, ge=0) + automatic_cleanup: bool = True + + +class WorkflowSchedule(BaseModel): + """Recurring workflow execution contract.""" + + schedule_id: str + workflow_id: str + cron_expression: str + request: WorkflowRequest + enabled: bool = True + idempotency_key_template: str = "{workflow_id}:{schedule_id}:{date}" + retention_policy: JobRetentionPolicy = Field(default_factory=JobRetentionPolicy) + created_at: str + updated_at: str + last_triggered_at: str | None = None + + +class WorkflowScheduleCreateRequest(BaseModel): + """Create a recurring workflow execution schedule.""" + + workflow_id: str | None = 
None + cron_expression: str + request: WorkflowRequest + enabled: bool = True + idempotency_key_template: str = "{workflow_id}:{schedule_id}:{date}" + retention_policy: JobRetentionPolicy = Field(default_factory=JobRetentionPolicy) + + +class WorkflowScheduleTriggerRequest(BaseModel): + """Trigger one schedule execution.""" + + execution_time: str | None = None + + +class WorkflowScheduleTriggerResponse(BaseModel): + """Result of triggering a schedule execution.""" + + schedule_id: str + workflow_id: str + job_id: str + status: WorkflowJobStatus + idempotency_key: str + reused_existing_job: bool = False + + +class WorkflowValidateResponse(BaseModel): + """Validation result for a workflow assembly.""" + + valid: bool + workflow_id: str + workflow_version: int + step_count: int + components: list[str] + publication_publishable: bool = False + publication_intent: str | None = None + publication_exposure: str | None = None + publication_asset_format: str | None = None + publication_asset_binding: dict[str, str] | None = None + publication_inputs: dict[str, dict[str, str]] = Field(default_factory=dict) + serving_supported: bool = False + serving_asset_format: str | None = None + serving_targets: list[str] = Field(default_factory=list) + serving_error: str | None = None + resolved_steps: list[WorkflowValidateStep] = Field(default_factory=list) + warnings: list[str] = Field(default_factory=list) + errors: list[str] = Field(default_factory=list) diff --git a/src/eo_api/workflows/services/__init__.py b/src/eo_api/workflows/services/__init__.py new file mode 100644 index 0000000..4e6122f --- /dev/null +++ b/src/eo_api/workflows/services/__init__.py @@ -0,0 +1 @@ +"""Workflow service components.""" diff --git a/src/eo_api/workflows/services/datavalueset.py b/src/eo_api/workflows/services/datavalueset.py new file mode 100644 index 0000000..3768872 --- /dev/null +++ b/src/eo_api/workflows/services/datavalueset.py @@ -0,0 +1,65 @@ +"""DHIS2 DataValueSet builder component.""" + 
+from __future__ import annotations + +import datetime as dt +import json +from typing import Any + +import numpy as np + +from ...data_manager.services.downloader import DOWNLOAD_DIR +from ..schemas import Dhis2DataValueSetConfig, PeriodType + + +def build_data_value_set( + records: list[dict[str, Any]], + *, + dataset_id: str, + period_type: PeriodType, + config: Dhis2DataValueSetConfig, +) -> tuple[dict[str, Any], str]: + """Build and serialize a DHIS2-compatible DataValueSet JSON payload.""" + data_values: list[dict[str, Any]] = [] + for record in records: + period = _format_period(record["time"], period_type) + data_values.append( + { + "dataElement": config.data_element_uid, + "period": period, + "orgUnit": record["org_unit"], + "categoryOptionCombo": config.category_option_combo_uid, + "attributeOptionCombo": config.attribute_option_combo_uid, + "value": str(record["value"]), + } + ) + + payload: dict[str, Any] = {"dataValues": data_values} + if config.data_set_uid: + payload["dataSet"] = config.data_set_uid + if config.stored_by: + payload["storedBy"] = config.stored_by + output_file = _write_data_value_set(payload, dataset_id) + return payload, output_file + + +def _write_data_value_set(payload: dict[str, Any], dataset_id: str) -> str: + """Persist DataValueSet payload and return file path.""" + DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True) + now = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ") + path = DOWNLOAD_DIR / f"{dataset_id}_datavalueset_{now}.json" + path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + return str(path) + + +def _format_period(time_value: Any, period_type: PeriodType) -> str: + ts = np.datetime64(time_value) + s = np.datetime_as_string(ts, unit="D") + year, month, day = s.split("-") + if period_type == PeriodType.DAILY: + return f"{year}{month}{day}" + if period_type == PeriodType.MONTHLY: + return f"{year}{month}" + if period_type == PeriodType.YEARLY: + return year + return s.replace("-", "") diff 
--git a/src/eo_api/workflows/services/definitions.py b/src/eo_api/workflows/services/definitions.py new file mode 100644 index 0000000..2e17779 --- /dev/null +++ b/src/eo_api/workflows/services/definitions.py @@ -0,0 +1,321 @@ +"""Declarative workflow definition loading and validation.""" + +from __future__ import annotations + +from collections.abc import Mapping +from pathlib import Path +from typing import Any, Literal + +import yaml +from pydantic import AliasChoices, BaseModel, Field, model_validator + +from ...publications.capabilities import evaluate_publication_serving +from ...publications.schemas import PublishedResourceExposure, PublishedResourceKind + +SCRIPT_DIR = Path(__file__).parent.resolve() +WORKFLOWS_DIR = SCRIPT_DIR.parent.parent.parent.parent / "data" / "workflows" +DEFAULT_WORKFLOW_ID = "dhis2_datavalue_set_v1" + + +class WorkflowStep(BaseModel): + """One component step in a declarative workflow definition.""" + + id: str | None = None + component: str + version: str = "v1" + config: dict[str, Any] = Field(default_factory=dict) + inputs: dict[str, "WorkflowStepInput"] = Field(default_factory=dict) + + @model_validator(mode="after") + def validate_component_version(self) -> "WorkflowStep": + """Ensure component@version exists in the registered component catalog.""" + supported_versions = _supported_component_versions(self.component) + if self.version not in supported_versions: + known = ", ".join(sorted(supported_versions)) or "" + raise ValueError( + f"Unsupported component version '{self.component}@{self.version}'. 
class WorkflowPublicationPolicy(BaseModel):
    """Publication policy for workflow outputs.

    Accepts legacy YAML aliases (``enabled``, ``publish_strategy``,
    ``resource_kind``) via AliasChoices for backward compatibility.
    """

    publishable: bool = Field(default=False, validation_alias=AliasChoices("publishable", "enabled"))
    strategy: Literal["on_success", "manual"] = Field(
        default="on_success",
        validation_alias=AliasChoices("strategy", "publish_strategy"),
    )
    intent: PublishedResourceKind = Field(
        default=PublishedResourceKind.FEATURE_COLLECTION,
        validation_alias=AliasChoices("intent", "resource_kind"),
    )
    exposure: PublishedResourceExposure = PublishedResourceExposure.REGISTRY_ONLY
    required_output_file_suffixes: list[str] = Field(default_factory=list)
    asset: WorkflowStepInput | None = None
    asset_format: str | None = None
    inputs: dict[str, WorkflowStepInput] = Field(default_factory=dict)

    @model_validator(mode="after")
    def validate_publication_policy(self) -> "WorkflowPublicationPolicy":
        """Normalize workflow publication policy.

        Suffixes get a leading dot AND are lowercased so they match the
        engine's comparison against ``Path(...).suffix.lower()``; previously a
        mixed-case configured suffix (e.g. ".JSON") could never match. The
        asset format is trimmed/lowercased, empty strings collapsed to None.
        """
        normalized_suffixes: list[str] = []
        for suffix in self.required_output_file_suffixes:
            dotted = suffix if suffix.startswith(".") else f".{suffix}"
            normalized_suffixes.append(dotted.lower())
        self.required_output_file_suffixes = normalized_suffixes
        if self.asset_format is not None:
            self.asset_format = self.asset_format.strip().lower() or None
        return self
list[WorkflowStep] + outputs: dict[str, WorkflowOutputBinding] = Field(default_factory=dict) + + @model_validator(mode="after") + def validate_steps(self) -> "WorkflowDefinition": + """Validate component compatibility and exported workflow outputs.""" + if not self.steps: + raise ValueError("Workflow steps cannot be empty") + _assign_step_ids(self.steps) + available_outputs: dict[str, set[str]] = {} + latest_producer_for_output: dict[str, str] = {} + for step in self.steps: + if step.id is None: + raise ValueError(f"Workflow step '{step.component}' is missing an id") + + resolved_inputs = _normalize_step_inputs( + step=step, + available_outputs=available_outputs, + latest_producer_for_output=latest_producer_for_output, + ) + step.inputs = resolved_inputs + + outputs = _component_outputs(step.component, step.version) + available_outputs[step.id] = outputs + for output_name in outputs: + latest_producer_for_output[output_name] = step.id + + if not self.outputs: + raise ValueError("Workflow must declare at least one exported output") + + _validate_workflow_outputs(bindings=self.outputs, available_outputs=available_outputs, owner="Workflow outputs") + if self.publication.publishable: + if self.publication.asset is None and not self.publication.inputs: + raise ValueError("Publishable workflows must declare a publication asset or publication inputs") + if self.publication.asset is not None: + _validate_workflow_outputs( + bindings={"asset": self.publication.asset}, + available_outputs=available_outputs, + owner="Workflow publication asset", + ) + if self.publication.inputs: + _validate_workflow_outputs( + bindings=self.publication.inputs, + available_outputs=available_outputs, + owner="Workflow publication", + ) + capability = evaluate_publication_serving( + kind=self.publication.intent, + exposure=self.publication.exposure, + asset_format=self.publication.asset_format, + ) + if not capability.supported: + raise ValueError(capability.error or "Unsupported publication 
def load_workflow_definition(
    workflow_id: str = DEFAULT_WORKFLOW_ID,
    *,
    path: Path | None = None,
) -> WorkflowDefinition:
    """Load and validate workflow definition from discovered YAML files.

    When *path* is given the discovery scan and the declared-vs-requested
    workflow_id cross-check are skipped and the file is loaded directly.
    Raises ValueError for unknown ids, missing/empty files, and invalid
    definitions.
    """
    if path is not None:
        workflow_file = path
    else:
        workflow_files = _discover_workflow_files()
        workflow_file_or_none = workflow_files.get(workflow_id)
        if workflow_file_or_none is None:
            known = ", ".join(sorted(workflow_files))
            raise ValueError(f"Unknown workflow_id '{workflow_id}'. Allowed values: {known}")
        workflow_file = workflow_file_or_none

    if not workflow_file.exists():
        raise ValueError(f"Workflow definition file not found: {workflow_file}")
    with open(workflow_file, encoding="utf-8") as f:
        raw = yaml.safe_load(f)
    if raw is None:
        raise ValueError(f"Workflow definition file is empty: {workflow_file}")
    definition = WorkflowDefinition.model_validate(raw)
    # Only cross-check the declared id on the discovery path; a direct path
    # load is taken at face value.
    if path is None and definition.workflow_id != workflow_id:
        raise ValueError(
            f"workflow_id mismatch: requested '{workflow_id}' but definition declares '{definition.workflow_id}'"
        )
    return definition


def list_workflow_definitions() -> list[WorkflowDefinition]:
    """Load and return all discovered workflow definitions.

    Discovery is performed once and each definition is loaded via its already
    discovered file path, instead of re-scanning (and re-parsing every YAML
    file in) the workflows directory once per workflow. The skipped id
    cross-check cannot fail here: discovery keys files by their declared
    workflow_id.
    """
    workflow_files = _discover_workflow_files()
    return [
        load_workflow_definition(workflow_id, path=workflow_files[workflow_id])
        for workflow_id in sorted(workflow_files)
    ]


def _discover_workflow_files() -> dict[str, Path]:
    """Discover and validate workflow IDs from all YAML files in workflows folder.

    Returns a mapping of declared workflow_id -> file path. Raises ValueError
    for a missing directory, empty/non-mapping files, missing ids, or the
    same workflow_id declared in two files.
    """
    if not WORKFLOWS_DIR.is_dir():
        raise ValueError(f"Workflow directory not found: {WORKFLOWS_DIR}")

    discovered: dict[str, Path] = {}
    # Glob covers both .yml and .yaml extensions.
    for workflow_file in sorted(WORKFLOWS_DIR.glob("*.y*ml")):
        with open(workflow_file, encoding="utf-8") as f:
            raw = yaml.safe_load(f)
        if raw is None:
            raise ValueError(f"Workflow definition file is empty: {workflow_file}")
        if not isinstance(raw, dict):
            raise ValueError(f"Workflow definition must be a mapping/object: {workflow_file}")

        workflow_id = raw.get("workflow_id")
        if not isinstance(workflow_id, str) or not workflow_id:
            raise ValueError(f"Missing/invalid workflow_id in: {workflow_file}")

        existing = discovered.get(workflow_id)
        if existing is not None:
            raise ValueError(f"Duplicate workflow_id '{workflow_id}' in files: {existing.name}, {workflow_file.name}")
        discovered[workflow_id] = workflow_file

    return discovered


def _assign_step_ids(steps: list[WorkflowStep]) -> None:
    """Assign default ids to anonymous steps and reject duplicate ids.

    The first anonymous step for a component gets the component name itself;
    subsequent ones get '<component>_2', '<component>_3', ... Mutates *steps*
    in place.
    """
    seen_ids: set[str] = set()
    component_counts: dict[str, int] = {}
    for step in steps:
        if step.id is None:
            count = component_counts.get(step.component, 0) + 1
            component_counts[step.component] = count
            step.id = step.component if count == 1 else f"{step.component}_{count}"
        if step.id in seen_ids:
            raise ValueError(f"Duplicate workflow step id '{step.id}'")
        seen_ids.add(step.id)
ValueError(f"Component '{step.component}' declares unsupported inputs: {unexpected}") + + for input_name, ref in declared_inputs.items(): + available_for_step = available_outputs.get(ref.from_step) + if available_for_step is None: + raise ValueError( + f"Component '{step.component}' references unknown upstream " + f"step '{ref.from_step}' for input '{input_name}'" + ) + if ref.output not in available_for_step: + raise ValueError( + f"Component '{step.component}' input '{input_name}' references " + f"missing output '{ref.output}' from step '{ref.from_step}'" + ) + + return declared_inputs + + +def _validate_workflow_outputs( + *, + bindings: Mapping[str, WorkflowStepInput | WorkflowOutputBinding], + available_outputs: dict[str, set[str]], + owner: str, +) -> None: + if not bindings: + raise ValueError(f"{owner} cannot be empty") + for output_name, ref in bindings.items(): + available_for_step = available_outputs.get(ref.from_step) + if available_for_step is None: + raise ValueError(f"{owner} reference '{output_name}' points to unknown step '{ref.from_step}'") + if ref.output not in available_for_step: + raise ValueError( + f"{owner} reference '{output_name}' points to missing output '{ref.output}' from step '{ref.from_step}'" + ) + + +def _component_definition(component: str, version: str) -> tuple[set[str], set[str], set[str]]: + from ...components import services as component_services + + definition = component_services.component_registry().get(f"{component}@{version}") + if definition is None: + raise ValueError(f"Unsupported component version '{component}@{version}'. 
Supported versions: ") + return ( + set(definition.workflow_inputs_required), + set(definition.workflow_inputs_optional), + set(definition.outputs), + ) + + +def _supported_component_versions(component: str) -> set[str]: + from ...components import services as component_services + + versions: set[str] = set() + for key in component_services.component_registry(): + name, _, version = key.partition("@") + if name == component and version: + versions.add(version) + return versions + + +def _component_required_inputs(component: str, version: str) -> set[str]: + required, _, _ = _component_definition(component, version) + return required + + +def _component_optional_inputs(component: str, version: str) -> set[str]: + _, optional, _ = _component_definition(component, version) + return optional + + +def _component_outputs(component: str, version: str) -> set[str]: + _, _, outputs = _component_definition(component, version) + return outputs diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py new file mode 100644 index 0000000..5a63314 --- /dev/null +++ b/src/eo_api/workflows/services/engine.py @@ -0,0 +1,576 @@ +"""Workflow orchestration engine for gridded-data pipelines.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Literal + +from fastapi import HTTPException + +from ...components import services as component_services +from ...data_registry.services.datasets import get_dataset +from ...publications.services import register_workflow_output_publication +from ...shared.api_errors import api_error +from ..schemas import WorkflowExecuteRequest, WorkflowExecuteResponse, WorkflowJobStatus +from .definitions import ( + WorkflowDefinition, + WorkflowOutputBinding, + WorkflowPublicationPolicy, + WorkflowStep, + load_workflow_definition, +) +from .job_store import initialize_job, mark_job_failed, mark_job_running, mark_job_success +from 
class WorkflowComponentError(RuntimeError):
    """Typed component failure with stable error code and component context."""

    def __init__(
        self,
        *,
        error_code: str,
        message: str,
        component: str,
        component_version: str,
        status_code: int,
    ) -> None:
        # The human-readable message becomes the standard exception payload.
        super().__init__(message)
        self.error_code, self.component = error_code, component
        self.component_version, self.status_code = component_version, status_code


@dataclass
class WorkflowExecutionContext:
    """Step-scoped workflow outputs and compatibility lookup helpers."""

    step_outputs: dict[str, dict[str, Any]] = field(default_factory=dict)
    latest_outputs: dict[str, Any] = field(default_factory=dict)

    def set_step_outputs(self, step_id: str, outputs: dict[str, Any]) -> None:
        # Record per-step outputs and fold them into the "latest wins" view.
        self.step_outputs[step_id] = outputs
        for name, value in outputs.items():
            self.latest_outputs[name] = value

    def get_step_output(self, *, step_id: str, output_name: str) -> Any:
        # Strict lookup of one named output of one specific step.
        try:
            return self.step_outputs[step_id][output_name]
        except KeyError:
            raise RuntimeError(f"Workflow definition missing prerequisite for '{step_id}.{output_name}'") from None

    def require_output(self, output_name: str) -> Any:
        # Lookup against the most recent producer of the named output.
        try:
            return self.latest_outputs[output_name]
        except KeyError:
            raise RuntimeError(f"Workflow definition missing prerequisite for '{output_name}'") from None
def execute_workflow(
    request: WorkflowExecuteRequest,
    *,
    workflow_id: str = "dhis2_datavalue_set_v1",
    workflow_definition: WorkflowDefinition | None = None,
    request_params: dict[str, Any] | None = None,
    include_component_run_details: bool = False,
    run_id: str | None = None,
    workflow_definition_source: Literal["catalog", "inline"] = "catalog",
    trigger_type: str = "on_demand",
    schedule_id: str | None = None,
    idempotency_key: str | None = None,
) -> WorkflowExecuteResponse:
    """Execute the feature->download->aggregate->DataValueSet workflow.

    Resolves the workflow (an inline definition takes precedence over the
    catalog ``workflow_id``), records a job, runs all steps, persists a run
    log, and optionally registers a publication for the output. All failure
    paths persist a "failed" run log and mark the job failed (when a job was
    initialized), then re-raise as HTTPException.
    """
    # Runtime owns the run_id and collects per-component run records.
    runtime = WorkflowRuntime(run_id=run_id)
    # Kept None until resolved so the except blocks can tell whether a job
    # record was ever initialized before marking it failed.
    workflow: WorkflowDefinition | None = None

    dataset = get_dataset(request.dataset_id)
    if dataset is None:
        # Fail fast before any job record exists.
        raise HTTPException(
            status_code=404,
            detail=api_error(
                error="dataset_not_found",
                error_code="DATASET_NOT_FOUND",
                message=f"Dataset '{request.dataset_id}' not found",
                resource_id=request.dataset_id,
            ),
        )

    context = WorkflowExecutionContext()

    try:
        if workflow_definition is not None:
            workflow = workflow_definition
        else:
            try:
                workflow = load_workflow_definition(workflow_id)
            except ValueError as exc:
                # Invalid/unknown catalog definition -> 422, no job recorded.
                raise HTTPException(
                    status_code=422,
                    detail=api_error(
                        error="workflow_definition_invalid",
                        error_code="WORKFLOW_DEFINITION_INVALID",
                        message=str(exc),
                    ),
                ) from exc

        # Job id is the run id; persist the accepted job, then mark running.
        initialize_job(
            job_id=runtime.run_id,
            request=request,
            request_payload=request_params,
            workflow=workflow,
            workflow_definition_source=workflow_definition_source,
            workflow_id=workflow.workflow_id,
            workflow_version=workflow.version,
            status=WorkflowJobStatus.RUNNING,
            trigger_type=trigger_type,
            schedule_id=schedule_id,
            idempotency_key=idempotency_key,
        )
        mark_job_running(runtime.run_id)
        _execute_workflow_steps(
            workflow=workflow,
            runtime=runtime,
            request=request,
            request_params=request_params,
            dataset=dataset,
            context=context,
        )
        # Map declared workflow outputs to concrete step outputs, then derive
        # the flat summary fields (bbox, counts, output file, ...).
        exported_outputs = _resolve_workflow_outputs(workflow.outputs, context)
        output_summary = _summarize_workflow_outputs(exported_outputs)
        run_log_file = persist_run_log(
            run_id=runtime.run_id,
            request=request,
            component_runs=runtime.component_runs,
            status="completed",
            output_file=output_summary["output_file"],
        )

        response = WorkflowExecuteResponse(
            status="completed",
            run_id=runtime.run_id,
            workflow_id=workflow.workflow_id,
            workflow_version=workflow.version,
            dataset_id=request.dataset_id,
            outputs=exported_outputs,
            # First declared output is considered primary (dict order).
            primary_output_name=next(iter(workflow.outputs), None),
            bbox=output_summary["bbox"],
            feature_count=output_summary["feature_count"],
            value_count=output_summary["value_count"],
            output_file=output_summary["output_file"],
            run_log_file=run_log_file,
            data_value_set=output_summary["data_value_set"],
            # Detailed component runs are only shipped when asked for.
            component_runs=runtime.component_runs if include_component_run_details else [],
            component_run_details_included=include_component_run_details,
            component_run_details_available=True,
        )
        mark_job_success(job_id=runtime.run_id, response=response)
        # Publication happens after the job is already marked successful; a
        # publication failure therefore surfaces via the generic except below.
        if _should_publish_workflow_output(
            request=request,
            response=response,
            publication=workflow.publication,
            workflow_definition_source=workflow_definition_source,
        ):
            publication_path, publication_asset_format = _build_publication_artifact(
                response=response,
                request=request,
                publication=workflow.publication,
                context=context,
                exported_outputs=exported_outputs,
            )
            register_workflow_output_publication(
                response=response,
                kind=workflow.publication.intent,
                exposure=workflow.publication.exposure,
                published_path=publication_path,
                asset_format=publication_asset_format,
            )
        return response
    except WorkflowComponentError as exc:
        # A typed component failure: preserve its error code, component and
        # HTTP status; persist the failed run log first.
        run_log_file = persist_run_log(
            run_id=runtime.run_id,
            request=request,
            component_runs=runtime.component_runs,
            status="failed",
            error=str(exc),
            error_code=exc.error_code,
            failed_component=exc.component,
            failed_component_version=exc.component_version,
        )
        if workflow is not None:
            mark_job_failed(
                job_id=runtime.run_id,
                error=str(exc),
                error_code=exc.error_code,
                failed_component=exc.component,
                failed_component_version=exc.component_version,
                run_log_file=run_log_file,
            )
        error = "upstream_unreachable" if exc.error_code == "UPSTREAM_UNREACHABLE" else "workflow_execution_failed"
        raise HTTPException(
            status_code=exc.status_code,
            detail=api_error(
                error=error,
                error_code=exc.error_code,
                message=str(exc),
                run_id=runtime.run_id,
                failed_component=exc.component,
                failed_component_version=exc.component_version,
            ),
        ) from exc
    except HTTPException:
        # Already-shaped HTTP errors pass through; still record the failure.
        run_log_file = persist_run_log(
            run_id=runtime.run_id,
            request=request,
            component_runs=runtime.component_runs,
            status="failed",
            error="http_exception",
        )
        if workflow is not None:
            mark_job_failed(job_id=runtime.run_id, error="http_exception", run_log_file=run_log_file)
        raise
    except Exception as exc:
        # Catch-all boundary: persist, mark failed, and wrap as a 500 with
        # the last started component (if any) as best-effort context.
        run_log_file = persist_run_log(
            run_id=runtime.run_id,
            request=request,
            component_runs=runtime.component_runs,
            status="failed",
            error=str(exc),
            error_code="EXECUTION_FAILED",
        )
        if workflow is not None:
            mark_job_failed(
                job_id=runtime.run_id,
                error=str(exc),
                error_code="EXECUTION_FAILED",
                run_log_file=run_log_file,
            )
        last_component = runtime.component_runs[-1].component if runtime.component_runs else "unknown"
        raise HTTPException(
            status_code=500,
            detail=api_error(
                error="workflow_execution_failed",
                error_code="EXECUTION_FAILED",
                message=str(exc),
                run_id=runtime.run_id,
                failed_component=last_component,
                failed_component_version="unknown",
            ),
        ) from exc


def _should_publish_workflow_output(
    *,
    request: WorkflowExecuteRequest,
    response: WorkflowExecuteResponse,
    publication: WorkflowPublicationPolicy,
    workflow_definition_source: Literal["catalog", "inline"],
) -> bool:
    """Apply workflow-level publication policy to a concrete workflow output.

    All gates must pass: caller opted in (request.publish), the workflow is
    publishable with an on_success strategy, server guardrails allow the
    definition source, and (when suffixes are configured) the output file's
    lowercased suffix is in the allowlist.
    """
    if not request.publish:
        return False
    if not publication.publishable:
        return False
    if publication.strategy != "on_success":
        return False
    if not _server_allows_workflow_publication(workflow_definition_source=workflow_definition_source):
        return False
    if publication.required_output_file_suffixes:
        if response.output_file is None:
            return False
        # NOTE(review): comparison lowercases the file suffix only -- assumes
        # configured suffixes are already lowercase; confirm policy normalization.
        suffix = Path(response.output_file).suffix.lower()
        return suffix in publication.required_output_file_suffixes
    return True
def _server_allows_workflow_publication(*, workflow_definition_source: Literal["catalog", "inline"]) -> bool:
    """Apply server-side guardrails to workflow-driven publication.

    Catalog-backed workflow definitions may always publish; inline
    definitions publish only when explicitly enabled via the
    EO_API_ALLOW_INLINE_WORKFLOW_PUBLICATION environment opt-in.
    """
    if workflow_definition_source != "catalog":
        opt_in = os.environ.get("EO_API_ALLOW_INLINE_WORKFLOW_PUBLICATION", "")
        return opt_in.lower() in {"1", "true", "yes"}
    return True
def _materialize_publication_asset(
    *,
    asset_value: Any,
    dataset_id: str,
    publication: WorkflowPublicationPolicy,
) -> tuple[str, str]:
    """Resolve a declared publication asset to a persisted asset path and format."""
    # A string asset is already a path on disk; just derive its format.
    if isinstance(asset_value, str):
        fmt = publication.asset_format or _asset_format_for_path(asset_value)
        return asset_value, fmt
    looks_like_feature_collection = (
        publication.intent.value == "feature_collection"
        and isinstance(asset_value, dict)
        and asset_value.get("type") == "FeatureCollection"
    )
    if looks_like_feature_collection:
        return write_feature_collection_asset(collection=asset_value, dataset_id=dataset_id), "geojson"
    if isinstance(asset_value, (dict, list)):
        fmt = publication.asset_format or "json"
        return write_json_asset(payload=asset_value, dataset_id=dataset_id, suffix=fmt), fmt
    raise ValueError("Declared publication asset must resolve to a file path or JSON-serializable value")


def _resolve_workflow_outputs(
    bindings: dict[str, WorkflowOutputBinding],
    context: WorkflowExecutionContext,
) -> dict[str, Any]:
    """Resolve exported workflow outputs from step-scoped execution context."""
    return {
        name: context.get_step_output(step_id=binding.from_step, output_name=binding.output)
        for name, binding in bindings.items()
        if binding.include_in_response
    }


def _summarize_workflow_outputs(outputs: dict[str, Any]) -> dict[str, Any]:
    """Derive compatibility summary fields from declared workflow outputs."""
    bbox = outputs.get("bbox")
    output_file = outputs.get("output_file")
    features = outputs.get("features")
    records = outputs.get("records")
    data_value_set = outputs.get("data_value_set")

    summary: dict[str, Any] = {
        "bbox": bbox if isinstance(bbox, list) else None,
        "feature_count": None,
        "value_count": None,
        "output_file": output_file if isinstance(output_file, str) else None,
        "data_value_set": data_value_set if isinstance(data_value_set, dict) else None,
    }

    if isinstance(features, dict) and isinstance(features.get("features"), list):
        summary["feature_count"] = len(features["features"])

    # Prefer the DHIS2 payload's dataValues count; fall back to raw records.
    if isinstance(data_value_set, dict):
        values = data_value_set.get("dataValues")
        if isinstance(values, list):
            summary["value_count"] = len(values)
    elif isinstance(records, list):
        summary["value_count"] = len(records)

    return summary


def _asset_format_for_path(path_value: str) -> str:
    """Return the lowercase file extension without the dot, or 'file' when absent."""
    extension = Path(path_value).suffix.lower().lstrip(".")
    return extension or "file"


def _is_upstream_connectivity_error(exc: Exception) -> bool:
    """Heuristically classify an exception as an upstream connectivity failure."""
    needles = (
        "could not connect to server",
        "failed to connect",
        "connection refused",
        "name or service not known",
        "temporary failure in name resolution",
        "timed out",
        "curl error",
    )
    text = str(exc).lower()
    return any(needle in text for needle in needles)


def _execute_workflow_steps(
    *,
    workflow: WorkflowDefinition,
    runtime: WorkflowRuntime,
    request: WorkflowExecuteRequest,
    request_params: dict[str, Any] | None,
    dataset: dict[str, Any],
    context: WorkflowExecutionContext,
) -> None:
    """Execute workflow components using declarative YAML step order."""
    params = request_params or {}
    for step in workflow.steps:
        if step.id is None:
            raise WorkflowComponentError(
                error_code="INPUT_VALIDATION_FAILED",
                message=f"Workflow step '{step.component}' is missing an id",
                component=step.component,
                component_version=step.version,
                status_code=422,
            )
        registry_key = f"{step.component}@{step.version}"
        runtime_definition = component_services.workflow_runtime_registry().get(registry_key)
        if runtime_definition is None:
            raise WorkflowComponentError(
                error_code="INPUT_VALIDATION_FAILED",
                message=f"Unsupported workflow component '{step.component}@{step.version}'",
                component=step.component,
                component_version=step.version,
                status_code=422,
            )
        try:
            step_config = _resolve_step_config(step.config, params)
            component_services.validate_component_runtime_config(step.component, step.version, step_config)
        except ValueError as exc:
            raise WorkflowComponentError(
                error_code="CONFIG_VALIDATION_FAILED",
                message=str(exc),
                component=step.component,
                component_version=step.version,
                status_code=422,
            ) from exc

        try:
            updates = runtime_definition.executor(
                step=step,
                runtime=runtime,
                request=request,
                dataset=dataset,
                resolved_inputs=_resolve_step_inputs(step=step, context=context),
                step_config=step_config,
            )
        except Exception as exc:
            # Map network failures to a retryable 503; everything else is a 500.
            if _is_upstream_connectivity_error(exc):
                raise WorkflowComponentError(
                    error_code="UPSTREAM_UNREACHABLE",
                    message="Could not reach upstream data source. Check network/proxy and retry.",
                    component=step.component,
                    component_version=step.version,
                    status_code=503,
                ) from exc
            raise WorkflowComponentError(
                error_code="EXECUTION_FAILED",
                message=str(exc),
                component=step.component,
                component_version=step.version,
                status_code=500,
            ) from exc

        _validate_step_outputs(step=step, outputs=updates)
        context.set_step_outputs(step.id, updates)


def validate_workflow_steps(
    *,
    workflow: WorkflowDefinition,
    request_params: dict[str, Any] | None = None,
) -> list[dict[str, Any]]:
    """Resolve and validate step configs without executing components."""
    params = request_params or {}
    resolved_steps: list[dict[str, Any]] = []
    for index, step in enumerate(workflow.steps, start=1):
        try:
            resolved_config = _resolve_step_config(step.config, params)
            component_services.validate_component_runtime_config(step.component, step.version, resolved_config)
        except ValueError as exc:
            raise ValueError(f"Step {index} ({step.component}@{step.version}) validation failed: {exc}") from exc
        resolved_steps.append(
            {
                "index": index,
                "id": step.id,
                "component": step.component,
                "version": step.version,
                "resolved_config": resolved_config,
                "resolved_inputs": {
                    input_name: {"from_step": ref.from_step, "output": ref.output}
                    for input_name, ref in step.inputs.items()
                },
            }
        )
    return resolved_steps


def _resolve_step_inputs(step: WorkflowStep, context: WorkflowExecutionContext) -> dict[str, Any]:
    """Resolve one step's declared upstream references into concrete values."""
    return {
        input_name: context.get_step_output(step_id=ref.from_step, output_name=ref.output)
        for input_name, ref in step.inputs.items()
    }
def _resolve_step_config(config: dict[str, Any], request_params: dict[str, Any]) -> dict[str, Any]:
    """Resolve $request. tokens in step config."""
    return {key: _resolve_value(value, request_params) for key, value in config.items()}


def _resolve_value(value: Any, request_params: dict[str, Any]) -> Any:
    """Resolve a config value recursively, substituting $request.<field> tokens."""
    if isinstance(value, str) and value.startswith("$request."):
        field_name = value.removeprefix("$request.")
        try:
            return request_params[field_name]
        except KeyError:
            raise ValueError(f"Unknown request field in config token: {value}") from None
    if isinstance(value, dict):
        return {key: _resolve_value(inner, request_params) for key, inner in value.items()}
    if isinstance(value, list):
        return [_resolve_value(item, request_params) for item in value]
    # Scalars and other literals pass through untouched.
    return value
def resolve_features(config: FeatureSourceConfig) -> tuple[dict[str, Any], list[float]]:
    """Resolve features from a source and return FeatureCollection + bbox."""
    if config.source_type == FeatureSourceType.GEOJSON_FILE:
        raw = _read_geojson_file(config.geojson_path or "")
    elif config.source_type == FeatureSourceType.DHIS2_LEVEL:
        raw = get_org_units_geojson(create_client(), level=config.dhis2_level, parent=config.dhis2_parent)
    else:
        raw = _collection_from_dhis2_ids(create_client(), config.dhis2_ids or [])

    normalized = _normalize_feature_collection(raw)
    return normalized, _bbox_from_feature_collection(normalized)


def feature_id(feature: dict[str, Any], key: str) -> str:
    """Get feature identifier from properties, feature id, or UID fallbacks."""
    props = feature.get("properties", {})
    # First truthy candidate wins, matching DHIS2/GeoJSON conventions.
    value = props.get(key) or feature.get("id") or props.get("id") or props.get("uid")
    if value is None:
        raise ValueError(f"Unable to find feature identifier using key '{key}'")
    return str(value)


def _read_geojson_file(path: str) -> dict[str, Any]:
    """Load a GeoJSON file from disk and normalize it to a FeatureCollection."""
    content = Path(path).read_text(encoding="utf-8")
    return _normalize_feature_collection(json.loads(content))


def _collection_from_dhis2_ids(client: Any, ou_ids: list[str]) -> dict[str, Any]:
    """Fetch each org unit's geometry and merge them into one FeatureCollection."""
    collected: list[dict[str, Any]] = []
    for uid in ou_ids:
        normalized = _normalize_feature_collection(get_org_unit_geojson(client, uid))
        collected.extend(normalized["features"])
    return {"type": "FeatureCollection", "features": collected}


def _normalize_feature_collection(raw: dict[str, Any]) -> dict[str, Any]:
    """Coerce a Feature / bare features payload into a FeatureCollection."""
    kind = raw.get("type")
    if kind == "FeatureCollection":
        return raw
    if kind == "Feature":
        return {"type": "FeatureCollection", "features": [raw]}
    features = raw.get("features")
    if isinstance(features, list):
        return {"type": "FeatureCollection", "features": features}
    raise ValueError("Input is not a valid GeoJSON feature or feature collection")


def _bbox_from_feature_collection(collection: dict[str, Any]) -> list[float]:
    """Compute [minx, miny, maxx, maxy] from a non-empty FeatureCollection."""
    if not collection.get("features"):
        raise ValueError("Feature collection is empty")
    bounds = gpd.read_file(json.dumps(collection)).total_bounds
    return [float(v) for v in bounds]
workflow_id=workflow_id, + workflow_version=workflow_version, + dataset_id=request.dataset_id, + status=status, + created_at=existing.created_at if existing is not None else timestamp, + updated_at=timestamp, + request=request_payload if request_payload is not None else request.model_dump(mode="json"), + orchestration=_build_orchestration_summary( + workflow=workflow, + workflow_definition_source=workflow_definition_source, + ), + run_log_file=existing.run_log_file if existing is not None else None, + output_file=existing.output_file if existing is not None else None, + result=existing.result if existing is not None else None, + error=existing.error if existing is not None else None, + error_code=existing.error_code if existing is not None else None, + failed_component=existing.failed_component if existing is not None else None, + failed_component_version=existing.failed_component_version if existing is not None else None, + trigger_type=trigger_type if existing is None else existing.trigger_type, + schedule_id=schedule_id if existing is None else existing.schedule_id, + idempotency_key=idempotency_key if existing is None else existing.idempotency_key, + ) + _write_job(record) + return record + + +def mark_job_running(job_id: str) -> WorkflowJobRecord: + """Transition an existing job to running.""" + record = _require_job(job_id) + updated = record.model_copy(update={"status": WorkflowJobStatus.RUNNING, "updated_at": _utc_now()}) + _write_job(updated) + return updated + + +def mark_job_success( + *, + job_id: str, + response: WorkflowExecuteResponse, +) -> WorkflowJobRecord: + """Persist successful job completion details.""" + record = _require_job(job_id) + updated = record.model_copy( + update={ + "status": WorkflowJobStatus.SUCCESSFUL, + "updated_at": _utc_now(), + "run_log_file": response.run_log_file, + "output_file": response.output_file, + "result": response.model_dump(mode="json"), + "error": None, + "error_code": None, + "failed_component": None, + 
def mark_job_failed(
    *,
    job_id: str,
    error: str,
    error_code: str | None = None,
    failed_component: str | None = None,
    failed_component_version: str | None = None,
    run_log_file: str | None = None,
) -> WorkflowJobRecord:
    """Persist failed job details."""
    record = _require_job(job_id)
    changes = {
        "status": WorkflowJobStatus.FAILED,
        "updated_at": _utc_now(),
        # Keep a previously recorded run log when the failure path has none.
        "run_log_file": run_log_file or record.run_log_file,
        "error": error,
        "error_code": error_code,
        "failed_component": failed_component,
        "failed_component_version": failed_component_version,
        "result": None,
    }
    updated = record.model_copy(update=changes)
    _write_job(updated)
    return updated


def get_job(job_id: str) -> WorkflowJobRecord | None:
    """Load one persisted job if it exists."""
    stored = get_stored_job(job_id)
    return None if stored is None else _to_public_job_record(stored)


def get_stored_job(job_id: str) -> WorkflowJobStoredRecord | None:
    """Load one persisted job including internal result payload if it exists."""
    path = _job_path(job_id)
    if not path.exists():
        return None
    return WorkflowJobStoredRecord.model_validate_json(path.read_text(encoding="utf-8"))


def list_jobs(*, process_id: str | None = None, status: WorkflowJobStatus | None = None) -> list[WorkflowJobRecord]:
    """List persisted jobs ordered by newest first."""
    jobs = [
        _to_public_job_record(WorkflowJobStoredRecord.model_validate_json(path.read_text(encoding="utf-8")))
        for path in _jobs_dir().glob("*.json")
    ]
    jobs.sort(key=lambda job: job.created_at, reverse=True)
    if process_id is not None:
        jobs = [job for job in jobs if job.process_id == process_id]
    if status is not None:
        jobs = [job for job in jobs if job.status == status]
    return jobs
def get_job_trace(job_id: str) -> dict[str, Any] | None:
    """Return persisted run-trace payload for a workflow job if available."""
    record = get_stored_job(job_id)
    if record is None or record.run_log_file is None:
        return None
    trace_path = Path(record.run_log_file)
    if not trace_path.exists():
        return None
    return cast(dict[str, Any], json.loads(trace_path.read_text(encoding="utf-8")))


def find_job_by_schedule_key(*, schedule_id: str, idempotency_key: str) -> WorkflowJobRecord | None:
    """Return the newest job matching one schedule/idempotency pair."""
    # list_jobs() is already ordered newest-first, so the first match wins.
    matches = (
        job
        for job in list_jobs()
        if job.schedule_id == schedule_id and job.idempotency_key == idempotency_key
    )
    return next(matches, None)
+ config_path, openapi_path = write_generated_pygeoapi_documents() + return { + "job_id": job_id, + "deleted": True, + "deleted_paths": deleted_paths, + "deleted_publication": publication.resource_id if publication is not None else None, + "materialized_config_path": str(config_path), + "materialized_openapi_path": str(openapi_path), + "pygeoapi_runtime_reload_required": False, + } + + +def cleanup_jobs( + *, + dry_run: bool, + keep_latest: int | None = None, + older_than_hours: int | None = None, +) -> dict[str, Any]: + """Apply retention policy to terminal jobs and their run-owned artifacts.""" + if keep_latest is not None and keep_latest < 0: + raise ValueError("keep_latest must be >= 0") + if older_than_hours is not None and older_than_hours < 0: + raise ValueError("older_than_hours must be >= 0") + + terminal_statuses = { + WorkflowJobStatus.SUCCESSFUL, + WorkflowJobStatus.FAILED, + WorkflowJobStatus.DISMISSED, + } + terminal_jobs = [job for job in list_jobs() if job.status in terminal_statuses] + candidates = terminal_jobs + + if older_than_hours is not None: + cutoff = dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=older_than_hours) + candidates = [job for job in candidates if _parse_iso8601(job.created_at) <= cutoff] + + if keep_latest is not None: + protected_ids = {job.job_id for job in terminal_jobs[:keep_latest]} + candidates = [job for job in candidates if job.job_id not in protected_ids] + + deleted_job_ids: list[str] = [] + if not dry_run: + for job in candidates: + deleted = delete_job(job.job_id) + if deleted is not None: + deleted_job_ids.append(job.job_id) + + return { + "dry_run": dry_run, + "keep_latest": keep_latest, + "older_than_hours": older_than_hours, + "candidate_count": len(candidates), + "deleted_count": len(deleted_job_ids), + "candidates": [ + { + "job_id": job.job_id, + "status": job.status, + "created_at": job.created_at, + "workflow_id": job.workflow_id, + "dataset_id": job.dataset_id, + } + for job in candidates + ], + 
def _require_job(job_id: str) -> WorkflowJobStoredRecord:
    """Load a stored job or raise ValueError when it does not exist."""
    record = get_stored_job(job_id)
    if record is None:
        raise ValueError(f"Unknown job_id '{job_id}'")
    return record


def _write_job(record: WorkflowJobStoredRecord) -> None:
    """Persist one stored job record as pretty-printed JSON on disk.

    Every caller (initialize_job / mark_job_*) passes the internal stored
    record, which carries run_id and result — so the parameter is annotated
    as WorkflowJobStoredRecord (previously mis-annotated as the public type).
    """
    _jobs_dir().mkdir(parents=True, exist_ok=True)
    _job_path(record.job_id).write_text(record.model_dump_json(indent=2), encoding="utf-8")


def _job_path(job_id: str) -> Path:
    """Path of the JSON file backing one job record."""
    return _jobs_dir() / f"{job_id}.json"


def _jobs_dir() -> Path:
    """Directory holding all persisted workflow job records."""
    return DOWNLOAD_DIR / "workflow_jobs"


def _utc_now() -> str:
    """Current UTC time as a timezone-aware ISO-8601 string."""
    return dt.datetime.now(dt.timezone.utc).isoformat()


def _parse_iso8601(value: str) -> dt.datetime:
    """Parse an ISO-8601 timestamp (as produced by _utc_now)."""
    return dt.datetime.fromisoformat(value)


def _to_public_job_record(record: WorkflowJobStoredRecord) -> WorkflowJobRecord:
    """Strip internal-only fields (run_id, result) from a stored record."""
    data = record.model_dump(mode="json")
    data.pop("run_id", None)
    data.pop("result", None)
    return WorkflowJobRecord.model_validate(data)


def _build_orchestration_summary(
    *,
    workflow: WorkflowDefinition,
    workflow_definition_source: str,
) -> WorkflowJobOrchestration:
    """Summarize the workflow's declarative step graph for job records."""
    return WorkflowJobOrchestration(
        definition_source=workflow_definition_source,
        step_count=len(workflow.steps),
        components=[step.component for step in workflow.steps],
        steps=[
            WorkflowJobOrchestrationStep(
                id=step.id or step.component,
                component=step.component,
                version=step.version,
                execution_mode=cast(str | None, step.config.get("execution_mode")),
                inputs={
                    input_name: {"from_step": ref.from_step, "output": ref.output}
                    for input_name, ref in step.inputs.items()
                },
            )
            for step in workflow.steps
        ],
    )


def _delete_owned_path(path_value: Any) -> str | None:
    """Delete *path_value* only if it is an existing file under DOWNLOAD_DIR.

    Returns the deleted path string, or None when nothing was removed.
    """
    if not isinstance(path_value, str) or path_value == "":
        return None
    path = Path(path_value)
    if not path.exists() or not path.is_file():
        return None
    try:
        resolved = path.resolve()
        downloads_root = DOWNLOAD_DIR.resolve()
    except OSError:
        return None
    # Guardrail: never delete files outside the downloads tree.
    if downloads_root not in resolved.parents:
        return None
    path.unlink()
    return str(path)


def check_upstream_connectivity(dataset: dict[str, object], timeout_seconds: float = 5.0) -> None:
    """Fail fast if a dataset source host is not reachable.

    Returns None when the dataset has no usable source URL or the TCP
    connection succeeds; propagates OSError on DNS/TCP failure.
    """
    source_url = dataset.get("source_url")
    if not isinstance(source_url, str) or not source_url:
        return

    parsed = urlparse(source_url)
    hostname = parsed.hostname
    if not hostname:
        return
    port = parsed.port or (443 if parsed.scheme == "https" else 80)

    # Fail quickly on DNS/TCP connectivity issues instead of waiting for long GDAL timeouts.
    with socket.create_connection((hostname, port), timeout=timeout_seconds):
        pass
def write_feature_collection_asset(*, collection: dict[str, Any], dataset_id: str) -> str:
    """Persist a ready-made GeoJSON FeatureCollection as a publication asset."""
    return _write_json_asset(payload=collection, dataset_id=dataset_id, suffix="geojson")


def write_json_asset(*, payload: Any, dataset_id: str, suffix: str = "json") -> str:
    """Persist a JSON-serializable publication payload to disk."""
    cleaned = suffix.lstrip(".")
    return _write_json_asset(payload=payload, dataset_id=dataset_id, suffix=cleaned or "json")


def _write_json_asset(*, payload: Any, dataset_id: str, suffix: str) -> str:
    """Write *payload* as pretty-printed JSON under DOWNLOAD_DIR; return the path."""
    DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    stamp = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    target = DOWNLOAD_DIR / f"{dataset_id}_publication_{stamp}.{suffix}"
    target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    return str(target)
def _org_unit_name(properties: dict[str, Any]) -> str | None:
    """Return the first non-blank display-name-like property, if any."""
    for key in ("name", "displayName", "org_unit_name"):
        candidate = properties.get(key)
        if isinstance(candidate, str) and candidate.strip():
            return candidate
    return None


def persist_run_log(
    *,
    run_id: str,
    request: WorkflowExecuteRequest,
    component_runs: list[ComponentRun],
    status: str,
    output_file: str | None = None,
    error: str | None = None,
    error_code: str | None = None,
    failed_component: str | None = None,
    failed_component_version: str | None = None,
) -> str:
    """Write workflow run metadata to disk and return file path."""
    logs_dir = DOWNLOAD_DIR / "workflow_runs"
    logs_dir.mkdir(parents=True, exist_ok=True)
    stamp = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    log_path = logs_dir / f"{stamp}_{run_id}.json"

    payload: dict[str, Any] = {
        "run_id": run_id,
        "status": status,
        "request": request.model_dump(mode="json"),
        "component_runs": [entry.model_dump(mode="json") for entry in component_runs],
        "output_file": output_file,
        "error": error,
        "error_code": error_code,
        "failed_component": failed_component,
        "failed_component_version": failed_component_version,
    }
    log_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    return str(log_path)
a/src/eo_api/workflows/services/runtime.py b/src/eo_api/workflows/services/runtime.py new file mode 100644 index 0000000..10a0aac --- /dev/null +++ b/src/eo_api/workflows/services/runtime.py @@ -0,0 +1,89 @@ +"""Component runtime wrapper for workflow housekeeping metadata.""" + +from __future__ import annotations + +import datetime as dt +import time +import uuid +from collections.abc import Callable +from typing import Any + +from ..schemas import ComponentRun + + +class WorkflowRuntime: + """Capture execution metadata for component orchestration.""" + + def __init__(self, *, run_id: str | None = None) -> None: + self.run_id = run_id or str(uuid.uuid4()) + self.component_runs: list[ComponentRun] = [] + + def run(self, component: str, fn: Callable[..., Any], **kwargs: Any) -> Any: + """Execute one component and record start/end/input/output metadata.""" + started = dt.datetime.now(dt.timezone.utc) + started_perf = time.perf_counter() + + try: + result = fn(**kwargs) + ended = dt.datetime.now(dt.timezone.utc) + self.component_runs.append( + ComponentRun( + component=component, + status="completed", + started_at=started.isoformat(), + ended_at=ended.isoformat(), + duration_ms=int((time.perf_counter() - started_perf) * 1000), + inputs=_to_json_summary(kwargs), + outputs={"result": _to_json_summary(result)}, + ) + ) + return result + except Exception as exc: + ended = dt.datetime.now(dt.timezone.utc) + self.component_runs.append( + ComponentRun( + component=component, + status="failed", + started_at=started.isoformat(), + ended_at=ended.isoformat(), + duration_ms=int((time.perf_counter() - started_perf) * 1000), + inputs=_to_json_summary(kwargs), + outputs=None, + error=str(exc), + ) + ) + raise + + +def _to_json_summary(value: Any, *, depth: int = 0, max_depth: int = 2) -> Any: + """Convert arbitrary values into a compact JSON-safe summary.""" + if depth >= max_depth: + return _fallback_summary(value) + + if value is None or isinstance(value, (str, int, float, bool)): 
+ return value + + if isinstance(value, list): + return [_to_json_summary(v, depth=depth + 1, max_depth=max_depth) for v in value[:20]] + + if isinstance(value, tuple): + return [_to_json_summary(v, depth=depth + 1, max_depth=max_depth) for v in value[:20]] + + if isinstance(value, dict): + out: dict[str, Any] = {} + for i, (k, v) in enumerate(value.items()): + if i >= 30: + out["..."] = "truncated" + break + out[str(k)] = _to_json_summary(v, depth=depth + 1, max_depth=max_depth) + return out + + return _fallback_summary(value) + + +def _fallback_summary(value: Any) -> str: + if hasattr(value, "shape"): + return f"{type(value).__name__}(shape={getattr(value, 'shape')})" + if hasattr(value, "sizes"): + return f"{type(value).__name__}(sizes={getattr(value, 'sizes')})" + return type(value).__name__ diff --git a/src/eo_api/workflows/services/schedules.py b/src/eo_api/workflows/services/schedules.py new file mode 100644 index 0000000..c5582ca --- /dev/null +++ b/src/eo_api/workflows/services/schedules.py @@ -0,0 +1,186 @@ +"""Disk-backed workflow schedule persistence and execution helpers.""" + +from __future__ import annotations + +import datetime as dt +import uuid +from pathlib import Path + +from ...data_manager.services.downloader import DOWNLOAD_DIR +from ..schemas import ( + WorkflowExecuteResponse, + WorkflowJobStatus, + WorkflowSchedule, + WorkflowScheduleCreateRequest, + WorkflowScheduleTriggerResponse, +) +from .definitions import load_workflow_definition +from .engine import execute_workflow +from .job_store import find_job_by_schedule_key +from .simple_mapper import normalize_simple_request + + +def create_schedule(payload: WorkflowScheduleCreateRequest) -> WorkflowSchedule: + """Persist one workflow schedule.""" + timestamp = _utc_now() + workflow_id = payload.workflow_id or payload.request.workflow_id + if payload.workflow_id is not None and payload.request.workflow_id != payload.workflow_id: + raise ValueError("workflow_id must match request.workflow_id 
when both are provided") + schedule = WorkflowSchedule( + schedule_id=str(uuid.uuid4()), + workflow_id=workflow_id, + cron_expression=payload.cron_expression, + request=payload.request.model_copy(update={"workflow_id": workflow_id}), + enabled=payload.enabled, + idempotency_key_template=payload.idempotency_key_template, + retention_policy=payload.retention_policy, + created_at=timestamp, + updated_at=timestamp, + last_triggered_at=None, + ) + _validate_cron(schedule.cron_expression) + load_workflow_definition(schedule.workflow_id) + _write_schedule(schedule) + return schedule + + +def list_schedules(*, workflow_id: str | None = None) -> list[WorkflowSchedule]: + """List persisted schedules ordered by newest first.""" + schedules: list[WorkflowSchedule] = [] + for path in _schedules_dir().glob("*.json"): + schedules.append(WorkflowSchedule.model_validate_json(path.read_text(encoding="utf-8"))) + schedules.sort(key=lambda item: item.created_at, reverse=True) + if workflow_id is not None: + schedules = [item for item in schedules if item.workflow_id == workflow_id] + return schedules + + +def get_schedule(schedule_id: str) -> WorkflowSchedule | None: + """Fetch one persisted schedule.""" + path = _schedule_path(schedule_id) + if not path.exists(): + return None + return WorkflowSchedule.model_validate_json(path.read_text(encoding="utf-8")) + + +def delete_schedule(schedule_id: str) -> WorkflowSchedule | None: + """Delete one persisted schedule.""" + schedule = get_schedule(schedule_id) + if schedule is None: + return None + path = _schedule_path(schedule_id) + if path.exists(): + path.unlink() + return schedule + + +def trigger_schedule( + *, + schedule_id: str, + execution_time: str | None = None, +) -> tuple[WorkflowScheduleTriggerResponse, WorkflowExecuteResponse | None]: + """Execute one schedule immediately with idempotency protection.""" + schedule = get_schedule(schedule_id) + if schedule is None: + raise ValueError(f"Unknown schedule_id '{schedule_id}'") + if 
not schedule.enabled: + raise ValueError(f"Schedule '{schedule_id}' is disabled") + + trigger_time = _parse_execution_time(execution_time) + idempotency_key = _render_idempotency_key( + template=schedule.idempotency_key_template, + workflow_id=schedule.workflow_id, + schedule_id=schedule.schedule_id, + execution_time=trigger_time, + ) + existing_job = find_job_by_schedule_key(schedule_id=schedule.schedule_id, idempotency_key=idempotency_key) + if existing_job is not None: + return ( + WorkflowScheduleTriggerResponse( + schedule_id=schedule.schedule_id, + workflow_id=schedule.workflow_id, + job_id=existing_job.job_id, + status=existing_job.status, + idempotency_key=idempotency_key, + reused_existing_job=True, + ), + None, + ) + + request, _warnings = normalize_simple_request(schedule.request) + response = execute_workflow( + request, + workflow_id=schedule.workflow_id, + request_params=schedule.request.model_dump(exclude_none=True), + include_component_run_details=schedule.request.include_component_run_details, + run_id=str(uuid.uuid4()), + workflow_definition_source="catalog", + trigger_type="scheduled", + schedule_id=schedule.schedule_id, + idempotency_key=idempotency_key, + ) + updated_schedule = schedule.model_copy(update={"updated_at": _utc_now(), "last_triggered_at": _utc_now()}) + _write_schedule(updated_schedule) + return ( + WorkflowScheduleTriggerResponse( + schedule_id=schedule.schedule_id, + workflow_id=schedule.workflow_id, + job_id=response.run_id, + status=WorkflowJobStatus.SUCCESSFUL, + idempotency_key=idempotency_key, + reused_existing_job=False, + ), + response, + ) + + +def _write_schedule(schedule: WorkflowSchedule) -> None: + _schedules_dir().mkdir(parents=True, exist_ok=True) + _schedule_path(schedule.schedule_id).write_text(schedule.model_dump_json(indent=2), encoding="utf-8") + + +def _schedules_dir() -> Path: + return DOWNLOAD_DIR / "workflow_schedules" + + +def _schedule_path(schedule_id: str) -> Path: + return _schedules_dir() / 
f"{schedule_id}.json" + + +def _utc_now() -> str: + return dt.datetime.now(dt.timezone.utc).isoformat() + + +def _validate_cron(value: str) -> None: + parts = value.split() + if len(parts) != 5: + raise ValueError("cron_expression must have 5 space-separated fields") + + +def _parse_execution_time(value: str | None) -> dt.datetime: + if value is None: + return dt.datetime.now(dt.timezone.utc) + parsed = dt.datetime.fromisoformat(value.replace("Z", "+00:00")) + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=dt.timezone.utc) + return parsed.astimezone(dt.timezone.utc) + + +def _render_idempotency_key( + *, + template: str, + workflow_id: str, + schedule_id: str, + execution_time: dt.datetime, +) -> str: + values = { + "workflow_id": workflow_id, + "schedule_id": schedule_id, + "date": execution_time.strftime("%Y-%m-%d"), + "datetime": execution_time.strftime("%Y-%m-%dT%H:%M:%SZ"), + "hour": execution_time.strftime("%Y-%m-%dT%H"), + } + rendered = template + for key, value in values.items(): + rendered = rendered.replace(f"{{{key}}}", value) + return rendered diff --git a/src/eo_api/workflows/services/simple_mapper.py b/src/eo_api/workflows/services/simple_mapper.py new file mode 100644 index 0000000..164b9e6 --- /dev/null +++ b/src/eo_api/workflows/services/simple_mapper.py @@ -0,0 +1,94 @@ +"""Mapper from simplified workflow inputs to internal workflow request.""" + +from __future__ import annotations + +from ...data_registry.services.datasets import get_dataset +from ...shared.api_errors import raise_api_error +from ..schemas import ( + Dhis2DataValueSetConfig, + FeatureSourceConfig, + FeatureSourceType, + SpatialAggregationConfig, + TemporalAggregationConfig, + WorkflowExecuteRequest, + WorkflowRequest, +) + +_IGNORED_FIELDS = ["dry_run", "stage", "flavor", "output_format"] + + +def normalize_simple_request(payload: WorkflowRequest) -> tuple[WorkflowExecuteRequest, list[str]]: + """Translate public workflow request format to internal workflow 
request.""" + inputs = payload + dataset_id = inputs.dataset_id + dataset = get_dataset(dataset_id) + start: str + end: str + feature_source: FeatureSourceConfig + + period_type = str(dataset.get("period_type", "")).lower() if dataset else "" + + if inputs.start_date and inputs.end_date: + if period_type == "yearly": + start = inputs.start_date[:4] + end = inputs.end_date[:4] + elif period_type in {"hourly", "daily", "monthly"}: + # dhis2eo downloaders expect month windows for these dataset types. + start = inputs.start_date[:7] + end = inputs.end_date[:7] + else: + start = inputs.start_date + end = inputs.end_date + elif inputs.start_year is not None and inputs.end_year is not None: + if period_type == "yearly": + start = str(inputs.start_year) + end = str(inputs.end_year) + else: + start = f"{inputs.start_year}-01-01" + end = f"{inputs.end_year}-12-31" + else: + raise_api_error( + 422, + error="workflow_request_invalid", + error_code="REQUEST_VALIDATION_FAILED", + message="Provide either start_date/end_date or start_year/end_year", + ) + + if inputs.org_unit_level is not None: + feature_source = FeatureSourceConfig( + source_type=FeatureSourceType.DHIS2_LEVEL, + dhis2_level=inputs.org_unit_level, + feature_id_property=inputs.feature_id_property, + ) + elif inputs.org_unit_ids: + feature_source = FeatureSourceConfig( + source_type=FeatureSourceType.DHIS2_IDS, + dhis2_ids=inputs.org_unit_ids, + feature_id_property=inputs.feature_id_property, + ) + else: + raise_api_error( + 422, + error="workflow_request_invalid", + error_code="REQUEST_VALIDATION_FAILED", + message="Provide org_unit_level or org_unit_ids", + ) + + normalized = WorkflowExecuteRequest( + dataset_id=dataset_id, + start=start, + end=end, + publish=inputs.publish, + overwrite=inputs.overwrite, + country_code=inputs.country_code, + feature_source=feature_source, + temporal_aggregation=TemporalAggregationConfig( + target_period_type=inputs.temporal_resolution, + method=inputs.temporal_reducer, + ), + 
spatial_aggregation=SpatialAggregationConfig(method=inputs.spatial_reducer), + dhis2=Dhis2DataValueSetConfig(data_element_uid=inputs.data_element), + ) + + warnings = [f"Input field '{field}' is currently accepted but not used in execution" for field in _IGNORED_FIELDS] + return normalized, warnings diff --git a/src/eo_api/workflows/services/spatial.py b/src/eo_api/workflows/services/spatial.py new file mode 100644 index 0000000..8bf3f05 --- /dev/null +++ b/src/eo_api/workflows/services/spatial.py @@ -0,0 +1,63 @@ +"""Spatial aggregation component for gridded datasets.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import xarray as xr +from shapely import contains_xy +from shapely.geometry import shape + +from ...data_manager.services.utils import get_lon_lat_dims, get_time_dim +from .features import feature_id + + +def aggregate_to_features( + ds: xr.Dataset, + *, + variable: str, + features: dict[str, Any], + method: str, + feature_id_property: str, +) -> list[dict[str, Any]]: + """Aggregate one gridded variable into per-feature time series.""" + da = ds[variable] + time_dim = get_time_dim(da) + lon_dim, lat_dim = get_lon_lat_dims(da) + lon_values = da[lon_dim].values + lat_values = da[lat_dim].values + lon_grid, lat_grid = np.meshgrid(lon_values, lat_values) + + output: list[dict[str, Any]] = [] + for feature in features.get("features", []): + geom = shape(feature["geometry"]) + mask = contains_xy(geom, lon_grid, lat_grid) + if not np.any(mask): + continue + + mask_da = xr.DataArray( + mask, + dims=(lat_dim, lon_dim), + coords={lat_dim: da[lat_dim], lon_dim: da[lon_dim]}, + ) + reduced = getattr(da.where(mask_da), method)(dim=[lat_dim, lon_dim], skipna=True) + org_unit = feature_id(feature, feature_id_property) + for t, value in zip(reduced[time_dim].values, reduced.values, strict=True): + if np.isnan(value): + continue + # Keep component outputs JSON-safe for direct API exposure and remote execution. 
+ if isinstance(t, np.datetime64): + time_value: Any = np.datetime_as_string(t, unit="s") + elif isinstance(t, np.generic): + time_value = t.item() + else: + time_value = t + output.append( + { + "org_unit": org_unit, + "time": time_value, + "value": float(value), + } + ) + return output diff --git a/src/eo_api/workflows/services/temporal.py b/src/eo_api/workflows/services/temporal.py new file mode 100644 index 0000000..e244f85 --- /dev/null +++ b/src/eo_api/workflows/services/temporal.py @@ -0,0 +1,25 @@ +"""Temporal aggregation component.""" + +from __future__ import annotations + +from typing import cast + +import xarray as xr + +from ...data_manager.services.utils import get_time_dim +from ..schemas import AggregationMethod, PeriodType + +_PERIOD_TO_FREQ: dict[PeriodType, str] = { + PeriodType.HOURLY: "1h", + PeriodType.DAILY: "1D", + PeriodType.MONTHLY: "MS", + PeriodType.YEARLY: "YS", +} + + +def aggregate_temporal(ds: xr.Dataset, *, period_type: PeriodType, method: AggregationMethod) -> xr.Dataset: + """Resample a dataset over the time dimension to the target period.""" + time_dim = get_time_dim(ds) + freq = _PERIOD_TO_FREQ[period_type] + resampled = ds.resample({time_dim: freq}) + return cast(xr.Dataset, getattr(resampled, method.value)(keep_attrs=True)) diff --git a/tests/conftest.py b/tests/conftest.py index 9c1b3c2..a92c038 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,28 @@ +from pathlib import Path + import pytest from fastapi.testclient import TestClient from eo_api.main import app +from eo_api.publications import pygeoapi as publication_pygeoapi +from eo_api.publications import services as publication_services +from eo_api.workflows.services import datavalueset, job_store, publication_assets, run_logs @pytest.fixture def client() -> TestClient: return TestClient(app) + + +@pytest.fixture(autouse=True) +def isolate_download_artifacts(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """Keep workflow/publication tests from 
writing into the repo download dir.""" + isolated_download_dir = tmp_path / "downloads" + isolated_download_dir.mkdir(parents=True, exist_ok=True) + + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(datavalueset, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(publication_assets, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", isolated_download_dir) diff --git a/tests/test_data_accessor.py b/tests/test_data_accessor.py new file mode 100644 index 0000000..d55adcc --- /dev/null +++ b/tests/test_data_accessor.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +import numpy as np +import pytest +import xarray as xr +from fastapi.testclient import TestClient + +from eo_api.data_accessor.services.accessor import ( + get_coverage_summary, + get_point_values, + get_preview_summary, +) +from eo_api.main import app + + +def test_get_point_values_returns_time_series(monkeypatch: pytest.MonkeyPatch) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), np.array([[[1.0, 2.0]], [[3.0, 4.0]]]))}, + coords={ + "time": np.array(["2024-01-01", "2024-02-01"], dtype="datetime64[ns]"), + "lat": [8.0], + "lon": [1.0, 2.0], + }, + ) + monkeypatch.setattr("eo_api.data_accessor.services.accessor.get_data", lambda *args, **kwargs: ds) + + result = get_point_values( + {"id": "chirps3_precipitation_daily", "variable": "precip", "period_type": "monthly"}, + lon=1.9, + lat=8.0, + start="2024-01", + end="2024-02", + ) + + assert result["dataset_id"] == "chirps3_precipitation_daily" + assert result["variable"] == "precip" + assert result["value_count"] == 2 + assert result["resolved_point"] == {"lon": 2.0, "lat": 8.0} + assert result["values"] == [{"period": "2024-01", "value": 2.0}, {"period": "2024-02", "value": 4.0}] 
def _monthly_precip_fixture() -> xr.Dataset:
    """Two-step monthly `precip` cube on a 1x2 lon grid, shared by accessor tests."""
    return xr.Dataset(
        {"precip": (("time", "lat", "lon"), np.array([[[1.0, 2.0]], [[3.0, 4.0]]]))},
        coords={
            "time": np.array(["2024-01-01", "2024-02-01"], dtype="datetime64[ns]"),
            "lat": [8.0],
            "lon": [1.0, 2.0],
        },
    )


def test_point_query_outside_coverage_returns_typed_error(monkeypatch: pytest.MonkeyPatch) -> None:
    """Querying a point outside the grid yields the typed 422 error payload."""
    grid = _monthly_precip_fixture()
    # Stub registry + accessor so the route never touches real data.
    monkeypatch.setattr("eo_api.data_accessor.services.accessor.get_data", lambda *args, **kwargs: grid)
    monkeypatch.setattr(
        "eo_api.data_registry.services.datasets.get_dataset",
        lambda dataset_id: {"id": dataset_id, "variable": "precip", "period_type": "monthly"},
    )

    resp = TestClient(app).get(
        "/retrieve/chirps3_precipitation_daily/point",
        params={"lon": 99.0, "lat": 99.0, "start": "2024-01", "end": "2024-02"},
    )

    assert resp.status_code == 422
    detail = resp.json()["detail"]
    assert detail["error"] == "point_query_invalid"
    assert detail["error_code"] == "POINT_QUERY_INVALID"
    assert detail["resource_id"] == "chirps3_precipitation_daily"


def test_get_preview_summary_returns_stats_and_sample(monkeypatch: pytest.MonkeyPatch) -> None:
    """Preview summary reports per-subset stats, dims, and a sample capped by max_cells."""
    grid = _monthly_precip_fixture()
    monkeypatch.setattr("eo_api.data_accessor.services.accessor.get_data", lambda *args, **kwargs: grid)

    summary = get_preview_summary(
        {"id": "chirps3_precipitation_daily", "variable": "precip", "period_type": "monthly"},
        start="2024-01",
        end="2024-02",
        bbox=[1.0, 8.0, 2.0, 8.0],
        max_cells=3,
    )

    assert summary["dataset_id"] == "chirps3_precipitation_daily"
    assert summary["stats"] == {"min": 1.0, "max": 4.0, "mean": 2.5, "value_count": 4}
    assert summary["dims"] == {"time": 2, "lat": 1, "lon": 2}
    assert len(summary["sample"]) == 3
    assert summary["sample"][0]["period"] == "2024-01"


def test_preview_endpoint_requires_complete_bbox(monkeypatch: pytest.MonkeyPatch) -> None:
    """A partial bbox (xmin/ymin only) is rejected with the typed 422 payload."""
    monkeypatch.setattr(
        "eo_api.data_registry.services.datasets.get_dataset",
        lambda dataset_id: {"id": dataset_id, "variable": "precip", "period_type": "monthly"},
    )

    resp = TestClient(app).get(
        "/retrieve/chirps3_precipitation_daily/preview",
        params={"start": "2024-01", "end": "2024-02", "xmin": 1.0, "ymin": 8.0},
    )

    assert resp.status_code == 422
    detail = resp.json()["detail"]
    assert detail["error"] == "preview_invalid"
    assert detail["error_code"] == "PREVIEW_INVALID"
    assert detail["resource_id"] == "chirps3_precipitation_daily"


def test_get_coverage_summary_wraps_preview_and_full_coverage(monkeypatch: pytest.MonkeyPatch) -> None:
    """Coverage summary nests the preview subset under the dataset's full coverage."""
    preview_stub = {
        "dataset_id": "chirps3_precipitation_daily",
        "variable": "precip",
        "requested": {"start": "2024-01", "end": "2024-02", "bbox": [1.0, 8.0, 2.0, 8.0]},
        "dims": {"time": 2, "lat": 1, "lon": 2},
        "stats": {"min": 1.0, "max": 4.0, "mean": 2.5, "value_count": 4},
        "sample": [{"period": "2024-01", "lat": 8.0, "lon": 1.0, "value": 1.0}],
    }
    coverage_stub = {
        "coverage": {
            "temporal": {"start": "2024-01", "end": "2024-12"},
            "spatial": {"xmin": 1.0, "ymin": 8.0, "xmax": 2.0, "ymax": 9.0},
        }
    }
    monkeypatch.setattr(
        "eo_api.data_accessor.services.accessor.get_preview_summary",
        lambda *args, **kwargs: preview_stub,
    )
    monkeypatch.setattr(
        "eo_api.data_accessor.services.accessor.get_data_coverage",
        lambda dataset: coverage_stub,
    )

    merged = get_coverage_summary(
        {"id": "chirps3_precipitation_daily", "variable": "precip", "period_type": "monthly"},
        start="2024-01",
        end="2024-02",
        bbox=[1.0, 8.0, 2.0, 8.0],
        max_cells=3,
    )

    assert merged["coverage"]["temporal"] == {"start": "2024-01", "end": "2024-12"}
    assert merged["coverage"]["spatial"] == {"xmin": 1.0, "ymin": 8.0, "xmax": 2.0, "ymax": 9.0}
    assert merged["subset"]["stats"]["mean"] == 2.5
    assert merged["subset"]["sample"][0]["value"] == 1.0
+def test_coverage_endpoint_requires_complete_bbox(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "eo_api.data_registry.services.datasets.get_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip", "period_type": "monthly"}, + ) + + client = TestClient(app) + response = client.get( + "/retrieve/chirps3_precipitation_daily/coverage", + params={"start": "2024-01", "end": "2024-02", "xmin": 1.0, "ymin": 8.0}, + ) + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "coverage_invalid" + assert body["error_code"] == "COVERAGE_INVALID" + assert body["resource_id"] == "chirps3_precipitation_daily" diff --git a/tests/test_raster_routes.py b/tests/test_raster_routes.py new file mode 100644 index 0000000..4c212dd --- /dev/null +++ b/tests/test_raster_routes.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest +import xarray as xr +from fastapi.testclient import TestClient + +from eo_api.main import app +from eo_api.raster import routes as raster_routes + + +def test_raster_capabilities_report_missing_zarr_archive(monkeypatch: pytest.MonkeyPatch) -> None: + client = TestClient(app) + with monkeypatch.context() as patcher: + patcher.setattr(raster_routes, "get_zarr_path", lambda dataset: None) + response = client.get("/raster/chirps3_precipitation_daily/capabilities") + + assert response.status_code == 200 + body = response.json() + assert body["collection_id"] == "chirps3_precipitation_daily" + assert body["kind"] == "coverage" + assert body["titiler"]["eligible"] is False + assert body["titiler"]["reader"] == "xarray" + assert "build_zarr" in body["titiler"]["reason"] + + +def test_raster_variables_route_rejects_resource_without_zarr_archive(monkeypatch: pytest.MonkeyPatch) -> None: + client = TestClient(app) + with monkeypatch.context() as patcher: + patcher.setattr(raster_routes, "get_zarr_path", lambda dataset: None) + 
response = client.get("/raster/chirps3_precipitation_daily/variables") + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "raster_publication_unsupported" + assert body["error_code"] == "RASTER_PUBLICATION_UNSUPPORTED" + + +def test_raster_variables_route_uses_zarr_backed_xarray_reader(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(4, dtype=float).reshape(1, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get("/raster/chirps3_precipitation_daily/variables") + + assert response.status_code == 200 + assert response.json() == ["precip"] + + +def test_raster_preview_requires_datetime_for_temporal_dataset(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get("/raster/chirps3_precipitation_daily/preview.png?variable=precip") + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "raster_datetime_required" + assert body["error_code"] == "RASTER_DATETIME_REQUIRED" + + +def test_raster_preview_with_datetime_renders_single_time_slice( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + zarr_path = tmp_path / 
"chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get("/raster/chirps3_precipitation_daily/preview.png?variable=precip&datetime=2024-01-01") + + assert response.status_code == 200 + assert response.headers["content-type"] == "image/png" + assert response.content + + +def test_raster_preview_with_aggregation_renders_time_reduced_image( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get( + "/raster/chirps3_precipitation_daily/preview.png" + "?variable=precip&aggregation=sum&start=2024-01-01&end=2024-01-02" + ) + + assert response.status_code == 200 + assert response.headers["content-type"] == "image/png" + assert response.content + + +def test_raster_preview_rejects_aggregation_without_range(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + 
monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get("/raster/chirps3_precipitation_daily/preview.png?variable=precip&aggregation=sum") + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "raster_temporal_query_invalid" + assert body["error_code"] == "RASTER_TEMPORAL_QUERY_INVALID" + + +def test_raster_tile_outside_bounds_returns_404(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get( + "/raster/chirps3_precipitation_daily/tiles/WebMercatorQuad/6/30/31.png" + "?variable=precip&aggregation=sum&start=2024-01-01&end=2024-01-02" + ) + + assert response.status_code == 404 diff --git a/tests/test_root.py b/tests/test_root.py index 5353344..f90d3c6 100644 --- a/tests/test_root.py +++ b/tests/test_root.py @@ -1,6 +1,6 @@ from fastapi.testclient import TestClient -from eo_api.schemas import HealthStatus, RootResponse +from eo_api.system.schemas import HealthStatus, RootResponse def test_root_returns_200(client: TestClient) -> None: diff --git a/tests/test_workflows.py b/tests/test_workflows.py new file mode 100644 index 0000000..a291ae9 --- /dev/null +++ b/tests/test_workflows.py @@ -0,0 +1,2762 @@ +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import Any, cast + +import numpy as np +import pytest +import xarray as xr +from fastapi import HTTPException +from fastapi.routing import APIRoute +from fastapi.testclient 
import TestClient + +from eo_api.components import services as component_services +from eo_api.main import app +from eo_api.publications import pygeoapi as publication_pygeoapi +from eo_api.publications import services as publication_services +from eo_api.workflows.schemas import ( + AggregationMethod, + PeriodType, + WorkflowExecuteRequest, + WorkflowExecuteResponse, + WorkflowRequest, +) +from eo_api.workflows.services import engine, job_store, run_logs +from eo_api.workflows.services.definitions import WorkflowDefinition, load_workflow_definition +from eo_api.workflows.services.simple_mapper import normalize_simple_request + + +def _valid_public_payload() -> dict[str, Any]: + return { + "request": { + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "publish": True, + "dry_run": True, + "include_component_run_details": False, + } + } + + +def _standard_workflow_outputs( + *, + feature_step: str = "feature_source", + spatial_step: str = "spatial_aggregation", + build_step: str = "build_datavalueset", +) -> dict[str, dict[str, str]]: + return { + "bbox": {"from_step": feature_step, "output": "bbox"}, + "features": {"from_step": feature_step, "output": "features"}, + "records": {"from_step": spatial_step, "output": "records"}, + "data_value_set": {"from_step": build_step, "output": "data_value_set"}, + "output_file": {"from_step": build_step, "output": "output_file"}, + } + + +def _standard_publication_inputs( + *, + feature_step: str = "feature_source", + spatial_step: str = "spatial_aggregation", + build_step: str = "build_datavalueset", +) -> dict[str, dict[str, str]]: + return { + "features": {"from_step": feature_step, "output": "features"}, + "records": {"from_step": spatial_step, "output": "records"}, + 
"output_file": {"from_step": build_step, "output": "output_file"}, + } + + +def _patch_successful_execution(monkeypatch: pytest.MonkeyPatch) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0.0, 0.0, 1.0, 1.0], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + + +def _patch_successful_execution_multi_period(monkeypatch: pytest.MonkeyPatch) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]], [[2.0]]])}, + coords={"time": ["2024-01-01", "2024-02-01"], "lat": [0], "lon": [0]}, + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0.0, 0.0, 1.0, 1.0], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + 
engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [ + {"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}, + {"org_unit": "OU_1", "time": "2024-02-01", "value": 12.0}, + ], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}, {"value": "12.0"}]}, "/tmp/data/out.json"), + ) + + +def test_workflow_endpoint_exists_once() -> None: + workflow_routes = { + route.path + for route in app.routes + if isinstance(route, APIRoute) and route.path.startswith("/workflows") and "POST" in route.methods + } + assert workflow_routes == { + "/workflows/dhis2-datavalue-set", + "/workflows/execute", + "/workflows/jobs/cleanup", + "/workflows/schedules", + "/workflows/schedules/{schedule_id}/trigger", + "/workflows/validate", + } + + +def test_ogc_process_routes_exist() -> None: + ogc_routes = { + route.path for route in app.routes if isinstance(route, APIRoute) and route.path.startswith("/ogcapi") + } + assert "/ogcapi" in ogc_routes + assert "/ogcapi/conformance" in ogc_routes + assert "/ogcapi/openapi" in ogc_routes + assert "/ogcapi/processes" in ogc_routes + assert "/ogcapi/processes/{process_id}" in ogc_routes + assert "/ogcapi/processes/{process_id}/execution" in ogc_routes + assert "/ogcapi/jobs" in ogc_routes + assert "/ogcapi/jobs/{job_id}" in ogc_routes + assert "/ogcapi/jobs/{job_id}/results" in ogc_routes + assert "/ogcapi/jobs/{job_id}/download" in ogc_routes + + +def test_publication_generated_pygeoapi_routes_exist() -> None: + publication_routes = { + route.path + for route in app.routes + if isinstance(route, APIRoute) and route.path.startswith("/publications/pygeoapi") + } + assert "/publications/pygeoapi/config" in publication_routes + assert "/publications/pygeoapi/openapi" in publication_routes + assert "/publications/pygeoapi/materialize" in publication_routes + + +def test_analytics_viewer_routes_exist() -> None: + analytics_routes = { + 
route.path for route in app.routes if isinstance(route, APIRoute) and route.path.startswith("/analytics") + } + assert "/analytics/publications/{resource_id}" in analytics_routes + assert "/analytics/publications/{resource_id}/viewer" in analytics_routes + + +def test_pygeoapi_runtime_env_points_to_generated_documents() -> None: + config_path = os.environ.get("PYGEOAPI_CONFIG") + openapi_path = os.environ.get("PYGEOAPI_OPENAPI") + assert config_path is not None + assert openapi_path is not None + assert config_path.endswith("pygeoapi-config.generated.yml") + assert openapi_path.endswith("pygeoapi-openapi.generated.yml") + assert Path(config_path).exists() + assert Path(openapi_path).exists() + + +def test_pygeoapi_mount_serves_landing_page(client: TestClient) -> None: + response = client.get("/ogcapi?f=json") + assert response.status_code == 200 + body = response.json() + assert body["title"] == "DHIS2 EO API" + rels = {link["rel"] for link in body["links"]} + assert {"self", "alternate", "data", "processes", "jobs"} <= rels + + +def test_native_ogc_conformance_exists(client: TestClient) -> None: + response = client.get("/ogcapi/conformance") + assert response.status_code == 200 + body = response.json() + conforms_to = set(body["conformsTo"]) + assert "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/core" in conforms_to + assert "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/job-list" in conforms_to + + +def test_native_ogc_openapi_exists(client: TestClient) -> None: + response = client.get("/ogcapi/openapi") + assert response.status_code == 200 + body = response.json() + assert body["openapi"] == "3.0.2" + assert "/jobs/{job_id}/results" in body["paths"] + assert "/processes/{process_id}/execution" in body["paths"] + + +def test_native_ogc_process_description_exposes_inputs_and_outputs(client: TestClient) -> None: + response = client.get("/ogcapi/processes/generic-dhis2-workflow") + assert response.status_code == 200 + body = response.json() + 
assert body["id"] == "generic-dhis2-workflow" + assert "request" in body["inputs"] + assert "schema" in body["inputs"]["request"] + assert "outputs" in body["outputs"] + assert "schema" in body["outputs"]["outputs"] + + +def test_publication_endpoint_missing_uses_typed_error_envelope(client: TestClient) -> None: + response = client.get("/publications/does-not-exist") + assert response.status_code == 404 + body = response.json()["detail"] + assert body["error"] == "published_resource_not_found" + assert body["error_code"] == "PUBLISHED_RESOURCE_NOT_FOUND" + assert body["resource_id"] == "does-not-exist" + + +def test_analytics_endpoint_missing_uses_typed_error_envelope(client: TestClient) -> None: + response = client.get("/analytics/publications/does-not-exist") + assert response.status_code == 404 + body = response.json()["detail"] + assert body["error"] == "published_resource_not_found" + assert body["error_code"] == "PUBLISHED_RESOURCE_NOT_FOUND" + assert body["resource_id"] == "does-not-exist" + + +def test_mapper_validation_uses_typed_error_envelope() -> None: + payload = WorkflowRequest.model_construct( # type: ignore[call-arg] + workflow_id="dhis2_datavalue_set_v1", + dataset_id="chirps3_precipitation_daily", + org_unit_level=3, + data_element="DE_UID", + temporal_resolution=PeriodType.MONTHLY, + temporal_reducer=AggregationMethod.SUM, + spatial_reducer=AggregationMethod.MEAN, + overwrite=False, + dry_run=True, + feature_id_property="id", + include_component_run_details=False, + ) + + with pytest.raises(HTTPException) as exc_info: + normalize_simple_request(payload) + + assert exc_info.value.status_code == 422 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error"] == "workflow_request_invalid" + assert detail["error_code"] == "REQUEST_VALIDATION_FAILED" + + +def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClient) -> None: + response = client.get("/workflows") + assert response.status_code == 200 + body = 
response.json() + assert "workflows" in body + assert len(body["workflows"]) >= 2 + by_id = {item["workflow_id"]: item for item in body["workflows"]} + + default = by_id["dhis2_datavalue_set_v1"] + assert default["version"] == 1 + assert default["publication_publishable"] is True + assert default["publication_intent"] == "feature_collection" + assert default["publication_exposure"] == "ogc" + assert default["publication_asset_format"] is None + assert default["publication_asset_binding"] is None + assert default["publication_inputs"]["features"]["from_step"] == "get_features" + assert default["serving_supported"] is True + assert default["serving_asset_format"] == "geojson" + assert default["serving_targets"] == ["pygeoapi", "analytics"] + assert default["serving_error"] is None + assert default["step_count"] == 5 + assert default["components"] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + + fast = by_id["dhis2_datavalue_set_without_temporal_aggregation_v1"] + assert fast["version"] == 1 + assert fast["publication_publishable"] is False + assert fast["publication_intent"] is None + assert fast["publication_exposure"] is None + assert fast["publication_asset_format"] is None + assert fast["publication_asset_binding"] is None + assert fast["publication_inputs"] == {} + assert fast["serving_supported"] is True + assert fast["serving_asset_format"] == "geojson" + assert fast["serving_targets"] == ["registry"] + assert fast["serving_error"] is None + assert fast["step_count"] == 4 + assert fast["components"] == [ + "feature_source", + "download_dataset", + "spatial_aggregation", + "build_datavalueset", + ] + + +def test_workflow_definition_allows_non_datavalueset_terminal_step_when_outputs_declared() -> None: + definition = WorkflowDefinition.model_validate( + { + "workflow_id": "generic_records_v1", + "version": 1, + "steps": [ + {"id": "get_features", "component": "feature_source", 
"version": "v1"}, + { + "id": "spatial_agg", + "component": "spatial_aggregation", + "version": "v1", + "inputs": { + "bbox": {"from_step": "get_features", "output": "bbox"}, + "features": {"from_step": "get_features", "output": "features"}, + }, + }, + ], + "outputs": { + "features": {"from_step": "get_features", "output": "features"}, + "records": {"from_step": "spatial_agg", "output": "records"}, + }, + } + ) + + assert [step.component for step in definition.steps] == ["feature_source", "spatial_aggregation"] + assert set(definition.outputs) == {"features", "records"} + + +def test_workflow_definition_requires_explicit_outputs() -> None: + with pytest.raises(ValueError, match="declare at least one exported output"): + WorkflowDefinition.model_validate( + { + "workflow_id": "missing_outputs_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1"}, + {"component": "download_dataset", "version": "v1"}, + {"component": "temporal_aggregation", "version": "v1"}, + {"component": "spatial_aggregation", "version": "v1"}, + {"component": "build_datavalueset", "version": "v1"}, + ], + } + ) + + +def test_publishable_workflow_can_declare_publication_asset_without_builder_inputs() -> None: + definition = WorkflowDefinition.model_validate( + { + "workflow_id": "coverage_publish_v1", + "version": 1, + "publication": { + "publishable": True, + "intent": "coverage", + "asset": {"from_step": "build", "output": "output_file"}, + "asset_format": "zarr", + }, + "steps": [ + {"id": "feature_source", "component": "feature_source", "version": "v1"}, + {"id": "download_dataset", "component": "download_dataset", "version": "v1"}, + { + "id": "spatial_aggregation", + "component": "spatial_aggregation", + "version": "v1", + "inputs": { + "bbox": {"from_step": "feature_source", "output": "bbox"}, + "features": {"from_step": "feature_source", "output": "features"}, + }, + }, + { + "id": "build", + "component": "build_datavalueset", + "version": "v1", + "inputs": 
{"records": {"from_step": "spatial_aggregation", "output": "records"}}, + }, + ], + "outputs": _standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build", + ), + } + ) + + assert definition.publication.asset is not None + assert definition.publication.asset.from_step == "build" + + +def test_publishable_workflow_rejects_unsupported_serving_contract() -> None: + with pytest.raises(ValueError, match="Unsupported publication serving contract"): + WorkflowDefinition.model_validate( + { + "workflow_id": "tileset_publish_v1", + "version": 1, + "publication": { + "publishable": True, + "intent": "tileset", + "exposure": "ogc", + "asset": {"from_step": "build", "output": "output_file"}, + "asset_format": "tiles", + }, + "steps": [ + {"id": "feature_source", "component": "feature_source", "version": "v1"}, + {"id": "download_dataset", "component": "download_dataset", "version": "v1"}, + { + "id": "spatial_aggregation", + "component": "spatial_aggregation", + "version": "v1", + "inputs": { + "bbox": {"from_step": "feature_source", "output": "bbox"}, + "features": {"from_step": "feature_source", "output": "features"}, + }, + }, + { + "id": "build", + "component": "build_datavalueset", + "version": "v1", + "inputs": {"records": {"from_step": "spatial_aggregation", "output": "records"}}, + }, + ], + "outputs": _standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build", + ), + } + ) + + +def test_components_catalog_endpoint_returns_five_components(client: TestClient) -> None: + response = client.get("/components") + assert response.status_code == 200 + items = response.json()["components"] + names = {item["name"] for item in items} + assert names == { + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + } + for item in items: + assert item["version"] == "v1" + assert 
isinstance(item["input_schema"], dict) + assert "config_schema" not in item + assert isinstance(item["output_schema"], dict) + assert "EXECUTION_FAILED" in item["error_codes"] + assert item["endpoint"]["method"] == "POST" + assert item["endpoint"]["path"].startswith("/components/") + + +def test_components_catalog_include_internal_includes_config_schema(client: TestClient) -> None: + response = client.get("/components?include_internal=true") + assert response.status_code == 200 + items = response.json()["components"] + assert len(items) >= 5 + for item in items: + assert isinstance(item["config_schema"], dict) + + +def test_workflow_endpoint_returns_response_shape(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + stub = WorkflowExecuteResponse( + status="completed", + run_id="run-123", + workflow_id="dhis2_datavalue_set_v1", + workflow_version=1, + dataset_id="chirps3_precipitation_daily", + bbox=[-13.3, 6.9, -10.1, 10.0], + feature_count=2, + value_count=4, + output_file="/tmp/data/chirps3_datavalueset.json", + run_log_file="/tmp/data/workflow_runs/run-123.json", + data_value_set={ + "dataValues": [ + { + "dataElement": "abc123def45", + "period": "202401", + "orgUnit": "OU_1", + "categoryOptionCombo": "HllvX50cXC0", + "attributeOptionCombo": "HllvX50cXC0", + "value": "12.3", + } + ] + }, + component_runs=[], + ) + + def _execute_stub( + payload: Any, + workflow_id: str = "dhis2_datavalue_set_v1", + request_params: dict[str, Any] | None = None, + include_component_run_details: bool = False, + workflow_definition_source: str = "catalog", + ) -> WorkflowExecuteResponse: + del payload, workflow_id, request_params, include_component_run_details, workflow_definition_source + return stub + + monkeypatch.setattr( + "eo_api.workflows.routes.execute_workflow", + _execute_stub, + ) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + body = response.json() + assert body["status"] == 
"completed" + assert body["run_id"] == "run-123" + assert body["workflow_id"] == "dhis2_datavalue_set_v1" + assert body["workflow_version"] == 1 + assert body["run_log_file"].endswith(".json") + assert "dataValues" in body["data_value_set"] + assert body["component_run_details_included"] is False + assert body["component_run_details_available"] is True + + +def test_workflow_endpoint_validates_required_fields(client: TestClient) -> None: + payload = _valid_public_payload() + payload["request"].pop("org_unit_level") + + response = client.post("/workflows/dhis2-datavalue-set", json=payload) + assert response.status_code == 422 + + +def test_workflow_job_result_missing_uses_typed_error_envelope(client: TestClient) -> None: + response = client.get("/workflows/jobs/does-not-exist/result") + assert response.status_code == 404 + body = response.json()["detail"] + assert body["error"] == "job_not_found" + assert body["error_code"] == "JOB_NOT_FOUND" + assert body["job_id"] == "does-not-exist" + + +def test_pygeoapi_collection_missing_returns_not_found(client: TestClient) -> None: + response = client.get("/pygeoapi/collections/does-not-exist", params={"f": "json"}) + assert response.status_code == 404 + + +def test_ogc_job_results_unavailable_uses_typed_error_envelope( + client: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setattr("eo_api.ogc.routes.execute_workflow", lambda *args, **kwargs: None) + + response = client.post( + "/ogcapi/processes/generic-dhis2-workflow/execution", + headers={"Prefer": "respond-async"}, + json=_valid_public_payload(), + ) + assert response.status_code == 202 + job_id = response.json()["jobID"] + + result_response = client.get(f"/ogcapi/jobs/{job_id}/results") + assert result_response.status_code == 409 + body = result_response.json()["detail"] + assert body["error"] == "job_result_unavailable" + assert body["error_code"] == "JOB_RESULT_UNAVAILABLE" + assert body["job_id"] == job_id + + +def 
test_workflow_endpoint_accepts_simplified_payload(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + normalized = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3, "feature_id_property": "id"}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + stub = WorkflowExecuteResponse( + status="completed", + run_id="run-123", + workflow_id="dhis2_datavalue_set_v1", + workflow_version=1, + dataset_id="chirps3_precipitation_daily", + bbox=[-13.3, 6.9, -10.1, 10.0], + feature_count=2, + value_count=4, + output_file="/tmp/data/chirps3_datavalueset.json", + run_log_file="/tmp/data/workflow_runs/run-123.json", + data_value_set={"dataValues": []}, + component_runs=[], + ) + + def _execute_stub( + payload: Any, + workflow_id: str = "dhis2_datavalue_set_v1", + request_params: dict[str, Any] | None = None, + include_component_run_details: bool = False, + workflow_definition_source: str = "catalog", + ) -> WorkflowExecuteResponse: + del payload, workflow_id, request_params, include_component_run_details, workflow_definition_source + return stub + + monkeypatch.setattr("eo_api.workflows.routes.normalize_simple_request", lambda payload: (normalized, [])) + monkeypatch.setattr( + "eo_api.workflows.routes.execute_workflow", + _execute_stub, + ) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + assert response.json()["status"] == "completed" + + +def test_inline_workflow_execute_endpoint_accepts_assembly(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + stub = WorkflowExecuteResponse( + status="completed", + run_id="run-assembly-123", + workflow_id="adhoc_dhis2_v1", + workflow_version=1, + 
dataset_id="chirps3_precipitation_daily", + bbox=[-13.3, 6.9, -10.1, 10.0], + feature_count=2, + value_count=4, + output_file="/tmp/data/chirps3_datavalueset.json", + run_log_file="/tmp/data/workflow_runs/run-assembly-123.json", + data_value_set={"dataValues": []}, + component_runs=[], + ) + + def _execute_stub( + payload: Any, + workflow_id: str = "dhis2_datavalue_set_v1", + workflow_definition: WorkflowDefinition | None = None, + request_params: dict[str, Any] | None = None, + include_component_run_details: bool = False, + workflow_definition_source: str = "inline", + ) -> WorkflowExecuteResponse: + del payload, request_params, include_component_run_details + assert workflow_id == "adhoc_dhis2_v1" + assert workflow_definition is not None + assert workflow_definition_source == "inline" + assert workflow_definition.workflow_id == "adhoc_dhis2_v1" + assert len(workflow_definition.steps) == 4 + return stub + + monkeypatch.setattr("eo_api.workflows.routes.execute_workflow", _execute_stub) + + response = client.post( + "/workflows/execute", + json={ + "workflow": { + "workflow_id": "adhoc_dhis2_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1", "config": {}}, + {"component": "download_dataset", "version": "v1", "config": {}}, + {"component": "spatial_aggregation", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}}, + ], + "outputs": _standard_workflow_outputs(), + }, + "request": { + "workflow_id": "adhoc_dhis2_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "include_component_run_details": False, + }, + }, + ) + assert response.status_code == 200 + assert response.json()["workflow_id"] == "adhoc_dhis2_v1" + + +def 
test_inline_workflow_execute_endpoint_rejects_bad_component_chain(client: TestClient) -> None: + response = client.post( + "/workflows/execute", + json={ + "workflow": { + "workflow_id": "bad_adhoc_v1", + "version": 1, + "steps": [ + {"component": "download_dataset", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}}, + ], + "outputs": { + "data_value_set": {"from_step": "build_datavalueset", "output": "data_value_set"}, + "output_file": {"from_step": "build_datavalueset", "output": "output_file"}, + }, + }, + "request": { + "workflow_id": "bad_adhoc_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + }, + }, + ) + assert response.status_code == 422 + + +def test_workflow_validate_endpoint_accepts_valid_inline_workflow(client: TestClient) -> None: + response = client.post( + "/workflows/validate", + json={ + "workflow": { + "workflow_id": "adhoc_validate_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1", "config": {}}, + {"component": "download_dataset", "version": "v1", "config": {}}, + {"component": "spatial_aggregation", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}}, + ], + "outputs": _standard_workflow_outputs(), + }, + "request": { + "workflow_id": "adhoc_validate_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + }, + }, + ) + assert response.status_code == 200 + body = response.json() + assert body["valid"] is True + assert body["workflow_id"] == "adhoc_validate_v1" + assert body["publication_publishable"] is False + assert body["publication_intent"] is None + assert body["publication_inputs"] == {} + assert body["serving_supported"] is True + assert body["serving_asset_format"] == "geojson" + 
assert body["serving_targets"] == ["registry"] + assert body["step_count"] == 4 + assert len(body["resolved_steps"]) == 4 + assert body["errors"] == [] + + +def test_workflow_validate_endpoint_rejects_runtime_knobs_in_step_config(client: TestClient) -> None: + response = client.post( + "/workflows/validate", + json={ + "workflow": { + "workflow_id": "adhoc_invalid_config_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1", "config": {}}, + {"component": "download_dataset", "version": "v1", "config": {"overwrite": True}}, + {"component": "spatial_aggregation", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}}, + ], + "outputs": _standard_workflow_outputs(), + }, + "request": { + "workflow_id": "adhoc_invalid_config_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + }, + }, + ) + assert response.status_code == 200 + body = response.json() + assert body["valid"] is False + assert body["publication_publishable"] is False + assert body["serving_supported"] is True + assert body["resolved_steps"] == [] + assert len(body["errors"]) == 1 + assert "validation failed" in body["errors"][0].lower() + + +def test_workflow_validate_endpoint_unknown_workflow_id(client: TestClient) -> None: + response = client.post("/workflows/validate", json={"workflow_id": "does_not_exist"}) + assert response.status_code == 200 + body = response.json() + assert body["valid"] is False + assert body["step_count"] == 0 + assert body["publication_publishable"] is False + assert len(body["errors"]) == 1 + assert "Unknown workflow_id" in body["errors"][0] + + +def test_workflow_job_endpoints_return_persisted_result( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, 
"DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + job_response = client.get(f"/workflows/jobs/{run_id}") + assert job_response.status_code == 200 + job_body = job_response.json() + assert job_body["job_id"] == run_id + assert job_body["status"] == "successful" + assert job_body["process_id"] == "generic-dhis2-workflow" + assert job_body["request"]["dataset_id"] == "chirps3_precipitation_daily" + assert job_body["request"]["start_date"] == "2024-01-01" + assert job_body["request"]["end_date"] == "2024-01-31" + assert job_body["orchestration"]["definition_source"] == "catalog" + assert job_body["orchestration"]["step_count"] == 5 + assert job_body["orchestration"]["components"] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + assert job_body["orchestration"]["steps"][0]["component"] == "feature_source" + assert job_body["orchestration"]["steps"][0]["id"] == "get_features" + assert job_body["orchestration"]["steps"][0]["version"] == "v1" + assert job_body["orchestration"]["steps"][1]["inputs"]["bbox"] == { + "from_step": "get_features", + "output": "bbox", + } + links = {item["rel"]: item["href"] for item in job_body["links"]} + assert links["self"].endswith(f"/workflows/jobs/{run_id}") + assert links["result"].endswith(f"/workflows/jobs/{run_id}/result") + assert links["trace"].endswith(f"/workflows/jobs/{run_id}/trace") + assert links["collection"].endswith(f"/pygeoapi/collections/workflow-output-{run_id}") + assert "analytics" not in links + assert "result" not in job_body + + results_response = client.get(f"/workflows/jobs/{run_id}/result") + assert results_response.status_code == 200 + assert results_response.json()["run_id"] == run_id + + trace_response = client.get(f"/workflows/jobs/{run_id}/trace") 
+ assert trace_response.status_code == 200 + trace_body = trace_response.json() + assert trace_body["run_id"] == run_id + assert trace_body["status"] == "completed" + assert [item["component"] for item in trace_body["component_runs"]] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + + +def test_delete_workflow_job_cascades_derived_artifacts( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + output_path = job_store.DOWNLOAD_DIR / "cascade-test-datavalue-set.json" + output_path.write_text('{"dataValues": [{"value": "10.0"}]}', encoding="utf-8") + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, str(output_path)), + ) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + output_file = Path(response.json()["output_file"]) + run_log_file = Path(response.json()["run_log_file"]) + + publications_response = client.get("/publications", params={"workflow_id": "dhis2_datavalue_set_v1"}) + assert publications_response.status_code == 200 + derived = next( + item for item in publications_response.json()["resources"] if item["resource_id"] == f"workflow-output-{run_id}" + ) + publication_file = publication_services.DOWNLOAD_DIR / "published_resources" / f"workflow-output-{run_id}.json" + publication_asset = Path(derived["path"]) + job_file = job_store.DOWNLOAD_DIR / "workflow_jobs" / f"{run_id}.json" + + assert job_file.exists() + assert run_log_file.exists() + assert output_file.exists() + assert publication_file.exists() + assert publication_asset.exists() + + delete_response = client.delete(f"/workflows/jobs/{run_id}") + assert delete_response.status_code == 200 + delete_body = delete_response.json() + assert delete_body["job_id"] == 
run_id + assert delete_body["deleted"] is True + assert delete_body["deleted_publication"] == f"workflow-output-{run_id}" + assert delete_body["pygeoapi_runtime_reload_required"] is False + + assert not job_file.exists() + assert not run_log_file.exists() + assert not output_file.exists() + assert not publication_file.exists() + assert not publication_asset.exists() + + job_response = client.get(f"/workflows/jobs/{run_id}") + assert job_response.status_code == 404 + + publication_response = client.get(f"/publications/workflow-output-{run_id}") + assert publication_response.status_code == 404 + + +def test_cleanup_workflow_jobs_dry_run_lists_terminal_candidates_without_deleting( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + first = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + second = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert first.status_code == 200 + assert second.status_code == 200 + first_job_id = first.json()["run_id"] + second_job_id = second.json()["run_id"] + + cleanup_response = client.post("/workflows/jobs/cleanup", params={"dry_run": "true", "keep_latest": 1}) + assert cleanup_response.status_code == 200 + body = cleanup_response.json() + assert body["dry_run"] is True + assert body["candidate_count"] == 1 + assert body["deleted_count"] == 0 + assert body["candidates"][0]["job_id"] == first_job_id + assert body["deleted_job_ids"] == [] + + assert client.get(f"/workflows/jobs/{first_job_id}").status_code == 200 + assert client.get(f"/workflows/jobs/{second_job_id}").status_code == 200 + + +def test_cleanup_workflow_jobs_applies_retention_and_cascades_deletion( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + first = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + second = client.post("/workflows/dhis2-datavalue-set", 
json=_valid_public_payload()) + assert first.status_code == 200 + assert second.status_code == 200 + first_job_id = first.json()["run_id"] + second_job_id = second.json()["run_id"] + + apply_response = client.post("/workflows/jobs/cleanup", params={"dry_run": "false", "keep_latest": 1}) + assert apply_response.status_code == 200 + body = apply_response.json() + assert body["dry_run"] is False + assert body["deleted_count"] == 1 + assert body["deleted_job_ids"] == [first_job_id] + + assert client.get(f"/workflows/jobs/{first_job_id}").status_code == 404 + assert client.get(f"/publications/workflow-output-{first_job_id}").status_code == 404 + assert client.get(f"/workflows/jobs/{second_job_id}").status_code == 200 + + +def test_ogc_async_execution_creates_job_and_results( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr("eo_api.ogc.routes.DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + artifact_path = tmp_path / "out.json" + artifact_path.write_text('{"dataValues":[{"value":"10.0"}]}', encoding="utf-8") + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, str(artifact_path)), + ) + + response = client.post( + "/ogcapi/processes/generic-dhis2-workflow/execution", + headers={"Prefer": "respond-async"}, + json=_valid_public_payload(), + ) + assert response.status_code == 202 + body = response.json() + assert body["status"] == "accepted" + job_id = body["jobID"] + + job_response = client.get(f"/ogcapi/jobs/{job_id}") + assert job_response.status_code == 200 + assert job_response.json()["status"] == "successful" + + results_response = client.get(f"/ogcapi/jobs/{job_id}/results") + assert results_response.status_code == 200 + body = results_response.json() + assert "outputs" in body + 
assert isinstance(body["outputs"], list) + output_ids = {item["id"] for item in body["outputs"]} + assert "data_value_set" in output_ids + assert "output_file" in output_ids + + +def test_ogc_job_results_extended_exposes_native_metadata( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr("eo_api.ogc.routes.DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + artifact_path = tmp_path / "out.json" + artifact_path.write_text('{"dataValues":[{"value":"10.0"}]}', encoding="utf-8") + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, str(artifact_path)), + ) + + response = client.post( + "/ogcapi/processes/generic-dhis2-workflow/execution", + headers={"Prefer": "respond-async"}, + json=_valid_public_payload(), + ) + assert response.status_code == 202 + job_id = response.json()["jobID"] + + results_response = client.get(f"/ogcapi/jobs/{job_id}/results", params={"extended": "true"}) + assert results_response.status_code == 200 + body = results_response.json() + assert body["metadata"]["job_id"] == job_id + assert body["metadata"]["workflow_id"] == "dhis2_datavalue_set_v1" + + +def test_ogc_job_download_serves_native_output( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr("eo_api.ogc.routes.DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + artifact_path = tmp_path / "out.json" + artifact_path.write_text('{"dataValues":[{"value":"10.0"}]}', encoding="utf-8") + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, 
str(artifact_path)), + ) + + response = client.post( + "/ogcapi/processes/generic-dhis2-workflow/execution", + headers={"Prefer": "respond-async"}, + json=_valid_public_payload(), + ) + assert response.status_code == 202 + job_id = response.json()["jobID"] + + download_response = client.get(f"/ogcapi/jobs/{job_id}/download") + assert download_response.status_code == 200 + + +def test_publications_endpoint_seeds_source_datasets( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + + response = client.get("/publications") + assert response.status_code == 200 + body = response.json() + resource_ids = {item["resource_id"] for item in body["resources"]} + assert "dataset-chirps3_precipitation_daily" in resource_ids + assert "dataset-worldpop_population_yearly" in resource_ids + + +def test_generated_pygeoapi_config_reflects_collection_registry( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + + response = client.get("/publications/pygeoapi/config") + assert response.status_code == 200 + body = response.json() + resources = body["resources"] + assert len(resources) > 0 + assert "chirps3_precipitation_daily" in resources + first = resources["chirps3_precipitation_daily"] + assert first["type"] == "collection" + assert "title" in first + + +def test_generated_pygeoapi_config_contains_collection_detail( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + + response = client.get("/publications/pygeoapi/config") + assert response.status_code 
== 200 + collection = response.json()["resources"]["chirps3_precipitation_daily"] + assert collection["type"] == "collection" + assert collection["title"]["en"] + assert collection["providers"][0]["type"] == "coverage" + raster_link = next(link for link in collection["links"] if link["rel"] == "raster-capabilities") + assert raster_link["href"].endswith("/raster/chirps3_precipitation_daily/capabilities") + assert raster_link["title"] == "Raster Rendering Capabilities" + + +def test_generated_pygeoapi_config_uses_real_source_coverage_extent( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "y", "x"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "y": xr.Variable(("y",), [9.5, 10.5], attrs={"units": "degrees_north"}), + "x": xr.Variable(("x",), [39.5, 40.5], attrs={"units": "degrees_east"}), + }, + ).rio.write_crs("EPSG:4326").to_zarr(zarr_path, mode="w") + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + monkeypatch.setattr( + publication_services, + "list_datasets", + lambda: [ + { + "id": "chirps3_precipitation_daily", + "name": "Total precipitation (CHIRPS3)", + "variable": "precip", + "period_type": "daily", + "source": "CHIRPS v3", + "source_url": "https://example.test/chirps", + "resolution": "5 km x 5 km", + "units": "mm", + } + ], + ) + monkeypatch.setattr( + publication_services, + "get_data_coverage", + lambda dataset: { + "coverage": { + "spatial": {"xmin": 39.5, "ymin": 9.5, "xmax": 40.5, "ymax": 10.5}, + "temporal": {"start": "2024-01-01", "end": "2024-01-02"}, + } + }, + ) + + response = client.get("/publications/pygeoapi/config") + assert 
response.status_code == 200 + collection = response.json()["resources"]["chirps3_precipitation_daily"] + assert collection["extents"]["spatial"]["bbox"] == [[39.5, 9.5, 40.5, 10.5]] + assert collection["extents"]["temporal"]["begin"] == "2024-01-01" + assert collection["extents"]["temporal"]["end"] == "2024-01-02" + + +def test_ogc_collection_html_for_coverage_includes_raster_controls( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "y", "x"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "y": xr.Variable(("y",), [9.5, 10.5], attrs={"units": "degrees_north"}), + "x": xr.Variable(("x",), [39.5, 40.5], attrs={"units": "degrees_east"}), + }, + ).rio.write_crs("EPSG:4326").to_zarr(zarr_path, mode="w") + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + + response = client.get("/pygeoapi/collections/chirps3_precipitation_daily?f=html") + assert response.status_code == 200 + assert "Update raster map" in response.text + assert "Single-date preview example" in response.text + assert "TileJSON example" in response.text + + +def test_workflow_success_registers_derived_publication( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + publications_response = client.get("/publications", params={"workflow_id": "dhis2_datavalue_set_v1"}) + assert publications_response.status_code 
== 200 + resources = publications_response.json()["resources"] + derived = next(item for item in resources if item["resource_id"] == f"workflow-output-{run_id}") + assert derived["resource_class"] == "derived" + assert derived["job_id"] == run_id + assert derived["ogc_path"] == f"/pygeoapi/collections/workflow-output-{run_id}" + assert derived["exposure"] == "ogc" + assert derived["asset_format"] == "geojson" + assert derived["path"].endswith(".geojson") + assert derived["metadata"]["native_output_file"].endswith(".json") + assert derived["metadata"]["period_count"] == 1 + assert derived["metadata"]["analytics_eligible"] is False + assert not any(link["rel"] == "analytics" for link in derived["links"]) + geojson = Path(derived["path"]).read_text(encoding="utf-8") + assert '"org_unit_name"' in geojson + assert '"period": "2024-01"' in geojson + assert '"period_type"' not in geojson + assert '"dataset_id"' not in geojson + + +def test_dynamic_ogc_collection_routes_reflect_new_publication_without_restart( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + collection_id = f"workflow-output-{run_id}" + + collections_response = client.get("/pygeoapi/collections", params={"f": "json"}) + assert collections_response.status_code == 200 + collections = collections_response.json()["collections"] + derived = next(item for item in collections if item["id"] == collection_id) + assert derived["itemType"] == "feature" + + detail_response = client.get(f"/pygeoapi/collections/{collection_id}", params={"f": "json"}) + assert detail_response.status_code == 200 + detail = detail_response.json() + detail_links = {link["rel"]: link["href"] for link in detail["links"]} + assert detail["id"] == collection_id + assert "analytics" not in detail_links + + 
items_response = client.get(f"/pygeoapi/collections/{collection_id}/items", params={"f": "json", "limit": 5}) + assert items_response.status_code == 200 + items = items_response.json() + assert items["type"] == "FeatureCollection" + assert items["numberReturned"] == 1 + feature_props = items["features"][0]["properties"] + assert set(feature_props) == {"org_unit", "org_unit_name", "period", "value"} + + +def test_dynamic_ogc_collection_routes_drop_deleted_publication_without_restart( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + collection_id = f"workflow-output-{run_id}" + + before_delete = client.get(f"/pygeoapi/collections/{collection_id}", params={"f": "json"}) + assert before_delete.status_code == 200 + + delete_response = client.delete(f"/workflows/jobs/{run_id}") + assert delete_response.status_code == 200 + + after_delete = client.get(f"/pygeoapi/collections/{collection_id}", params={"f": "json"}) + assert after_delete.status_code == 404 + + +def test_analytics_viewer_config_and_html_for_publication( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + resource_id = f"workflow-output-{run_id}" + + config_response = client.get(f"/analytics/publications/{resource_id}") + assert config_response.status_code == 200 + config = config_response.json() + assert config["resource_id"] == resource_id + assert config["data_url"].startswith("/data/") + assert config["links"]["collection"] == f"/pygeoapi/collections/{resource_id}" + + viewer_response = client.get(f"/analytics/publications/{resource_id}/viewer") + assert 
viewer_response.status_code == 200 + assert "Time-aware choropleth view" in viewer_response.text + assert resource_id in viewer_response.text + + +def test_multi_period_publication_adds_analytics_link( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution_multi_period(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + resource_id = f"workflow-output-{run_id}" + + publication_response = client.get(f"/publications/{resource_id}") + assert publication_response.status_code == 200 + publication = publication_response.json() + assert publication["metadata"]["period_count"] == 2 + assert publication["metadata"]["analytics_eligible"] is True + analytics_link = next(link for link in publication["links"] if link["rel"] == "analytics") + assert analytics_link["href"] == f"/analytics/publications/{resource_id}/viewer" + + job_response = client.get(f"/workflows/jobs/{run_id}") + assert job_response.status_code == 200 + job_links = {item["rel"]: item["href"] for item in job_response.json()["links"]} + assert job_links["analytics"].endswith(f"/analytics/publications/{resource_id}/viewer") + + config_response = client.get("/publications/pygeoapi/config") + assert config_response.status_code == 200 + derived = config_response.json()["resources"][resource_id] + analytics_link = next(link for link in derived["links"] if link["rel"] == "analytics") + assert analytics_link["type"] == "text/html" + assert analytics_link["title"] == "Analytics Viewer" + assert analytics_link["href"].endswith(f"/analytics/publications/{resource_id}/viewer") + + 
+def test_workflow_with_publication_disabled_does_not_register_derived_publication( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + + payload = _valid_public_payload() + payload["request"]["workflow_id"] = "dhis2_datavalue_set_without_temporal_aggregation_v1" + + response = client.post("/workflows/dhis2-datavalue-set", json=payload) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + publications_response = client.get( + "/publications", + params={"workflow_id": "dhis2_datavalue_set_without_temporal_aggregation_v1"}, + ) + assert publications_response.status_code == 200 + resources = publications_response.json()["resources"] + assert all(item["resource_id"] != f"workflow-output-{run_id}" for item in resources) + + +def test_inline_workflow_publication_intent_is_blocked_by_server_guardrail( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.delenv("EO_API_ALLOW_INLINE_WORKFLOW_PUBLICATION", raising=False) + _patch_successful_execution(monkeypatch) + + payload = { + "workflow": { + "workflow_id": "adhoc_chirps_mixed_exec_v1", + "version": 1, + "publication": { + "publishable": True, + "strategy": "on_success", + "intent": "feature_collection", + "inputs": _standard_publication_inputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), + }, + "steps": [ + {"component": "feature_source", "version": "v1"}, + {"component": "download_dataset", "version": "v1"}, + {"component": 
"temporal_aggregation", "version": "v1"}, + {"component": "spatial_aggregation", "version": "v1"}, + {"component": "build_datavalueset", "version": "v1"}, + ], + "outputs": _standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), + }, + "request": _valid_public_payload()["request"] | {"workflow_id": "adhoc_chirps_mixed_exec_v1"}, + } + + response = client.post("/workflows/execute", json=payload) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + publications_response = client.get("/publications", params={"workflow_id": "adhoc_chirps_mixed_exec_v1"}) + assert publications_response.status_code == 200 + resources = publications_response.json()["resources"] + assert all(item["resource_id"] != f"workflow-output-{run_id}" for item in resources) + + +def test_inline_workflow_publication_intent_can_be_enabled_by_server_policy( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setenv("EO_API_ALLOW_INLINE_WORKFLOW_PUBLICATION", "true") + _patch_successful_execution(monkeypatch) + + payload = { + "workflow": { + "workflow_id": "adhoc_chirps_mixed_exec_v1", + "version": 1, + "publication": { + "publishable": True, + "strategy": "on_success", + "intent": "feature_collection", + "inputs": _standard_publication_inputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), + }, + "steps": [ + {"component": "feature_source", "version": "v1"}, + {"component": "download_dataset", "version": "v1"}, + {"component": "temporal_aggregation", "version": "v1"}, + {"component": "spatial_aggregation", "version": "v1"}, + {"component": "build_datavalueset", "version": "v1"}, + ], + "outputs": 
_standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), + }, + "request": _valid_public_payload()["request"] | {"workflow_id": "adhoc_chirps_mixed_exec_v1"}, + } + + response = client.post("/workflows/execute", json=payload) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + publications_response = client.get("/publications", params={"workflow_id": "adhoc_chirps_mixed_exec_v1"}) + assert publications_response.status_code == 200 + resources = publications_response.json()["resources"] + derived = next(item for item in resources if item["resource_id"] == f"workflow-output-{run_id}") + assert derived["workflow_id"] == "adhoc_chirps_mixed_exec_v1" + assert derived["exposure"] == "registry_only" + + +def test_ogc_process_sync_execution_links_to_collection( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + + response = client.post("/ogcapi/processes/generic-dhis2-workflow/execution", json=_valid_public_payload()) + assert response.status_code == 200 + body = response.json() + collection_links = [item for item in body["links"] if item["rel"] == "collection"] + assert len(collection_links) == 1 + assert "/pygeoapi/collections/workflow-output-" in collection_links[0]["href"] + + +def test_generated_pygeoapi_config_reflects_publication_registry( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, 
"get_zarr_path", lambda dataset: zarr_path) + + response = client.get("/publications/pygeoapi/config") + assert response.status_code == 200 + body = response.json() + resources = body["resources"] + assert "chirps3_precipitation_daily" in resources + chirps = resources["chirps3_precipitation_daily"] + assert chirps["type"] == "collection" + assert chirps["providers"][0]["type"] == "coverage" + assert chirps["metadata"]["dataset_id"] == "chirps3_precipitation_daily" + + +def test_publishable_workflow_requires_request_publish_flag( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + _patch_successful_execution(monkeypatch) + + payload = _valid_public_payload() + payload["request"]["publish"] = False + + workflow_response = client.post("/workflows/dhis2-datavalue-set", json=payload) + assert workflow_response.status_code == 200 + run_id = workflow_response.json()["run_id"] + + publications_response = client.get("/publications", params={"workflow_id": "dhis2_datavalue_set_v1"}) + assert publications_response.status_code == 200 + resources = publications_response.json()["resources"] + assert all(item["resource_id"] != f"workflow-output-{run_id}" for item in resources) + + +def test_generated_pygeoapi_openapi_includes_derived_collection( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) 
+ monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + _patch_successful_execution(monkeypatch) + + workflow_response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert workflow_response.status_code == 200 + run_id = workflow_response.json()["run_id"] + + response = client.get("/publications/pygeoapi/openapi") + assert response.status_code == 200 + body = response.json() + assert "/collections/chirps3_precipitation_daily" in body["paths"] + assert "chirps3_precipitation_daily" in body["x-generated-resources"] + assert f"/collections/workflow-output-{run_id}" in body["paths"] + assert f"workflow-output-{run_id}" in body["x-generated-resources"] + + +def test_generated_pygeoapi_config_includes_geojson_derived_resource( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + _patch_successful_execution(monkeypatch) + + workflow_response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert workflow_response.status_code == 200 + run_id = workflow_response.json()["run_id"] + + response = client.get("/publications/pygeoapi/config") + assert response.status_code == 200 + resources = response.json()["resources"] + derived = resources[f"workflow-output-{run_id}"] + assert derived["providers"][0]["name"] == "GeoJSON" + assert 
derived["providers"][0]["type"] == "feature" + assert derived["providers"][0]["data"].endswith(".geojson") + assert not any(link["rel"] == "analytics" for link in derived["links"]) + + +def test_generated_pygeoapi_config_includes_derived_coverage_resource( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "derived_coverage.zarr" + zarr_path.mkdir(parents=True) + + response = WorkflowExecuteResponse( + status="completed", + run_id="coverage-run-1", + workflow_id="coverage_publish_v1", + workflow_version=1, + dataset_id="chirps3_precipitation_daily", + outputs={"output_file": str(zarr_path)}, + primary_output_name="output_file", + output_file=str(zarr_path), + run_log_file="/tmp/data/workflow_runs/coverage-run-1.json", + component_runs=[], + ) + publication_services.register_workflow_output_publication( + response=response, + kind=publication_services.PublishedResourceKind.COVERAGE, + exposure=publication_services.PublishedResourceExposure.OGC, + published_path=str(zarr_path), + asset_format="zarr", + ) + + config_response = client.get("/publications/pygeoapi/config") + assert config_response.status_code == 200 + resources = config_response.json()["resources"] + derived = resources["workflow-output-coverage-run-1"] + assert derived["providers"][0]["type"] == "coverage" + assert derived["providers"][0]["data"] == str(zarr_path) + link_rels = {link["rel"] for link in derived["links"]} + assert "collection" in link_rels + assert "raster-capabilities" in link_rels + + +def test_register_workflow_output_publication_rejects_unsupported_serving_contract( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + response = WorkflowExecuteResponse( + status="completed", + run_id="tiles-run-1", + 
workflow_id="tiles_publish_v1", + workflow_version=1, + dataset_id="chirps3_precipitation_daily", + outputs={"output_file": "/tmp/tiles"}, + primary_output_name="output_file", + output_file="/tmp/tiles", + run_log_file="/tmp/data/workflow_runs/tiles-run-1.json", + component_runs=[], + ) + + with pytest.raises(ValueError, match="Unsupported publication serving contract"): + publication_services.register_workflow_output_publication( + response=response, + kind=publication_services.PublishedResourceKind.TILESET, + exposure=publication_services.PublishedResourceExposure.OGC, + published_path="/tmp/tiles", + asset_format="tiles", + ) + + +def test_materialize_generated_pygeoapi_documents_writes_files( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + + response = client.post("/publications/pygeoapi/materialize") + assert response.status_code == 200 + body = response.json() + config_path = Path(body["config_path"]) + openapi_path = Path(body["openapi_path"]) + assert config_path.exists() + assert openapi_path.exists() + config_text = config_path.read_text(encoding="utf-8") + openapi_text = openapi_path.read_text(encoding="utf-8") + assert "resources:" in config_text + assert "http://127.0.0.1:8000/pygeoapi" in config_text + assert "http://127.0.0.1:8000/pygeoapi" in openapi_text + assert "http://127.0.0.1:8000/pygeoapi/collections/chirps3_precipitation_daily" in config_text + + +def test_get_published_resource_normalizes_legacy_pygeoapi_links( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + resources_dir = tmp_path / 
"published_resources" + resources_dir.mkdir(parents=True) + legacy_resource = { + "resource_id": "workflow-output-legacy", + "resource_class": "derived", + "kind": "feature_collection", + "title": "Legacy workflow output", + "description": "Legacy collection", + "dataset_id": "chirps3_precipitation_daily", + "workflow_id": "dhis2_datavalue_set_v1", + "job_id": "legacy", + "run_id": "legacy", + "path": "data/downloads/legacy.geojson", + "ogc_path": "/ogcapi/collections/workflow-output-legacy", + "asset_format": "geojson", + "exposure": "ogc", + "created_at": "2026-03-20T00:00:00+00:00", + "updated_at": "2026-03-20T00:00:00+00:00", + "metadata": {}, + "links": [ + {"rel": "collection", "href": "/ogcapi/collections/workflow-output-legacy"}, + {"rel": "job", "href": "/workflows/jobs/legacy"}, + ], + } + (resources_dir / "workflow-output-legacy.json").write_text(json.dumps(legacy_resource), encoding="utf-8") + + resource = publication_services.get_published_resource("workflow-output-legacy") + + assert resource is not None + assert resource.ogc_path == "/pygeoapi/collections/workflow-output-legacy" + assert resource.links[0]["href"] == "/pygeoapi/collections/workflow-output-legacy" + + +def test_component_spatial_aggregation_serializes_numpy_datetime64( + client: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setattr( + "eo_api.components.routes.services.require_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip"}, + ) + monkeypatch.setattr( + "eo_api.components.routes.services.feature_source_component", + lambda feature_source: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0.0, 0.0, 1.0, 1.0], + ), + ) + monkeypatch.setattr( + "eo_api.components.routes.services.spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": np.datetime64("2024-01-01"), "value": 10.0}], + ) + + response = client.post( + "/components/spatial-aggregation", + json={ + 
"dataset_id": "chirps3_precipitation_daily", + "start": "2024-01", + "end": "2024-01", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 2}, + "method": "mean", + "include_records": True, + }, + ) + assert response.status_code == 200 + body = response.json() + assert body["record_count"] == 1 + assert body["records"][0]["time"] == "2024-01-01T00:00:00" + + +def test_temporal_aggregation_component_passes_through_matching_period_type( + monkeypatch: pytest.MonkeyPatch, +) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + aggregate_called = {"value": False} + + monkeypatch.setattr(component_services, "get_data", lambda **kwargs: ds) + + def _aggregate_temporal(**kwargs: Any) -> xr.Dataset: + aggregate_called["value"] = True + return ds + + monkeypatch.setattr(component_services, "aggregate_temporal", _aggregate_temporal) + + result = component_services.temporal_aggregation_component( + dataset={"id": "chirps3_precipitation_daily", "variable": "precip", "period_type": "daily"}, + start="2024-01-01", + end="2024-01-31", + bbox=None, + target_period_type=PeriodType.DAILY, + method=AggregationMethod.SUM, + ) + + assert result is ds + assert aggregate_called["value"] is False + + +def test_engine_orchestrates_components(monkeypatch: pytest.MonkeyPatch) -> None: + request = { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "country_code": "SLE", + "feature_source": { + "source_type": "geojson_file", + "geojson_path": "tests/data/sierra_leone_districts.geojson", + "feature_id_property": "id", + }, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + + dataset = {"id": "chirps3_precipitation_daily", "variable": "precip"} + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + 
coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + + monkeypatch.setattr(engine, "get_dataset", lambda dataset_id: dataset) + + called: dict[str, Any] = {"downloaded": False} + + def _download_dataset_component(**kwargs: Any) -> None: + called["downloaded"] = True + assert kwargs["bbox"] == [0.0, 0.0, 1.0, 1.0] + assert kwargs["country_code"] == "SLE" + + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0.0, 0.0, 1.0, 1.0], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", _download_dataset_component) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow( + engine.WorkflowExecuteRequest.model_validate(request), + include_component_run_details=True, + ) + assert response.status == "completed" + assert response.run_id + assert response.value_count == 1 + assert response.run_log_file.endswith(".json") + assert len(response.component_runs) == 5 + assert [c.component for c in response.component_runs] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + assert response.component_run_details_included is True + assert response.component_run_details_available is True + assert called["downloaded"] is True + + +def 
test_engine_spatial_aggregation_uses_temporally_aggregated_dataset(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + temporal_ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[31.0]]])}, + coords={"time": ["2024-01"], "lat": [0], "lon": [0]}, + ) + + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: temporal_ds) + + def _spatial_aggregation_component(**kwargs: Any) -> list[dict[str, Any]]: + assert kwargs["aggregated_dataset"] is temporal_ds + return [{"org_unit": "OU_1", "time": "2024-01", "value": 31.0}] + + monkeypatch.setattr(engine.component_services, "spatial_aggregation_component", _spatial_aggregation_component) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "31.0", "period": "202401"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request, include_component_run_details=True) + assert response.status == "completed" + assert response.data_value_set is not None + 
assert response.data_value_set["dataValues"][0]["period"] == "202401" + + +def test_engine_hides_component_details_by_default(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request) + assert response.component_runs == [] + assert response.component_run_details_included is False + assert response.component_run_details_available is True + + +def test_engine_rejects_remote_spatial_after_temporal_aggregation(monkeypatch: 
pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + temporal_ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[31.0]]])}, + coords={"time": ["2024-01"], "lat": [0], "lon": [0]}, + ) + workflow = WorkflowDefinition.model_validate( + { + "workflow_id": "dhis2_datavalue_set_v1", + "version": 1, + "steps": [ + {"component": "feature_source"}, + {"component": "download_dataset"}, + {"component": "temporal_aggregation"}, + { + "component": "spatial_aggregation", + "config": { + "execution_mode": "remote", + "remote_url": "http://localhost:8000/components/spatial-aggregation", + }, + }, + {"component": "build_datavalueset"}, + ], + "outputs": _standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), + } + ) + + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: temporal_ds) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request, workflow_definition=workflow) + + assert 
exc_info.value.status_code == 500 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["failed_component"] == "spatial_aggregation" + assert "local spatial_aggregation" in detail["message"] + + +def test_engine_returns_503_when_upstream_unreachable(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr(engine, "get_dataset", lambda dataset_id: {"id": "chirps3_precipitation_daily"}) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr( + engine.component_services, + "download_dataset_component", + lambda **kwargs: (_ for _ in ()).throw(RuntimeError("Failed to connect to server")), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request) + + assert exc_info.value.status_code == 503 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error"] == "upstream_unreachable" + assert detail["error_code"] == "UPSTREAM_UNREACHABLE" + assert detail["failed_component"] == "download_dataset" + assert detail["failed_component_version"] == "v1" + + +def test_mapper_uses_year_format_for_yearly_dataset() -> None: + normalized, _warnings = normalize_simple_request( + WorkflowRequest.model_validate( + { + "dataset_id": "worldpop_population_yearly", + "country_code": "SLE", + "start_year": 2015, + "end_year": 2026, + 
"org_unit_level": 2, + "data_element": "DE_UID", + "temporal_resolution": "yearly", + } + ) + ) + assert normalized.start == "2015" + assert normalized.end == "2026" + + +def test_mapper_uses_month_format_for_chirps_date_window() -> None: + normalized, _warnings = normalize_simple_request( + WorkflowRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-05-31", + "org_unit_level": 2, + "data_element": "DE_UID", + } + ) + ) + assert normalized.start == "2024-01" + assert normalized.end == "2024-05" + + +def test_default_workflow_definition_has_expected_steps() -> None: + workflow = load_workflow_definition() + assert workflow.workflow_id == "dhis2_datavalue_set_v1" + assert workflow.version == 1 + assert [step.id for step in workflow.steps] == [ + "get_features", + "download", + "temporal_agg", + "spatial_agg", + "build_dhis2_payload", + ] + assert [step.component for step in workflow.steps] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + + +def test_engine_follows_declarative_workflow_order(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + {"id": "features", "component": "feature_source"}, + { + "id": "download", + 
"component": "download_dataset", + "inputs": {"bbox": {"from_step": "features", "output": "bbox"}}, + }, + { + "id": "aggregate", + "component": "spatial_aggregation", + "inputs": { + "bbox": {"from_step": "features", "output": "bbox"}, + "features": {"from_step": "features", "output": "features"}, + }, + }, + { + "id": "build", + "component": "build_datavalueset", + "inputs": {"records": {"from_step": "aggregate", "output": "records"}}, + }, + ], + "outputs": { + "bbox": {"from_step": "features", "output": "bbox"}, + "features": {"from_step": "features", "output": "features"}, + "records": {"from_step": "aggregate", "output": "records"}, + "data_value_set": {"from_step": "build", "output": "data_value_set"}, + "output_file": {"from_step": "build", "output": "output_file"}, + }, + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request, include_component_run_details=True) + assert response.workflow_id == "dhis2_datavalue_set_v1" + assert response.workflow_version == 1 + assert 
[c.component for c in response.component_runs] == [ + "feature_source", + "download_dataset", + "spatial_aggregation", + "build_datavalueset", + ] + + +def test_validate_workflow_reports_explicit_input_wiring(client: TestClient) -> None: + response = client.post("/workflows/validate", json={"workflow_id": "dhis2_datavalue_set_v1"}) + assert response.status_code == 200 + body = response.json() + assert body["valid"] is True + assert body["publication_publishable"] is True + assert body["publication_intent"] == "feature_collection" + assert body["publication_exposure"] == "ogc" + assert body["publication_inputs"]["records"]["from_step"] == "spatial_agg" + assert body["serving_supported"] is True + assert body["serving_asset_format"] == "geojson" + assert body["serving_targets"] == ["pygeoapi", "analytics"] + assert body["resolved_steps"][0]["id"] == "get_features" + assert body["resolved_steps"][1]["resolved_inputs"]["bbox"] == { + "from_step": "get_features", + "output": "bbox", + } + assert body["resolved_steps"][3]["resolved_inputs"]["temporal_dataset"] == { + "from_step": "temporal_agg", + "output": "temporal_dataset", + } + + +def test_schedule_trigger_reuses_existing_job(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + _patch_successful_execution(monkeypatch) + + create_response = client.post( + "/workflows/schedules", + json={ + "cron_expression": "0 2 * * *", + "request": _valid_public_payload()["request"], + }, + ) + assert create_response.status_code == 200 + schedule_id = create_response.json()["schedule_id"] + + trigger_payload = {"execution_time": "2026-03-19T02:00:00Z"} + first_trigger = client.post(f"/workflows/schedules/{schedule_id}/trigger", json=trigger_payload) + assert first_trigger.status_code == 200 + first_body = first_trigger.json() + assert first_body["reused_existing_job"] is False + assert first_body["status"] == "successful" + + second_trigger = client.post(f"/workflows/schedules/{schedule_id}/trigger", 
json=trigger_payload) + assert second_trigger.status_code == 200 + second_body = second_trigger.json() + assert second_body["reused_existing_job"] is True + assert second_body["job_id"] == first_body["job_id"] + + job_response = client.get(f"/workflows/jobs/{first_body['job_id']}") + assert job_response.status_code == 200 + job_body = job_response.json() + assert job_body["trigger_type"] == "scheduled" + assert job_body["schedule_id"] == schedule_id + assert job_body["idempotency_key"] == first_body["idempotency_key"] + + +def test_engine_rejects_unknown_workflow_id(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request, workflow_id="not_allowlisted") + + assert exc_info.value.status_code == 422 + + +def test_engine_resolves_step_config_from_request_params(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + + monkeypatch.setattr( + engine, + 
"load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 2, + "steps": [ + {"component": "feature_source"}, + { + "component": "download_dataset", + "config": {"execution_mode": "$request.download_execution_mode"}, + }, + { + "component": "temporal_aggregation", + "config": {}, + }, + {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + "outputs": _standard_workflow_outputs(), + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + + def _temporal_component(**kwargs: Any) -> xr.Dataset: + assert kwargs["method"].value == "sum" + assert kwargs["target_period_type"].value == "monthly" + return ds + + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", _temporal_component) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow( + request, + request_params={"download_execution_mode": "local"}, + ) + assert response.status == "completed" + + +def test_engine_rejects_invalid_step_config(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": 
"chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 2, + "steps": [ + {"component": "feature_source"}, + {"component": "download_dataset"}, + {"component": "temporal_aggregation", "config": {"invalid_key": 1}}, + {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + "outputs": _standard_workflow_outputs(), + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + persisted: dict[str, Any] = {} + + def _persist_run_log(**kwargs: Any) -> str: + persisted.update(kwargs) + return "/tmp/data/workflow_runs/run.json" + + monkeypatch.setattr(engine, "persist_run_log", _persist_run_log) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request) + + assert exc_info.value.status_code == 422 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error"] == "workflow_execution_failed" + assert detail["error_code"] == "CONFIG_VALIDATION_FAILED" + assert detail["failed_component"] == "temporal_aggregation" + assert detail["failed_component_version"] == "v1" + assert persisted["error_code"] == "CONFIG_VALIDATION_FAILED" + assert persisted["failed_component"] == 
"temporal_aggregation" + assert persisted["failed_component_version"] == "v1" + + +def test_engine_download_dataset_remote_mode_uses_remote_adapter(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + {"component": "feature_source"}, + { + "component": "download_dataset", + "config": { + "execution_mode": "remote", + "remote_url": "http://component-host/components/download-dataset", + "remote_retries": 2, + "remote_timeout_sec": 9, + }, + }, + {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + "outputs": _standard_workflow_outputs(), + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + remote_called: dict[str, Any] = {} + + def _remote_adapter(**kwargs: Any) -> dict[str, Any]: + remote_called.update(kwargs) + return {"status": "downloaded"} + + monkeypatch.setattr(component_services, "_invoke_registered_remote_component", _remote_adapter) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + 
"build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request) + assert response.status == "completed" + assert remote_called["component_key"] == "download_dataset@v1" + assert remote_called["remote_url"] == "http://component-host/components/download-dataset" + assert remote_called["request"].dataset_id == "chirps3_precipitation_daily" + + +def test_engine_rejects_remote_download_without_remote_url(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + {"component": "feature_source"}, + {"component": "download_dataset", "config": {"execution_mode": "remote"}}, + {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + "outputs": _standard_workflow_outputs(), + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + with pytest.raises(HTTPException) as exc_info: + 
engine.execute_workflow(request) + + assert exc_info.value.status_code == 422 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error_code"] == "CONFIG_VALIDATION_FAILED" + assert detail["failed_component"] == "download_dataset" + + +def test_engine_rejects_remote_fields_in_local_mode(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + {"component": "feature_source"}, + { + "component": "download_dataset", + "config": { + "execution_mode": "local", + "remote_url": "http://should-not-be-here/components/download-dataset", + }, + }, + {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + "outputs": _standard_workflow_outputs(), + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request) + + assert exc_info.value.status_code == 422 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error_code"] == "CONFIG_VALIDATION_FAILED" + assert 
detail["failed_component"] == "download_dataset" + + +def test_engine_supports_remote_mode_for_remote_compatible_component_chain( + monkeypatch: pytest.MonkeyPatch, +) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + { + "component": "feature_source", + "config": {"execution_mode": "remote", "remote_url": "http://x/components/feature-source"}, + }, + { + "component": "download_dataset", + "config": { + "execution_mode": "remote", + "remote_url": "http://x/components/download-dataset", + }, + }, + { + "component": "spatial_aggregation", + "config": { + "execution_mode": "remote", + "remote_url": "http://x/components/spatial-aggregation", + }, + }, + { + "component": "build_datavalueset", + "config": { + "execution_mode": "remote", + "remote_url": "http://x/components/build-datavalue-set", + }, + }, + ], + "outputs": _standard_workflow_outputs(), + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + + called: dict[str, bool] = { + "feature": False, + "download": False, + "spatial": False, + "build": False, + } + + def _remote_adapter(**kwargs: Any) -> dict[str, Any]: + component_key = kwargs["component_key"] + if component_key == "feature_source@v1": + called["feature"] = True + return { + "features": {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + "bbox": [0, 0, 1, 1], + } + if component_key 
== "download_dataset@v1": + called["download"] = True + return {"status": "downloaded"} + if component_key == "spatial_aggregation@v1": + called["spatial"] = True + return {"records": [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}]} + if component_key == "build_datavalueset@v1": + called["build"] = True + return {"data_value_set": {"dataValues": [{"value": "10.0"}]}, "output_file": "/tmp/data/out.json"} + raise AssertionError(f"Unexpected remote component key: {component_key}") + + monkeypatch.setattr(component_services, "_invoke_registered_remote_component", _remote_adapter) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request) + assert response.status == "completed" + assert all(called.values()) diff --git a/uv.lock b/uv.lock index 6399c0c..4f11d71 100644 --- a/uv.lock +++ b/uv.lock @@ -814,6 +814,7 @@ dependencies = [ { name = "pygeoapi" }, { name = "python-dotenv" }, { name = "titiler-core" }, + { name = "titiler-xarray" }, { name = "uvicorn" }, { name = "zarr" }, ] @@ -841,6 +842,7 @@ requires-dist = [ { name = "pygeoapi", specifier = ">=0.22.0" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "titiler-core", specifier = ">=1.2.0" }, + { name = "titiler-xarray", specifier = ">=1.2.0" }, { name = "uvicorn", specifier = ">=0.41.0" }, { name = "zarr", specifier = "==3.1.5" }, ] @@ -1917,6 +1919,53 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" }, ] +[[package]] +name = "obstore" +version = "0.9.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/18/cab734edaeb495a861cfbdced9fecdc0866ed1a85aa5a9202ec77cf4723e/obstore-0.9.2.tar.gz", 
hash = "sha256:7ef94323127a971c9dea2484109d6c706eb2b2594a2df13c2dd0a6d21a9a69ae", size = 123731, upload-time = "2026-03-11T19:10:18.19Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/d2/b98058a552849719df56d59a53f7d97e6507b37fca0399a866534800f9fa/obstore-0.9.2-cp311-abi3-macosx_10_12_x86_64.whl", hash = "sha256:50d9c9d6de601ad4805a5a76a1a3d731f7b899383f96ef57276f97bc35202f95", size = 4105494, upload-time = "2026-03-11T19:09:06.573Z" }, + { url = "https://files.pythonhosted.org/packages/ec/55/4386622b94fd028cb2298b4780d5a8e2d959fc4c71e599fb63be869aa83d/obstore-0.9.2-cp311-abi3-macosx_11_0_arm64.whl", hash = "sha256:4c6dcd9b76b802a2278e1cd88ad7305caf3c3c16f800b2bf5f86a606e9e83d96", size = 3878429, upload-time = "2026-03-11T19:09:07.962Z" }, + { url = "https://files.pythonhosted.org/packages/91/8d/0bfad11f1ee5fb1fbdb7833607212ad2586dbd1824b30cf328af63fe92fc/obstore-0.9.2-cp311-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8d46e629beb47565fa67b6ef05919434258d72ef848efa340f911af5de2536da", size = 4041157, upload-time = "2026-03-11T19:09:09.278Z" }, + { url = "https://files.pythonhosted.org/packages/eb/98/bfde825f61a8b2541be9185cd6a4ddbb820de94c79750edc32f9f9dfb795/obstore-0.9.2-cp311-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:350d8cc1cd9564369291396e160ebfa133d705ec349d8c0d444a39158d6ef3e7", size = 4144757, upload-time = "2026-03-11T19:09:10.938Z" }, + { url = "https://files.pythonhosted.org/packages/19/35/1c101f6660ef91e5280c824677d8b5ab11ee25ed52e59b075cd795a86e69/obstore-0.9.2-cp311-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dddd38c9f98fd8eaf11a9805464f0bec7e57d8e04a5e0b0cb17582ec58d2fe41", size = 4427897, upload-time = "2026-03-11T19:09:12.137Z" }, + { url = "https://files.pythonhosted.org/packages/fb/eb/a9bdb64474d4e0ab4e4c0105c959090d6bd7ce38d4a945cae3679ead8c52/obstore-0.9.2-cp311-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:ca872e88e5c719faf1581632e348a6b01331b4f838d7ac29aff226107088dc35", size = 4336227, upload-time = "2026-03-11T19:09:13.822Z" }, + { url = "https://files.pythonhosted.org/packages/b2/ec/e6d39aa311afec2241adb6f2067d7d6ca2eb4e0aab5a95c47796edadd524/obstore-0.9.2-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ee61ac2af5c32c5282fc13b9eba7ffa332f268cb65bc29134ad8ac45e069871", size = 4229010, upload-time = "2026-03-11T19:09:15.503Z" }, + { url = "https://files.pythonhosted.org/packages/1c/fb/a24fd972b66b2d83829e2e89ccf236a759a82f881f909bf4fbe0b6c398ae/obstore-0.9.2-cp311-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:2f430cf8af76985e7ebb8d5f20c8ccef858c608103af6ea95c870f5380cd62f7", size = 4103835, upload-time = "2026-03-11T19:09:16.729Z" }, + { url = "https://files.pythonhosted.org/packages/d0/d4/c8cc60c8afc597712bf6c5059d629e050de521d901dad0f554b268c2d77f/obstore-0.9.2-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1df403f80feef7ac483ed66a2a5a964a469f3756ded533935640c4baf986dd49", size = 4292174, upload-time = "2026-03-11T19:09:18.461Z" }, + { url = "https://files.pythonhosted.org/packages/a7/80/dcf8f31814f25c390aa5501a95b78b9f6456d30cd4625109c2a6a5105ad1/obstore-0.9.2-cp311-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c20f62b7c2f57c6f449215c36af4a8d502082ced2185c0b28f07a5e7c9698181", size = 4276266, upload-time = "2026-03-11T19:09:19.787Z" }, + { url = "https://files.pythonhosted.org/packages/16/71/5f5369fba652c5f83b44381d9e7a3cfe00793301d01802059b52b8663f2c/obstore-0.9.2-cp311-abi3-musllinux_1_2_i686.whl", hash = "sha256:c296e7d60ee132babb7fd01eab946396fa28eb0d88264b9e60320922174e6010", size = 4264118, upload-time = "2026-03-11T19:09:21.081Z" }, + { url = "https://files.pythonhosted.org/packages/c5/50/a5bd1948f2b2efb1039852542829a33a198be0586da7d4247996d3f15d26/obstore-0.9.2-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:76f274a170731a4461d0fe3eefde38f3bdaf346011ae020c94a0bd18bfd3c4bc", size = 4446876, upload-time = 
"2026-03-11T19:09:22.401Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d6/bcc266e391403163ed12dd8cab53012f4db8f5020fb49e3b0a505d7a1bba/obstore-0.9.2-cp311-abi3-win_amd64.whl", hash = "sha256:f644fef2a91973b6c055623692524baf830abb1f8bb3ad348611f0e25224e160", size = 4190639, upload-time = "2026-03-11T19:09:23.637Z" }, + { url = "https://files.pythonhosted.org/packages/9a/da/ea7c5095cf15c026819958f74d3ab7b69aff7ce5bf74188e5df5bba4c252/obstore-0.9.2-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:7161a977e94a94dfd2c4ef66846371bdff46bb8b5f9b91dc29c912deb88a5bb2", size = 4087051, upload-time = "2026-03-11T19:09:24.944Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9f/16d6f41ab87e75a6400959a4708343eaca782b78a5f9de7846c70e2b1381/obstore-0.9.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e3a31fbd68bbe7e061272420337d5ccaf2df7927c2b44ff768531dda02196746", size = 3869338, upload-time = "2026-03-11T19:09:26.404Z" }, + { url = "https://files.pythonhosted.org/packages/99/61/5f13cc91b054d8c93db77e9113ca4924c4320e988284840c8a98238709e6/obstore-0.9.2-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:928da0d131ea33d0b88aa8c3a0dd3f7423261e0c9495444cc14ce0cf62808558", size = 4037703, upload-time = "2026-03-11T19:09:27.743Z" }, + { url = "https://files.pythonhosted.org/packages/58/a2/669620821881559819b8911c4820defa3ffc30a9e49e9d5aca05bd57da45/obstore-0.9.2-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79667de1f0c7eed64b658b3e696bb0565fba4069f6134db502bf4f5f5835aeee", size = 4135488, upload-time = "2026-03-11T19:09:29.232Z" }, + { url = "https://files.pythonhosted.org/packages/9f/12/019e523e97415b4fcfc35b230b270d452fdf5578a7612034c8043c8f2cbf/obstore-0.9.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7318253bc8d03b64473150dad31e611f5bd70a3cc945e3e1d6ac59a901f397c0", size = 4412922, upload-time = "2026-03-11T19:09:30.462Z" }, + { url = 
"https://files.pythonhosted.org/packages/a6/52/d4a8c1bf588a10bfd17a5a11ebc6af834850fe174a0369648d534a2acb81/obstore-0.9.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:133507229632fde08bc202ca2c81119b2314662dab7a96f8348e97f8e97ae36a", size = 4337193, upload-time = "2026-03-11T19:09:31.773Z" }, + { url = "https://files.pythonhosted.org/packages/aa/59/46c1bdaeae2904bb1edddbfc78e35cb0521ab7c58fe92b147a981873fcdc/obstore-0.9.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c73f208abcddcd3edb7a739d5cac777bdb6fac12a358c9b251654ec7df7866", size = 4221641, upload-time = "2026-03-11T19:09:33.067Z" }, + { url = "https://files.pythonhosted.org/packages/44/9c/b0203594666d11da31e4a7f25ace0718cb1591792e3c1de5225fbd7c8246/obstore-0.9.2-cp313-cp313t-manylinux_2_24_aarch64.whl", hash = "sha256:857b2e7d78c8fb36dcb7c6f1fa89401429667195186ced746a500e54a6aaecdb", size = 4103500, upload-time = "2026-03-11T19:09:34.687Z" }, + { url = "https://files.pythonhosted.org/packages/95/bc/b215712ef24a21247d6e8a4049a76d95e2dca517b8b24efb496600c333c7/obstore-0.9.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:24c24fdba5080524ce79b36782a11563ea40d9ae5aa26bb6b81a6d089184e4eb", size = 4290492, upload-time = "2026-03-11T19:09:35.936Z" }, + { url = "https://files.pythonhosted.org/packages/ad/28/5aa0ecdc6c01b6e020f1ff8efcca35493e0c6091a0b72ec1bbb16b5b18a8/obstore-0.9.2-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:778785266aaaf3a73d44ee15e33b72c7ecf0585efeaf8745a1889cc02930ae59", size = 4272220, upload-time = "2026-03-11T19:09:37.223Z" }, + { url = "https://files.pythonhosted.org/packages/06/65/c47b0f972bc7acd64385a964dfbc2efc7361207f490b4d16da789da26fd5/obstore-0.9.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:305c415fdb2230a1e096f6f290cf524d030329ad5c5e1c9c41f121e7d2fb27d7", size = 4256524, upload-time = "2026-03-11T19:09:38.592Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/1d/9f826fd49cd17cdbc8d2a7a75698d1cc9d731ca98d645f1ca9366ac93781/obstore-0.9.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1a544aad84ae774fac339c686f8a4d7b187c4927b6e33ebb9758c58991d4f27f", size = 4440986, upload-time = "2026-03-11T19:09:40.231Z" }, + { url = "https://files.pythonhosted.org/packages/b9/24/0af1af62239c539975b6c9095428f7597e8f5f9617e897e58dbf7b63f1c5/obstore-0.9.2-cp313-cp313t-win_amd64.whl", hash = "sha256:52da6bd719c4962fdfb3c7504e790a89a9b5d27703ee872db01e2075162706fd", size = 4175182, upload-time = "2026-03-11T19:09:41.617Z" }, + { url = "https://files.pythonhosted.org/packages/fa/63/02ca0378938efd1111aa5d689b527c6f3f0c59f4ee440a7b0bf36c528f46/obstore-0.9.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:1bd4790eaa2bb384b58e1c430b2c8816edd7e60216e813c8120014f742e5d280", size = 4087916, upload-time = "2026-03-11T19:09:43.162Z" }, + { url = "https://files.pythonhosted.org/packages/86/9b/604bfb0ec9f117dbb8e936d64e45d95cd9a1fcb63640453566fb3dc66e9d/obstore-0.9.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e6417ac0b5cb32498490ceb7034ea357ea2ea965c855590496d64b2d7808a621", size = 3869703, upload-time = "2026-03-11T19:09:44.673Z" }, + { url = "https://files.pythonhosted.org/packages/44/6a/04bcb394f2a6bb12c4325e6ff3f7ead24592582a593c70669d9cdb5b4e9c/obstore-0.9.2-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:dc07d71e2f9cd30d2db6ac15c2b162d5b14f6a0e7f575ad66676335c256b1a80", size = 4038164, upload-time = "2026-03-11T19:09:45.922Z" }, + { url = "https://files.pythonhosted.org/packages/34/39/2cc1c2c2a7027dd32ae010ac2ae4491b5f653f86c499e6ec20a6a54e799d/obstore-0.9.2-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7606d5f5c682cc8be9f55d3b07d282dfc0e0262ddfd31b8a26b0a6a3787e5b78", size = 4135199, upload-time = "2026-03-11T19:09:47.242Z" }, + { url = 
"https://files.pythonhosted.org/packages/e7/4c/defabe9c19bddf44f22591bcf0fffbc3b2b3202eb5ab99a0d894562f56de/obstore-0.9.2-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80e870ab402ac0f93799049a6680faacbfc2995c60fa87fd683807ce1366e544", size = 4413291, upload-time = "2026-03-11T19:09:48.934Z" }, + { url = "https://files.pythonhosted.org/packages/10/ce/fcfd0436834657a6617d06f07de7630889036c722d35ed9df7913e6caac7/obstore-0.9.2-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:534049c4b970e1e49c33b47a3e2a051fdc9727f844c3d4737aac4e4c89939fe4", size = 4337512, upload-time = "2026-03-11T19:09:50.13Z" }, + { url = "https://files.pythonhosted.org/packages/70/12/565d0cd60f7ae6bb65bde745e182f745a0520f314b32cb802d5f445ad10a/obstore-0.9.2-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c903949b9994003bda82b57f938ab88f458e75fd27eed809547533bffad99a77", size = 4221955, upload-time = "2026-03-11T19:09:51.499Z" }, + { url = "https://files.pythonhosted.org/packages/0e/27/3fb7f28277fbc929168ff7e02a36a64a56e1288936ac10fce49420c343f4/obstore-0.9.2-cp314-cp314t-manylinux_2_24_aarch64.whl", hash = "sha256:3f07a060702c8b1af51ca15a92658a34bb3ff2e38625173c5592c5aae7fdbfcd", size = 4103438, upload-time = "2026-03-11T19:09:52.748Z" }, + { url = "https://files.pythonhosted.org/packages/67/8f/53ed223ee069da797b09f45e9dbf4a1ed24743081be1ec1411ab6baf8ce9/obstore-0.9.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:462a864782a8d7a1a60c55ac19ce4ad53668a39e35d16b98b787fe97d3fec193", size = 4290842, upload-time = "2026-03-11T19:09:54.3Z" }, + { url = "https://files.pythonhosted.org/packages/05/cd/fc94afca13776c4eb8b7a2f27ecb9ee964156d20d699100b719c6c8b6246/obstore-0.9.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:afe36e0452e753c2fece5e6849dd13f209400d5feca668514c0cca2242b0eee8", size = 4273457, upload-time = "2026-03-11T19:09:55.715Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/8e/fb02a7a8d4f966af5e069315075bc4388eb63d9cff1c2f3283f3c5781919/obstore-0.9.2-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3bfae2c634bca903141ef09d6d65e343402de0470e595799881a47ac7c08b2bd", size = 4256979, upload-time = "2026-03-11T19:09:56.983Z" }, + { url = "https://files.pythonhosted.org/packages/c0/87/5621ea304d39b4099d36bfa50dce901eb37b3861e2592d76baa26031d407/obstore-0.9.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:71d4059b5e948fe6e8cfc2b77da9c2fc944dfe0ee98090d985e60dd6ebecd7f6", size = 4441545, upload-time = "2026-03-11T19:09:58.59Z" }, + { url = "https://files.pythonhosted.org/packages/30/44/5a7b98d5d92a2267df7a9a905b3cc4f0ca98fbf207b9fae5179a6838a80b/obstore-0.9.2-cp314-cp314t-win_amd64.whl", hash = "sha256:e75295c9c522dde5020d4ff763315af75a165a8a6b8d7f9ed247ce17b7d7f7b0", size = 4175247, upload-time = "2026-03-11T19:10:00.111Z" }, +] + [[package]] name = "opentelemetry-api" version = "1.39.1" @@ -3324,6 +3373,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/d8/20d2982580c1e13025f7e54391f0b2bbf669cb2b1462f42b64d8fe3cf50c/titiler_core-1.2.0-py3-none-any.whl", hash = "sha256:ba7f34f83b3dab0cae612b88ad087be230bbce2043562e17b8ed9182484c4642", size = 88373, upload-time = "2026-02-09T14:37:52.263Z" }, ] +[[package]] +name = "titiler-xarray" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "obstore" }, + { name = "rioxarray" }, + { name = "titiler-core" }, + { name = "xarray" }, + { name = "zarr" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/b2/e6aec77d4160f610b49e95b9edd2ef585c7f8c83900a0ca66b5c6a02acfc/titiler_xarray-1.2.0.tar.gz", hash = "sha256:7e13b753e636ee5af4db1d7fbc84e8dfb58ba0ae0fdcccefb01d4ffdae82ba8d", size = 32428, upload-time = "2026-02-09T14:37:55.718Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/58/d3/a3238916c0016a349f309e4ff4ab119c02063317c26d9eacdf4da136c27a/titiler_xarray-1.2.0-py3-none-any.whl", hash = "sha256:781489360d4562e33dd782187b10706ed619b7e0a0ce13c6ff7f459e6ff75915", size = 34150, upload-time = "2026-02-09T14:37:54.446Z" }, +] + [[package]] name = "toml" version = "0.10.2"