From 5f9627f65476f93e09e57f57f81faeeb3ae67b47 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Wed, 11 Mar 2026 23:33:59 +0100 Subject: [PATCH 01/15] Implement declarative workflow engine with discoverable components --- data/workflows/dhis2_datavalue_set.yaml | 8 + ...alue_set_without_temporal_aggregation.yaml | 7 + src/eo_api/components/__init__.py | 6 + src/eo_api/components/routes.py | 117 +++++ src/eo_api/components/schemas.py | 121 +++++ src/eo_api/components/services.py | 141 +++++ src/eo_api/data_accessor/__init__.py | 5 +- src/eo_api/data_accessor/routes.py | 27 +- src/eo_api/data_accessor/services/__init__.py | 4 +- src/eo_api/data_accessor/services/accessor.py | 27 +- src/eo_api/data_manager/__init__.py | 5 +- src/eo_api/data_manager/routes.py | 16 +- src/eo_api/data_manager/services/__init__.py | 6 +- .../data_manager/services/downloader.py | 19 +- src/eo_api/data_manager/services/utils.py | 3 - src/eo_api/data_registry/__init__.py | 5 +- src/eo_api/data_registry/routes.py | 9 +- src/eo_api/data_registry/services/__init__.py | 4 +- src/eo_api/data_registry/services/datasets.py | 2 +- src/eo_api/main.py | 12 +- src/eo_api/shared/dhis2_adapter.py | 2 +- src/eo_api/shared/time.py | 5 +- src/eo_api/startup.py | 3 +- src/eo_api/system/__init__.py | 5 +- src/eo_api/system/routes.py | 1 - src/eo_api/workflows/__init__.py | 6 + src/eo_api/workflows/routes.py | 41 ++ src/eo_api/workflows/schemas.py | 176 +++++++ src/eo_api/workflows/services/__init__.py | 1 + src/eo_api/workflows/services/datavalueset.py | 65 +++ src/eo_api/workflows/services/definitions.py | 132 +++++ src/eo_api/workflows/services/engine.py | 207 ++++++++ src/eo_api/workflows/services/features.py | 69 +++ src/eo_api/workflows/services/preflight.py | 23 + src/eo_api/workflows/services/run_logs.py | 37 ++ src/eo_api/workflows/services/runtime.py | 89 ++++ .../workflows/services/simple_mapper.py | 86 +++ src/eo_api/workflows/services/spatial.py | 56 ++ 
src/eo_api/workflows/services/temporal.py | 25 + tests/test_root.py | 2 +- tests/test_workflows.py | 490 ++++++++++++++++++ 41 files changed, 1999 insertions(+), 66 deletions(-) create mode 100644 data/workflows/dhis2_datavalue_set.yaml create mode 100644 data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml create mode 100644 src/eo_api/components/__init__.py create mode 100644 src/eo_api/components/routes.py create mode 100644 src/eo_api/components/schemas.py create mode 100644 src/eo_api/components/services.py create mode 100644 src/eo_api/workflows/__init__.py create mode 100644 src/eo_api/workflows/routes.py create mode 100644 src/eo_api/workflows/schemas.py create mode 100644 src/eo_api/workflows/services/__init__.py create mode 100644 src/eo_api/workflows/services/datavalueset.py create mode 100644 src/eo_api/workflows/services/definitions.py create mode 100644 src/eo_api/workflows/services/engine.py create mode 100644 src/eo_api/workflows/services/features.py create mode 100644 src/eo_api/workflows/services/preflight.py create mode 100644 src/eo_api/workflows/services/run_logs.py create mode 100644 src/eo_api/workflows/services/runtime.py create mode 100644 src/eo_api/workflows/services/simple_mapper.py create mode 100644 src/eo_api/workflows/services/spatial.py create mode 100644 src/eo_api/workflows/services/temporal.py create mode 100644 tests/test_workflows.py diff --git a/data/workflows/dhis2_datavalue_set.yaml b/data/workflows/dhis2_datavalue_set.yaml new file mode 100644 index 0000000..180da1d --- /dev/null +++ b/data/workflows/dhis2_datavalue_set.yaml @@ -0,0 +1,8 @@ +workflow_id: dhis2_datavalue_set_v1 +version: 1 +steps: + - component: feature_source + - component: download_dataset + - component: temporal_aggregation + - component: spatial_aggregation + - component: build_datavalueset diff --git a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml 
b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml new file mode 100644 index 0000000..6d1b1f0 --- /dev/null +++ b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml @@ -0,0 +1,7 @@ +workflow_id: dhis2_datavalue_set_without_temporal_aggregation_v1 +version: 1 +steps: + - component: feature_source + - component: download_dataset + - component: spatial_aggregation + - component: build_datavalueset diff --git a/src/eo_api/components/__init__.py b/src/eo_api/components/__init__.py new file mode 100644 index 0000000..dc357fd --- /dev/null +++ b/src/eo_api/components/__init__.py @@ -0,0 +1,6 @@ +"""Reusable workflow components exposed as API capabilities.""" + +from . import routes as routes +from . import services as services + +__all__ = ["routes", "services"] diff --git a/src/eo_api/components/routes.py b/src/eo_api/components/routes.py new file mode 100644 index 0000000..73519c2 --- /dev/null +++ b/src/eo_api/components/routes.py @@ -0,0 +1,117 @@ +"""Component discovery and execution endpoints.""" + +from __future__ import annotations + +from fastapi import APIRouter + +from ..data_manager.services.constants import BBOX +from . 
import services +from .schemas import ( + BuildDataValueSetRunRequest, + BuildDataValueSetRunResponse, + ComponentCatalogResponse, + DownloadDatasetRunRequest, + DownloadDatasetRunResponse, + FeatureSourceRunRequest, + FeatureSourceRunResponse, + SpatialAggregationRunRequest, + SpatialAggregationRunResponse, + TemporalAggregationRunRequest, + TemporalAggregationRunResponse, +) + +router = APIRouter() + + +@router.get("/components", response_model=ComponentCatalogResponse) +def list_components() -> ComponentCatalogResponse: + """List all discoverable reusable components.""" + return ComponentCatalogResponse(components=services.component_catalog()) + + +@router.post("/components/feature-source", response_model=FeatureSourceRunResponse) +def run_feature_source(payload: FeatureSourceRunRequest) -> FeatureSourceRunResponse: + """Resolve feature source to features and bbox.""" + features, bbox = services.feature_source_component(payload.feature_source) + return FeatureSourceRunResponse( + bbox=bbox, + feature_count=len(features["features"]), + features=features if payload.include_features else None, + ) + + +@router.post("/components/download-dataset", response_model=DownloadDatasetRunResponse) +def run_download_dataset(payload: DownloadDatasetRunRequest) -> DownloadDatasetRunResponse: + """Download dataset files for the selected period/scope.""" + dataset = services.require_dataset(payload.dataset_id) + bbox = payload.bbox or BBOX + services.download_dataset_component( + dataset=dataset, + start=payload.start, + end=payload.end, + overwrite=payload.overwrite, + country_code=payload.country_code, + bbox=bbox, + ) + return DownloadDatasetRunResponse( + status="completed", + dataset_id=payload.dataset_id, + start=payload.start, + end=payload.end, + ) + + +@router.post("/components/temporal-aggregation", response_model=TemporalAggregationRunResponse) +def run_temporal_aggregation(payload: TemporalAggregationRunRequest) -> TemporalAggregationRunResponse: + """Aggregate a 
dataset temporally.""" + dataset = services.require_dataset(payload.dataset_id) + ds = services.temporal_aggregation_component( + dataset=dataset, + start=payload.start, + end=payload.end, + bbox=payload.bbox, + target_period_type=payload.target_period_type, + method=payload.method, + ) + return TemporalAggregationRunResponse( + dataset_id=payload.dataset_id, + sizes={str(k): int(v) for k, v in ds.sizes.items()}, + dims=[str(d) for d in ds.dims], + ) + + +@router.post("/components/spatial-aggregation", response_model=SpatialAggregationRunResponse) +def run_spatial_aggregation(payload: SpatialAggregationRunRequest) -> SpatialAggregationRunResponse: + """Aggregate a dataset spatially to features.""" + dataset = services.require_dataset(payload.dataset_id) + features, bbox = services.feature_source_component(payload.feature_source) + records = services.spatial_aggregation_component( + dataset=dataset, + start=payload.start, + end=payload.end, + bbox=payload.bbox or bbox, + features=features, + method=payload.method, + feature_id_property=payload.feature_id_property, + ) + return SpatialAggregationRunResponse( + dataset_id=payload.dataset_id, + record_count=len(records), + preview=records[: payload.max_preview_rows], + ) + + +@router.post("/components/build-datavalue-set", response_model=BuildDataValueSetRunResponse) +def run_build_datavalueset(payload: BuildDataValueSetRunRequest) -> BuildDataValueSetRunResponse: + """Build and serialize a DHIS2 DataValueSet from records.""" + data_value_set, output_file = services.build_datavalueset_component( + dataset_id=payload.dataset_id, + period_type=payload.period_type, + records=payload.records, + dhis2=payload.dhis2, + ) + return BuildDataValueSetRunResponse( + value_count=len(data_value_set.get("dataValues", [])), + output_file=output_file, + data_value_set=data_value_set, + ) diff --git a/src/eo_api/components/schemas.py b/src/eo_api/components/schemas.py new file mode 100644 index 0000000..80c772e --- /dev/null +++ 
b/src/eo_api/components/schemas.py @@ -0,0 +1,121 @@ +"""Schemas for component discovery and execution endpoints.""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + +from ..workflows.schemas import ( + AggregationMethod, + Dhis2DataValueSetConfig, + FeatureSourceConfig, + PeriodType, +) + + +class ComponentDefinition(BaseModel): + """Component metadata for discovery.""" + + name: str + description: str + inputs: list[str] + outputs: list[str] + + +class ComponentCatalogResponse(BaseModel): + """List of discoverable components.""" + + components: list[ComponentDefinition] + + +class FeatureSourceRunRequest(BaseModel): + """Execute feature source component.""" + + feature_source: FeatureSourceConfig + include_features: bool = False + + +class FeatureSourceRunResponse(BaseModel): + """Feature source component result.""" + + bbox: list[float] + feature_count: int + features: dict[str, Any] | None = None + + +class DownloadDatasetRunRequest(BaseModel): + """Execute dataset download component.""" + + dataset_id: str + start: str + end: str + overwrite: bool = False + country_code: str | None = None + bbox: list[float] | None = None + + +class DownloadDatasetRunResponse(BaseModel): + """Download component result.""" + + status: str + dataset_id: str + start: str + end: str + + +class TemporalAggregationRunRequest(BaseModel): + """Execute temporal aggregation component from cached dataset.""" + + dataset_id: str + start: str + end: str + target_period_type: PeriodType + method: AggregationMethod = AggregationMethod.SUM + bbox: list[float] | None = None + + +class TemporalAggregationRunResponse(BaseModel): + """Temporal aggregation result summary.""" + + dataset_id: str + sizes: dict[str, int] + dims: list[str] + + +class SpatialAggregationRunRequest(BaseModel): + """Execute spatial aggregation component from cached dataset.""" + + dataset_id: str + start: str + end: str + feature_source: FeatureSourceConfig + 
method: AggregationMethod = AggregationMethod.MEAN + bbox: list[float] | None = None + feature_id_property: str = "id" + max_preview_rows: int = 20 + + +class SpatialAggregationRunResponse(BaseModel): + """Spatial aggregation result with sample rows.""" + + dataset_id: str + record_count: int + preview: list[dict[str, Any]] + + +class BuildDataValueSetRunRequest(BaseModel): + """Execute build_datavalueset component directly from records.""" + + dataset_id: str + period_type: PeriodType + records: list[dict[str, Any]] = Field(default_factory=list) + dhis2: Dhis2DataValueSetConfig + + +class BuildDataValueSetRunResponse(BaseModel): + """Build_datavalueset component output.""" + + value_count: int + output_file: str + data_value_set: dict[str, Any] diff --git a/src/eo_api/components/services.py b/src/eo_api/components/services.py new file mode 100644 index 0000000..2e99133 --- /dev/null +++ b/src/eo_api/components/services.py @@ -0,0 +1,141 @@ +"""Component service implementations and discovery metadata.""" + +from __future__ import annotations + +from typing import Any + +import xarray as xr +from fastapi import HTTPException + +from ..data_accessor.services.accessor import get_data +from ..data_manager.services import downloader +from ..data_registry.services.datasets import get_dataset +from ..workflows.schemas import ( + AggregationMethod, + Dhis2DataValueSetConfig, + FeatureSourceConfig, + PeriodType, +) +from ..workflows.services.datavalueset import build_data_value_set +from ..workflows.services.features import resolve_features +from ..workflows.services.preflight import check_upstream_connectivity +from ..workflows.services.spatial import aggregate_to_features +from ..workflows.services.temporal import aggregate_temporal +from .schemas import ComponentDefinition + + +def component_catalog() -> list[ComponentDefinition]: + """Return all discoverable component definitions.""" + return [ + ComponentDefinition( + name="feature_source", + description="Resolve 
feature source and compute bbox.", + inputs=["feature_source"], + outputs=["features", "bbox"], + ), + ComponentDefinition( + name="download_dataset", + description="Download dataset files for period and bbox.", + inputs=["dataset_id", "start", "end", "overwrite", "country_code", "bbox"], + outputs=["status"], + ), + ComponentDefinition( + name="temporal_aggregation", + description="Aggregate dataset over time dimension.", + inputs=["dataset_id", "start", "end", "target_period_type", "method", "bbox"], + outputs=["dataset"], + ), + ComponentDefinition( + name="spatial_aggregation", + description="Aggregate gridded dataset to features.", + inputs=["dataset_id", "start", "end", "feature_source", "method"], + outputs=["records"], + ), + ComponentDefinition( + name="build_datavalueset", + description="Build and serialize DHIS2 DataValueSet JSON.", + inputs=["dataset_id", "period_type", "records", "dhis2"], + outputs=["data_value_set", "output_file"], + ), + ] + + +def feature_source_component(config: FeatureSourceConfig) -> tuple[dict[str, Any], list[float]]: + """Run feature source component.""" + return resolve_features(config) + + +def download_dataset_component( + *, + dataset: dict[str, Any], + start: str, + end: str, + overwrite: bool, + country_code: str | None, + bbox: list[float], +) -> None: + """Run connectivity preflight and download dataset files.""" + check_upstream_connectivity(dataset) + downloader.download_dataset( + dataset=dataset, + start=start, + end=end, + overwrite=overwrite, + background_tasks=None, + country_code=country_code, + bbox=bbox, + ) + + +def temporal_aggregation_component( + *, + dataset: dict[str, Any], + start: str, + end: str, + bbox: list[float] | None, + target_period_type: PeriodType, + method: AggregationMethod, +) -> xr.Dataset: + """Load dataset and aggregate over time.""" + ds = get_data(dataset=dataset, start=start, end=end, bbox=bbox) + return aggregate_temporal(ds=ds, period_type=target_period_type, method=method) + + 
+def spatial_aggregation_component( + *, + dataset: dict[str, Any], + start: str, + end: str, + bbox: list[float] | None, + features: dict[str, Any], + method: AggregationMethod, + feature_id_property: str, +) -> list[dict[str, Any]]: + """Load dataset and aggregate spatially to provided features.""" + ds = get_data(dataset=dataset, start=start, end=end, bbox=bbox) + return aggregate_to_features( + ds=ds, + variable=dataset["variable"], + features=features, + method=method.value, + feature_id_property=feature_id_property, + ) + + +def build_datavalueset_component( + *, + dataset_id: str, + period_type: PeriodType, + records: list[dict[str, Any]], + dhis2: Dhis2DataValueSetConfig, +) -> tuple[dict[str, Any], str]: + """Build and serialize DHIS2 DataValueSet from records.""" + return build_data_value_set(records=records, dataset_id=dataset_id, period_type=period_type, config=dhis2) + + +def require_dataset(dataset_id: str) -> dict[str, Any]: + """Resolve dataset or raise 404.""" + dataset = get_dataset(dataset_id) + if dataset is None: + raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found") + return dataset diff --git a/src/eo_api/data_accessor/__init__.py b/src/eo_api/data_accessor/__init__.py index 0fbcaa5..ee5067d 100644 --- a/src/eo_api/data_accessor/__init__.py +++ b/src/eo_api/data_accessor/__init__.py @@ -1 +1,4 @@ -from . import routes, services \ No newline at end of file +from . import routes as routes +from . 
import services as services + +__all__ = ["routes", "services"] diff --git a/src/eo_api/data_accessor/routes.py b/src/eo_api/data_accessor/routes.py index 8e3a6f5..507b27d 100644 --- a/src/eo_api/data_accessor/routes.py +++ b/src/eo_api/data_accessor/routes.py @@ -1,45 +1,44 @@ """FastAPI router exposing dataset endpoints.""" -from typing import Any - -import xarray as xr -from fastapi import APIRouter, BackgroundTasks, HTTPException, Response +from fastapi import APIRouter from fastapi.responses import FileResponse from starlette.background import BackgroundTask -from .services.accessor import cleanup_file, get_data, xarray_to_temporary_netcdf from ..data_registry.routes import _get_dataset_or_404 +from .services.accessor import cleanup_file, get_data, xarray_to_temporary_netcdf router = APIRouter() + @router.get("/{dataset_id}") def get_file( dataset_id: str, start: str, end: str, - xmin: float = None, - ymin: float = None, - xmax: float = None, - ymax: float = None, - format: str = 'netcdf', + xmin: float | None = None, + ymin: float | None = None, + xmax: float | None = None, + ymax: float | None = None, + format: str = "netcdf", ) -> FileResponse: """Get a dataset filtered to a timeperiod and bbox as a downloadable raster file.""" dataset = _get_dataset_or_404(dataset_id) # get filtered data - if all([xmin, ymin, xmax, ymax]): + bbox: list[float] | None + if xmin is not None and ymin is not None and xmax is not None and ymax is not None: bbox = [xmin, ymin, xmax, ymax] else: bbox = None ds = get_data(dataset, start, end, bbox) # save to temporary file - if format.lower() == 'netcdf': + if format.lower() == "netcdf": # convert to netcdf file_path = xarray_to_temporary_netcdf(ds) else: - raise ValueError(f'Unsupported output format: {format}') + raise ValueError(f"Unsupported output format: {format}") # return as file return FileResponse( @@ -47,4 +46,4 @@ def get_file( media_type="application/x-netcdf", filename="eo-api-raster-download.nc", 
background=BackgroundTask(cleanup_file, file_path), - ) \ No newline at end of file + ) diff --git a/src/eo_api/data_accessor/services/__init__.py b/src/eo_api/data_accessor/services/__init__.py index 512ee18..209f276 100644 --- a/src/eo_api/data_accessor/services/__init__.py +++ b/src/eo_api/data_accessor/services/__init__.py @@ -1 +1,3 @@ -from . import accessor \ No newline at end of file +from . import accessor as accessor + +__all__ = ["accessor"] diff --git a/src/eo_api/data_accessor/services/accessor.py b/src/eo_api/data_accessor/services/accessor.py index d254a94..b9b78e9 100644 --- a/src/eo_api/data_accessor/services/accessor.py +++ b/src/eo_api/data_accessor/services/accessor.py @@ -1,25 +1,30 @@ """Loading raster data from downloaded files into xarray.""" -import os -import json import logging +import os import tempfile from typing import Any import xarray as xr from ...data_manager.services.downloader import get_cache_files, get_zarr_path -from ...data_manager.services.utils import get_time_dim, get_lon_lat_dims +from ...data_manager.services.utils import get_lon_lat_dims, get_time_dim from ...shared.time import numpy_datetime_to_period_string logger = logging.getLogger(__name__) -def get_data(dataset: dict[str, Any], start: str = None, end: str = None, bbox: list = None) -> xr.Dataset: + +def get_data( + dataset: dict[str, Any], + start: str | None = None, + end: str | None = None, + bbox: list[float] | None = None, +) -> xr.Dataset: """Load an xarray raster dataset for a given time range and bbox.""" logger.info("Opening dataset") zarr_path = get_zarr_path(dataset) if zarr_path: - logger.info(f'Using optimized zarr file: {zarr_path}') + logger.info(f"Using optimized zarr file: {zarr_path}") ds = xr.open_zarr(zarr_path, consolidated=True) else: logger.warning( @@ -40,8 +45,8 @@ def get_data(dataset: dict[str, Any], start: str = None, end: str = None, bbox: if bbox is not None: logger.info(f"Subsetting xy to {bbox}") - xmin,ymin,xmax,ymax = 
list(map(float, bbox)) - lon_dim,lat_dim = get_lon_lat_dims(ds) + xmin, ymin, xmax, ymax = list(map(float, bbox)) + lon_dim, lat_dim = get_lon_lat_dims(ds) # TODO: this assumes y axis increases towards north and is not very stable # ...and also does not consider partial pixels at the edges # ...should probably switch to rioxarray.clip instead @@ -49,9 +54,9 @@ def get_data(dataset: dict[str, Any], start: str = None, end: str = None, bbox: return ds # type: ignore[no-any-return] + def get_data_coverage(dataset: dict[str, Any]) -> dict[str, Any]: """Return temporal and spatial coverage metadata for downloaded data.""" - ds = get_data(dataset) if not ds: @@ -60,8 +65,8 @@ def get_data_coverage(dataset: dict[str, Any]) -> dict[str, Any]: time_dim = get_time_dim(ds) lon_dim, lat_dim = get_lon_lat_dims(ds) - start = numpy_datetime_to_period_string(ds[time_dim].min(), dataset['period_type']) # type: ignore[arg-type] - end = numpy_datetime_to_period_string(ds[time_dim].max(), dataset['period_type']) # type: ignore[arg-type] + start = numpy_datetime_to_period_string(ds[time_dim].min(), dataset["period_type"]) # type: ignore[arg-type] + end = numpy_datetime_to_period_string(ds[time_dim].max(), dataset["period_type"]) # type: ignore[arg-type] xmin, xmax = ds[lon_dim].min().item(), ds[lon_dim].max().item() ymin, ymax = ds[lat_dim].min().item(), ds[lat_dim].max().item() @@ -73,6 +78,7 @@ def get_data_coverage(dataset: dict[str, Any]) -> dict[str, Any]: } } + def xarray_to_temporary_netcdf(ds: xr.Dataset) -> str: """Write a dataset to a temporary NetCDF file and return the path.""" fd = tempfile.NamedTemporaryFile(suffix=".nc", delete=False) @@ -81,6 +87,7 @@ def xarray_to_temporary_netcdf(ds: xr.Dataset) -> str: ds.to_netcdf(path) return path + def cleanup_file(path: str) -> None: """Remove a file from disk.""" os.remove(path) diff --git a/src/eo_api/data_manager/__init__.py b/src/eo_api/data_manager/__init__.py index 0fbcaa5..ee5067d 100644 --- 
a/src/eo_api/data_manager/__init__.py +++ b/src/eo_api/data_manager/__init__.py @@ -1 +1,4 @@ -from . import routes, services \ No newline at end of file +from . import routes as routes +from . import services as services + +__all__ = ["routes", "services"] diff --git a/src/eo_api/data_manager/routes.py b/src/eo_api/data_manager/routes.py index c6c5679..f369dbe 100644 --- a/src/eo_api/data_manager/routes.py +++ b/src/eo_api/data_manager/routes.py @@ -1,14 +1,9 @@ """FastAPI router exposing dataset endpoints.""" -from typing import Any +from fastapi import APIRouter, BackgroundTasks -import xarray as xr -from fastapi import APIRouter, BackgroundTasks, HTTPException, Response -from fastapi.responses import FileResponse -from starlette.background import BackgroundTask - -from .services import constants, downloader from ..data_registry.routes import _get_dataset_or_404 +from .services import downloader router = APIRouter() @@ -17,9 +12,9 @@ def download_dataset( dataset_id: str, start: str, + background_tasks: BackgroundTasks, end: str | None = None, overwrite: bool = False, - background_tasks: BackgroundTasks = None, ) -> dict[str, str]: """Download dataset as local netcdf files direct from the source.""" dataset = _get_dataset_or_404(dataset_id) @@ -30,10 +25,9 @@ def download_dataset( @router.get("/{dataset_id}/build_zarr", response_model=dict) def build_dataset_zarr( dataset_id: str, - background_tasks: BackgroundTasks = None, + background_tasks: BackgroundTasks, ) -> dict[str, str]: """Optimize dataset downloads by collecting all files to a single zarr archive.""" dataset = _get_dataset_or_404(dataset_id) - if background_tasks is not None: - background_tasks.add_task(downloader.build_dataset_zarr, dataset) + background_tasks.add_task(downloader.build_dataset_zarr, dataset) return {"status": "Building zarr file from dataset downloads"} diff --git a/src/eo_api/data_manager/services/__init__.py b/src/eo_api/data_manager/services/__init__.py index 2ba6614..a20a096 
100644 --- a/src/eo_api/data_manager/services/__init__.py +++ b/src/eo_api/data_manager/services/__init__.py @@ -1 +1,5 @@ -from . import constants, downloader, utils \ No newline at end of file +from . import constants as constants +from . import downloader as downloader +from . import utils as utils + +__all__ = ["constants", "downloader", "utils"] diff --git a/src/eo_api/data_manager/services/downloader.py b/src/eo_api/data_manager/services/downloader.py index 93c8a27..c74c06c 100644 --- a/src/eo_api/data_manager/services/downloader.py +++ b/src/eo_api/data_manager/services/downloader.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) SCRIPT_DIR = Path(__file__).parent.resolve() -_download_dir = SCRIPT_DIR.parent.parent.parent.parent / 'data' / 'downloads' +_download_dir = SCRIPT_DIR.parent.parent.parent.parent / "data" / "downloads" if CACHE_OVERRIDE: _download_dir = Path(CACHE_OVERRIDE) DOWNLOAD_DIR = _download_dir @@ -29,6 +29,8 @@ def download_dataset( end: str | None, overwrite: bool, background_tasks: BackgroundTasks | None, + country_code: str | None = None, + bbox: list[float] | None = None, ) -> None: """Download dataset from source and store as local NetCDF cache files.""" cache_info = dataset["cache_info"] @@ -48,15 +50,22 @@ def download_dataset( sig = inspect.signature(eo_download_func) if "bbox" in sig.parameters: - params["bbox"] = BBOX + params["bbox"] = bbox or BBOX elif "country_code" in sig.parameters: - if COUNTRY_CODE: - params["country_code"] = COUNTRY_CODE + resolved_country_code = country_code or COUNTRY_CODE + if resolved_country_code: + params["country_code"] = resolved_country_code else: - raise Exception('Downloading WorldPop data requires COUNTRY_CODE environment variable') + raise Exception( + "Downloading WorldPop data requires country_code input (or COUNTRY_CODE environment variable)" + ) + + DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True) if background_tasks is not None: background_tasks.add_task(eo_download_func, 
**params) + else: + eo_download_func(**params) def build_dataset_zarr(dataset: dict[str, Any]) -> None: diff --git a/src/eo_api/data_manager/services/utils.py b/src/eo_api/data_manager/services/utils.py index aa797cc..7e3bc0c 100644 --- a/src/eo_api/data_manager/services/utils.py +++ b/src/eo_api/data_manager/services/utils.py @@ -2,9 +2,6 @@ from typing import Any -import numpy as np -import pandas as pd - def get_time_dim(ds: Any) -> str: """Return the name of the time dimension in a dataset or dataframe.""" diff --git a/src/eo_api/data_registry/__init__.py b/src/eo_api/data_registry/__init__.py index 0fbcaa5..ee5067d 100644 --- a/src/eo_api/data_registry/__init__.py +++ b/src/eo_api/data_registry/__init__.py @@ -1 +1,4 @@ -from . import routes, services \ No newline at end of file +from . import routes as routes +from . import services as services + +__all__ = ["routes", "services"] diff --git a/src/eo_api/data_registry/routes.py b/src/eo_api/data_registry/routes.py index ffa306d..36f4fce 100644 --- a/src/eo_api/data_registry/routes.py +++ b/src/eo_api/data_registry/routes.py @@ -2,15 +2,13 @@ from typing import Any -import xarray as xr -from fastapi import APIRouter, BackgroundTasks, HTTPException, Response -from fastapi.responses import FileResponse -from starlette.background import BackgroundTask +from fastapi import APIRouter, HTTPException from .services import datasets router = APIRouter() + @router.get("/") def list_datasets() -> list[dict[str, Any]]: """Return list of available datasets from registry.""" @@ -30,7 +28,8 @@ def get_dataset(dataset_id: str) -> dict[str, Any]: """Get a single dataset by ID.""" # Note: have to import inside function to avoid circular import from ..data_accessor.services.accessor import get_data_coverage + dataset = _get_dataset_or_404(dataset_id) coverage = get_data_coverage(dataset) dataset.update(coverage) - return dataset \ No newline at end of file + return dataset diff --git 
a/src/eo_api/data_registry/services/__init__.py b/src/eo_api/data_registry/services/__init__.py index 08014f6..9d0231a 100644 --- a/src/eo_api/data_registry/services/__init__.py +++ b/src/eo_api/data_registry/services/__init__.py @@ -1 +1,3 @@ -from . import datasets \ No newline at end of file +from . import datasets as datasets + +__all__ = ["datasets"] diff --git a/src/eo_api/data_registry/services/datasets.py b/src/eo_api/data_registry/services/datasets.py index 371e8fc..9bcc5de 100644 --- a/src/eo_api/data_registry/services/datasets.py +++ b/src/eo_api/data_registry/services/datasets.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) SCRIPT_DIR = Path(__file__).parent.resolve() -CONFIGS_DIR = SCRIPT_DIR.parent.parent.parent.parent / 'data' / 'datasets' +CONFIGS_DIR = SCRIPT_DIR.parent.parent.parent.parent / "data" / "datasets" def list_datasets() -> list[dict[str, Any]]: diff --git a/src/eo_api/main.py b/src/eo_api/main.py index e12ab58..0ab8abb 100644 --- a/src/eo_api/main.py +++ b/src/eo_api/main.py @@ -4,7 +4,7 @@ from fastapi.middleware.cors import CORSMiddleware import eo_api.startup # noqa: F401 # pyright: ignore[reportUnusedImport] -from eo_api import data_accessor, data_manager, data_registry, system +from eo_api import components, data_accessor, data_manager, data_registry, system, workflows app = FastAPI() @@ -16,7 +16,9 @@ allow_headers=["*"], ) -app.include_router(system.routes.router, tags=['System']) -app.include_router(data_registry.routes.router, prefix='/registry', tags=['Data registry']) -app.include_router(data_manager.routes.router, prefix='/manage', tags=['Data manager']) -app.include_router(data_accessor.routes.router, prefix='/retrieve', tags=['Data retrieval']) +app.include_router(system.routes.router, tags=["System"]) +app.include_router(data_registry.routes.router, prefix="/registry", tags=["Data registry"]) +app.include_router(data_manager.routes.router, prefix="/manage", tags=["Data manager"]) 
+app.include_router(data_accessor.routes.router, prefix="/retrieve", tags=["Data retrieval"]) +app.include_router(workflows.routes.router, prefix="/workflows", tags=["Workflows"]) +app.include_router(components.routes.router, tags=["Components"]) diff --git a/src/eo_api/shared/dhis2_adapter.py b/src/eo_api/shared/dhis2_adapter.py index 0e9e5b1..ea54348 100644 --- a/src/eo_api/shared/dhis2_adapter.py +++ b/src/eo_api/shared/dhis2_adapter.py @@ -86,4 +86,4 @@ def get_org_unit_geojson(client: DHIS2Client, uid: str) -> dict[str, Any]: def get_org_unit_subtree_geojson(client: DHIS2Client, uid: str) -> dict[str, Any]: """Fetch a subtree of organisation units as GeoJSON.""" - return cast(dict[str, Any], client.get_org_unit_subtree_geojson(uid)) \ No newline at end of file + return cast(dict[str, Any], client.get_org_unit_subtree_geojson(uid)) diff --git a/src/eo_api/shared/time.py b/src/eo_api/shared/time.py index 269740f..5690a11 100644 --- a/src/eo_api/shared/time.py +++ b/src/eo_api/shared/time.py @@ -1,7 +1,10 @@ +"""Shared time conversion helpers.""" + from typing import Any import numpy as np + def numpy_datetime_to_period_string(datetimes: np.ndarray[Any, Any], period_type: str) -> np.ndarray[Any, Any]: """Convert an array of numpy datetimes to truncated period strings.""" # TODO: this and numpy_period_string should be merged @@ -9,4 +12,4 @@ def numpy_datetime_to_period_string(datetimes: np.ndarray[Any, Any], period_type # Map periods to string lengths: YYYY-MM-DDTHH (13), YYYY-MM-DD (10), etc. lengths = {"hourly": 13, "daily": 10, "monthly": 7, "yearly": 4} - return s.astype(f"U{lengths[period_type]}") \ No newline at end of file + return s.astype(f"U{lengths[period_type]}") diff --git a/src/eo_api/startup.py b/src/eo_api/startup.py index 1d1ba51..5c33ffa 100644 --- a/src/eo_api/startup.py +++ b/src/eo_api/startup.py @@ -1,10 +1,11 @@ -"""Early-boot side effects +"""Early-boot side effects. 
This module is imported before any other eo_api modules so that environment variables and logging are configured before other imports. """ import logging + from dotenv import load_dotenv # noqa: E402 # -- Load .env (must happen before pygeoapi reads PYGEOAPI_CONFIG) ------------ diff --git a/src/eo_api/system/__init__.py b/src/eo_api/system/__init__.py index 00e7846..865a013 100644 --- a/src/eo_api/system/__init__.py +++ b/src/eo_api/system/__init__.py @@ -1 +1,4 @@ -from . import routes, schemas \ No newline at end of file +from . import routes as routes +from . import schemas as schemas + +__all__ = ["routes", "schemas"] diff --git a/src/eo_api/system/routes.py b/src/eo_api/system/routes.py index 2639ea3..1c41b1f 100644 --- a/src/eo_api/system/routes.py +++ b/src/eo_api/system/routes.py @@ -4,7 +4,6 @@ from importlib.metadata import version from fastapi import APIRouter, Request -from fastapi.responses import RedirectResponse from .schemas import AppInfo, HealthStatus, Link, RootResponse, Status diff --git a/src/eo_api/workflows/__init__.py b/src/eo_api/workflows/__init__.py new file mode 100644 index 0000000..23f4ad6 --- /dev/null +++ b/src/eo_api/workflows/__init__.py @@ -0,0 +1,6 @@ +"""Workflow APIs for generic gridded-data to DHIS2 pipelines.""" + +from . import routes as routes +from . 
@router.get("", response_model=WorkflowCatalogResponse)
def list_workflows() -> WorkflowCatalogResponse:
    """Return a catalog summary for every discovered workflow definition.

    Raises:
        HTTPException: status 500 when loading the definitions raises
            ``ValueError``; the error text is passed through as the detail.
    """
    try:
        loaded = list_workflow_definitions()
    except ValueError as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    catalog: list[WorkflowCatalogItem] = []
    for item in loaded:
        component_names = [step.component for step in item.steps]
        catalog.append(
            WorkflowCatalogItem(
                workflow_id=item.workflow_id,
                version=item.version,
                step_count=len(item.steps),
                components=component_names,
            )
        )
    return WorkflowCatalogResponse(workflows=catalog)


@router.post("/dhis2-datavalue-set", response_model=WorkflowExecuteResponse)
def run_dhis2_datavalue_set_workflow(payload: WorkflowRequest) -> WorkflowExecuteResponse:
    """Normalize the flat public payload and execute the selected workflow."""
    # The mapper also returns advisory warnings; they are not surfaced here.
    internal_request, _unused_warnings = normalize_simple_request(payload)
    run_options = {
        "workflow_id": payload.workflow_id,
        "include_component_run_details": payload.include_component_run_details,
    }
    return execute_workflow(internal_request, **run_options)
class FeatureSourceConfig(BaseModel):
    """Describe where the features used for spatial aggregation come from.

    Exactly one backend is selected through ``source_type``; the validator
    below checks that the field this backend needs has been supplied.
    """

    source_type: FeatureSourceType
    geojson_path: str | None = None
    dhis2_level: int | None = None
    dhis2_ids: list[str] | None = None
    dhis2_parent: str | None = None
    feature_id_property: str = "id"

    @model_validator(mode="after")
    def validate_by_source(self) -> "FeatureSourceConfig":
        """Reject configs that lack the field required by their backend."""
        backend = self.source_type
        if backend == FeatureSourceType.GEOJSON_FILE:
            if not self.geojson_path:
                raise ValueError("geojson_path is required when source_type='geojson_file'")
        elif backend == FeatureSourceType.DHIS2_LEVEL:
            # Only None is rejected here; any integer level is accepted.
            if self.dhis2_level is None:
                raise ValueError("dhis2_level is required when source_type='dhis2_level'")
        elif backend == FeatureSourceType.DHIS2_IDS:
            if not self.dhis2_ids:
                raise ValueError("dhis2_ids is required when source_type='dhis2_ids'")
        return self
class WorkflowRequest(BaseModel):
    """Flat request payload accepted by the public workflow endpoint."""

    workflow_id: str = "dhis2_datavalue_set_v1"
    dataset_id: str
    # Time window: either an ISO date pair or a year pair must be given.
    start_date: str | None = None
    end_date: str | None = None
    start_year: int | None = None
    end_year: int | None = None
    # Feature selection: either a DHIS2 level or explicit org unit ids.
    org_unit_level: int | None = None
    org_unit_ids: list[str] | None = None
    data_element: str
    temporal_resolution: PeriodType = PeriodType.MONTHLY
    temporal_reducer: AggregationMethod = AggregationMethod.SUM
    spatial_reducer: AggregationMethod = AggregationMethod.MEAN
    overwrite: bool = False
    dry_run: bool = True
    feature_id_property: str = "id"
    stage: str | None = None
    flavor: str | None = None
    country_code: str | None = None
    output_format: str | None = None
    reducer: str | None = None
    include_component_run_details: bool = False

    @model_validator(mode="after")
    def validate_time_window(self) -> "WorkflowRequest":
        """Require a complete time window and a way to select org units."""
        date_range_given = bool(self.start_date) and bool(self.end_date)
        year_range_given = not (self.start_year is None or self.end_year is None)
        if not (date_range_given or year_range_given):
            raise ValueError("Provide either start_date/end_date or start_year/end_year")

        level_missing = self.org_unit_level is None
        if level_missing and not self.org_unit_ids:
            raise ValueError("Provide org_unit_level or org_unit_ids")
        return self
def _write_data_value_set(payload: dict[str, Any], dataset_id: str) -> str:
    """Write the payload as indented JSON under DOWNLOAD_DIR and return its path."""
    DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
    # UTC timestamp with one-second resolution forms part of the file name.
    stamp = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    target = DOWNLOAD_DIR / f"{dataset_id}_datavalueset_{stamp}.json"
    serialized = json.dumps(payload, indent=2)
    target.write_text(serialized, encoding="utf-8")
    return str(target)


def _format_period(time_value: Any, period_type: PeriodType) -> str:
    """Render a timestamp as a compact period string for the given period type.

    Yearly gives ``YYYY``, monthly gives ``YYYYMM``; daily and any other
    period type (e.g. hourly) give ``YYYYMMDD``.
    """
    iso_day = np.datetime_as_string(np.datetime64(time_value), unit="D")
    # Unpacking requires exactly three dash-separated parts.
    year, month, day = iso_day.split("-")
    if period_type == PeriodType.YEARLY:
        return year
    if period_type == PeriodType.MONTHLY:
        return f"{year}{month}"
    # Daily and the generic fallback share the same day-level rendering.
    return f"{year}{month}{day}"
"build_datavalueset", +] + +SUPPORTED_COMPONENTS: Final[set[str]] = set(ComponentName.__args__) # type: ignore[attr-defined] + +COMPONENT_INPUTS: Final[dict[str, set[str]]] = { + "feature_source": set(), + "download_dataset": {"bbox"}, + "temporal_aggregation": {"bbox"}, + "spatial_aggregation": {"bbox", "features"}, + "build_datavalueset": {"records"}, +} + +COMPONENT_OUTPUTS: Final[dict[str, set[str]]] = { + "feature_source": {"features", "bbox"}, + "download_dataset": set(), + "temporal_aggregation": {"temporal_dataset"}, + "spatial_aggregation": {"records"}, + "build_datavalueset": {"data_value_set", "output_file"}, +} + +SCRIPT_DIR = Path(__file__).parent.resolve() +WORKFLOWS_DIR = SCRIPT_DIR.parent.parent.parent.parent / "data" / "workflows" +DEFAULT_WORKFLOW_ID = "dhis2_datavalue_set_v1" + + +class WorkflowStep(BaseModel): + """One component step in a declarative workflow definition.""" + + component: ComponentName + + +class WorkflowDefinition(BaseModel): + """Declarative workflow definition.""" + + workflow_id: str + version: int = 1 + steps: list[WorkflowStep] + + @model_validator(mode="after") + def validate_steps(self) -> "WorkflowDefinition": + """Require terminal DataValueSet step and validate component compatibility.""" + if not self.steps: + raise ValueError("Workflow steps cannot be empty") + if self.steps[-1].component != "build_datavalueset": + raise ValueError("The last workflow step must be 'build_datavalueset'") + available_context: set[str] = set() + for step in self.steps: + required_inputs = COMPONENT_INPUTS[step.component] + missing_inputs = required_inputs - available_context + if missing_inputs: + missing = ", ".join(sorted(missing_inputs)) + raise ValueError(f"Component '{step.component}' is missing required upstream outputs: {missing}") + available_context.update(COMPONENT_OUTPUTS[step.component]) + return self + + +def load_workflow_definition( + workflow_id: str = DEFAULT_WORKFLOW_ID, + *, + path: Path | None = None, +) -> 
def list_workflow_definitions() -> list[WorkflowDefinition]:
    """Load and return all discovered workflow definitions, sorted by workflow id.

    Raises:
        ValueError: propagated from discovery or from loading a definition.
    """
    workflow_files = _discover_workflow_files()
    # Hand the already-discovered path to the loader. Loading by id alone
    # re-runs discovery (parsing every YAML file) once per workflow.
    # Discovery keyed each file by the workflow_id it declares, so the
    # id-mismatch check that is skipped when a path is given cannot fail here.
    return [
        load_workflow_definition(workflow_id, path=workflow_files[workflow_id])
        for workflow_id in sorted(workflow_files)
    ]
ValueError(f"Missing/invalid workflow_id in: {workflow_file}") + + existing = discovered.get(workflow_id) + if existing is not None: + raise ValueError(f"Duplicate workflow_id '{workflow_id}' in files: {existing.name}, {workflow_file.name}") + discovered[workflow_id] = workflow_file + + return discovered diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py new file mode 100644 index 0000000..9e7da9e --- /dev/null +++ b/src/eo_api/workflows/services/engine.py @@ -0,0 +1,207 @@ +"""Workflow orchestration engine for gridded-data pipelines.""" + +from __future__ import annotations + +from typing import Any + +from fastapi import HTTPException + +from ...components import services as component_services +from ...data_registry.services.datasets import get_dataset +from ..schemas import WorkflowExecuteRequest, WorkflowExecuteResponse +from .definitions import WorkflowDefinition, load_workflow_definition +from .run_logs import persist_run_log +from .runtime import WorkflowRuntime + + +def execute_workflow( + request: WorkflowExecuteRequest, + *, + workflow_id: str = "dhis2_datavalue_set_v1", + include_component_run_details: bool = False, +) -> WorkflowExecuteResponse: + """Execute the feature->download->aggregate->DataValueSet workflow.""" + runtime = WorkflowRuntime() + + dataset = get_dataset(request.dataset_id) + if dataset is None: + raise HTTPException(status_code=404, detail=f"Dataset '{request.dataset_id}' not found") + + context: dict[str, Any] = {} + + try: + try: + workflow = load_workflow_definition(workflow_id) + except ValueError as exc: + raise HTTPException(status_code=422, detail=str(exc)) from exc + _execute_workflow_steps( + workflow=workflow, + runtime=runtime, + request=request, + dataset=dataset, + context=context, + ) + features = _require_context(context, "features") + bbox = _require_context(context, "bbox") + data_value_set = _require_context(context, "data_value_set") + output_file = 
_require_context(context, "output_file") + run_log_file = persist_run_log( + run_id=runtime.run_id, + request=request, + component_runs=runtime.component_runs, + status="completed", + output_file=output_file, + ) + + return WorkflowExecuteResponse( + status="completed", + run_id=runtime.run_id, + workflow_id=workflow.workflow_id, + workflow_version=workflow.version, + dataset_id=request.dataset_id, + bbox=bbox, + feature_count=len(features["features"]), + value_count=len(data_value_set["dataValues"]), + output_file=output_file, + run_log_file=run_log_file, + data_value_set=data_value_set, + component_runs=runtime.component_runs if include_component_run_details else [], + component_run_details_included=include_component_run_details, + component_run_details_available=True, + ) + except HTTPException: + persist_run_log( + run_id=runtime.run_id, + request=request, + component_runs=runtime.component_runs, + status="failed", + error="http_exception", + ) + raise + except Exception as exc: + persist_run_log( + run_id=runtime.run_id, + request=request, + component_runs=runtime.component_runs, + status="failed", + error=str(exc), + ) + last_component = runtime.component_runs[-1].component if runtime.component_runs else "unknown" + if _is_upstream_connectivity_error(exc): + raise HTTPException( + status_code=503, + detail={ + "error": "upstream_unreachable", + "message": "Could not reach upstream data source. 
Check network/proxy and retry.", + "failed_component": last_component, + "run_id": runtime.run_id, + }, + ) from exc + raise HTTPException( + status_code=500, + detail={ + "error": "workflow_execution_failed", + "message": str(exc), + "failed_component": last_component, + "run_id": runtime.run_id, + }, + ) from exc + + +def _is_upstream_connectivity_error(exc: Exception) -> bool: + message = str(exc).lower() + patterns = ( + "could not connect to server", + "failed to connect", + "connection refused", + "name or service not known", + "temporary failure in name resolution", + "timed out", + "curl error", + ) + return any(pattern in message for pattern in patterns) + + +def _execute_workflow_steps( + *, + workflow: WorkflowDefinition, + runtime: WorkflowRuntime, + request: WorkflowExecuteRequest, + dataset: dict[str, Any], + context: dict[str, Any], +) -> None: + """Execute workflow components using declarative YAML step order.""" + for step in workflow.steps: + if step.component == "feature_source": + features, bbox = runtime.run( + "feature_source", + component_services.feature_source_component, + config=request.feature_source, + ) + context["features"] = features + context["bbox"] = bbox + continue + + if step.component == "download_dataset": + runtime.run( + "download_dataset", + component_services.download_dataset_component, + dataset=dataset, + start=request.start, + end=request.end, + overwrite=request.overwrite, + country_code=request.country_code, + bbox=_require_context(context, "bbox"), + ) + continue + + if step.component == "temporal_aggregation": + temporal_ds = runtime.run( + "temporal_aggregation", + component_services.temporal_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + bbox=_require_context(context, "bbox"), + target_period_type=request.temporal_aggregation.target_period_type, + method=request.temporal_aggregation.method, + ) + context["temporal_dataset"] = temporal_ds + continue + + if step.component == 
"spatial_aggregation": + records = runtime.run( + "spatial_aggregation", + component_services.spatial_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + bbox=_require_context(context, "bbox"), + features=_require_context(context, "features"), + method=request.spatial_aggregation.method, + feature_id_property=request.dhis2.org_unit_property, + ) + context["records"] = records + continue + + if step.component == "build_datavalueset": + data_value_set, output_file = runtime.run( + "build_datavalueset", + component_services.build_datavalueset_component, + records=_require_context(context, "records"), + dataset_id=request.dataset_id, + period_type=request.temporal_aggregation.target_period_type, + dhis2=request.dhis2, + ) + context["data_value_set"] = data_value_set + context["output_file"] = output_file + continue + + raise RuntimeError(f"Unsupported workflow component '{step.component}'") + + +def _require_context(context: dict[str, Any], key: str) -> Any: + """Return required context value or raise a clear orchestration error.""" + if key not in context: + raise RuntimeError(f"Workflow definition missing prerequisite for '{key}'") + return context[key] diff --git a/src/eo_api/workflows/services/features.py b/src/eo_api/workflows/services/features.py new file mode 100644 index 0000000..95d8cc4 --- /dev/null +++ b/src/eo_api/workflows/services/features.py @@ -0,0 +1,69 @@ +"""Feature source component for workflow execution.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import geopandas as gpd + +from ...shared.dhis2_adapter import create_client, get_org_unit_geojson, get_org_units_geojson +from ..schemas import FeatureSourceConfig, FeatureSourceType + + +def resolve_features(config: FeatureSourceConfig) -> tuple[dict[str, Any], list[float]]: + """Resolve features from a source and return FeatureCollection + bbox.""" + if config.source_type == 
FeatureSourceType.GEOJSON_FILE: + collection = _read_geojson_file(config.geojson_path or "") + elif config.source_type == FeatureSourceType.DHIS2_LEVEL: + client = create_client() + collection = get_org_units_geojson(client, level=config.dhis2_level, parent=config.dhis2_parent) + else: + client = create_client() + collection = _collection_from_dhis2_ids(client, config.dhis2_ids or []) + + collection = _normalize_feature_collection(collection) + bbox = _bbox_from_feature_collection(collection) + return collection, bbox + + +def feature_id(feature: dict[str, Any], key: str) -> str: + """Get feature identifier from properties, feature id, or UID fallbacks.""" + properties = feature.get("properties", {}) + value = properties.get(key) or feature.get("id") or properties.get("id") or properties.get("uid") + if value is None: + raise ValueError(f"Unable to find feature identifier using key '{key}'") + return str(value) + + +def _read_geojson_file(path: str) -> dict[str, Any]: + raw = json.loads(Path(path).read_text(encoding="utf-8")) + return _normalize_feature_collection(raw) + + +def _collection_from_dhis2_ids(client: Any, ou_ids: list[str]) -> dict[str, Any]: + features: list[dict[str, Any]] = [] + for uid in ou_ids: + unit_geojson = get_org_unit_geojson(client, uid) + normalized = _normalize_feature_collection(unit_geojson) + features.extend(normalized["features"]) + return {"type": "FeatureCollection", "features": features} + + +def _normalize_feature_collection(raw: dict[str, Any]) -> dict[str, Any]: + raw_type = raw.get("type") + if raw_type == "FeatureCollection": + return raw + if raw_type == "Feature": + return {"type": "FeatureCollection", "features": [raw]} + if "features" in raw and isinstance(raw["features"], list): + return {"type": "FeatureCollection", "features": raw["features"]} + raise ValueError("Input is not a valid GeoJSON feature or feature collection") + + +def _bbox_from_feature_collection(collection: dict[str, Any]) -> list[float]: + if not 
collection.get("features"): + raise ValueError("Feature collection is empty") + bounds = gpd.read_file(json.dumps(collection)).total_bounds + return [float(v) for v in bounds] diff --git a/src/eo_api/workflows/services/preflight.py b/src/eo_api/workflows/services/preflight.py new file mode 100644 index 0000000..9ff6c3f --- /dev/null +++ b/src/eo_api/workflows/services/preflight.py @@ -0,0 +1,23 @@ +"""Preflight checks for external data source connectivity.""" + +from __future__ import annotations + +import socket +from urllib.parse import urlparse + + +def check_upstream_connectivity(dataset: dict[str, object], timeout_seconds: float = 5.0) -> None: + """Fail fast if a dataset source host is not reachable.""" + source_url = dataset.get("source_url") + if not isinstance(source_url, str) or not source_url: + return + + parsed = urlparse(source_url) + hostname = parsed.hostname + if not hostname: + return + port = parsed.port or (443 if parsed.scheme == "https" else 80) + + # Fail quickly on DNS/TCP connectivity issues instead of waiting for long GDAL timeouts. 
+ with socket.create_connection((hostname, port), timeout=timeout_seconds): + pass diff --git a/src/eo_api/workflows/services/run_logs.py b/src/eo_api/workflows/services/run_logs.py new file mode 100644 index 0000000..ea4d375 --- /dev/null +++ b/src/eo_api/workflows/services/run_logs.py @@ -0,0 +1,37 @@ +"""Run-log persistence for workflow executions.""" + +from __future__ import annotations + +import datetime as dt +import json +from typing import Any + +from ...data_manager.services.downloader import DOWNLOAD_DIR +from ..schemas import ComponentRun, WorkflowExecuteRequest + + +def persist_run_log( + *, + run_id: str, + request: WorkflowExecuteRequest, + component_runs: list[ComponentRun], + status: str, + output_file: str | None = None, + error: str | None = None, +) -> str: + """Write workflow run metadata to disk and return file path.""" + logs_dir = DOWNLOAD_DIR / "workflow_runs" + logs_dir.mkdir(parents=True, exist_ok=True) + timestamp = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ") + path = logs_dir / f"{timestamp}_{run_id}.json" + + payload: dict[str, Any] = { + "run_id": run_id, + "status": status, + "request": request.model_dump(mode="json"), + "component_runs": [run.model_dump(mode="json") for run in component_runs], + "output_file": output_file, + "error": error, + } + path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + return str(path) diff --git a/src/eo_api/workflows/services/runtime.py b/src/eo_api/workflows/services/runtime.py new file mode 100644 index 0000000..905ba54 --- /dev/null +++ b/src/eo_api/workflows/services/runtime.py @@ -0,0 +1,89 @@ +"""Component runtime wrapper for workflow housekeeping metadata.""" + +from __future__ import annotations + +import datetime as dt +import time +import uuid +from collections.abc import Callable +from typing import Any + +from ..schemas import ComponentRun + + +class WorkflowRuntime: + """Capture execution metadata for component orchestration.""" + + def __init__(self) -> 
None: + self.run_id = str(uuid.uuid4()) + self.component_runs: list[ComponentRun] = [] + + def run(self, component: str, fn: Callable[..., Any], **kwargs: Any) -> Any: + """Execute one component and record start/end/input/output metadata.""" + started = dt.datetime.now(dt.timezone.utc) + started_perf = time.perf_counter() + + try: + result = fn(**kwargs) + ended = dt.datetime.now(dt.timezone.utc) + self.component_runs.append( + ComponentRun( + component=component, + status="completed", + started_at=started.isoformat(), + ended_at=ended.isoformat(), + duration_ms=int((time.perf_counter() - started_perf) * 1000), + inputs=_to_json_summary(kwargs), + outputs={"result": _to_json_summary(result)}, + ) + ) + return result + except Exception as exc: + ended = dt.datetime.now(dt.timezone.utc) + self.component_runs.append( + ComponentRun( + component=component, + status="failed", + started_at=started.isoformat(), + ended_at=ended.isoformat(), + duration_ms=int((time.perf_counter() - started_perf) * 1000), + inputs=_to_json_summary(kwargs), + outputs=None, + error=str(exc), + ) + ) + raise + + +def _to_json_summary(value: Any, *, depth: int = 0, max_depth: int = 2) -> Any: + """Convert arbitrary values into a compact JSON-safe summary.""" + if depth >= max_depth: + return _fallback_summary(value) + + if value is None or isinstance(value, (str, int, float, bool)): + return value + + if isinstance(value, list): + return [_to_json_summary(v, depth=depth + 1, max_depth=max_depth) for v in value[:20]] + + if isinstance(value, tuple): + return [_to_json_summary(v, depth=depth + 1, max_depth=max_depth) for v in value[:20]] + + if isinstance(value, dict): + out: dict[str, Any] = {} + for i, (k, v) in enumerate(value.items()): + if i >= 30: + out["..."] = "truncated" + break + out[str(k)] = _to_json_summary(v, depth=depth + 1, max_depth=max_depth) + return out + + return _fallback_summary(value) + + +def _fallback_summary(value: Any) -> str: + if hasattr(value, "shape"): + return 
def normalize_simple_request(payload: WorkflowRequest) -> tuple[WorkflowExecuteRequest, list[str]]:
    """Translate the flat public payload into the internal workflow request.

    Returns:
        The normalized request, plus one advisory warning per field listed in
        ``_IGNORED_FIELDS``.

    Raises:
        HTTPException: 422 when the time window or org unit selection is
            missing, or when ``reducer`` is not a supported aggregation method.
    """
    dataset_id = payload.dataset_id
    dataset = get_dataset(dataset_id)

    # An unknown dataset yields an empty period type, so dates pass through untouched.
    period_type = str(dataset.get("period_type", "")).lower() if dataset else ""

    if payload.start_date and payload.end_date:
        if period_type == "yearly":
            start = payload.start_date[:4]
            end = payload.end_date[:4]
        elif period_type in {"hourly", "daily", "monthly"}:
            # dhis2eo downloaders expect month windows for these dataset types.
            start = payload.start_date[:7]
            end = payload.end_date[:7]
        else:
            start = payload.start_date
            end = payload.end_date
    elif payload.start_year is not None and payload.end_year is not None:
        if period_type == "yearly":
            start = str(payload.start_year)
            end = str(payload.end_year)
        else:
            # NOTE(review): the date branch above truncates to YYYY-MM for
            # hourly/daily/monthly datasets, but this branch emits full dates;
            # confirm the downloaders accept both forms.
            start = f"{payload.start_year}-01-01"
            end = f"{payload.end_year}-12-31"
    else:
        raise HTTPException(status_code=422, detail="Provide either start_date/end_date or start_year/end_year")

    if payload.org_unit_level is not None:
        feature_source = FeatureSourceConfig(
            source_type=FeatureSourceType.DHIS2_LEVEL,
            dhis2_level=payload.org_unit_level,
            feature_id_property=payload.feature_id_property,
        )
    elif payload.org_unit_ids:
        feature_source = FeatureSourceConfig(
            source_type=FeatureSourceType.DHIS2_IDS,
            dhis2_ids=payload.org_unit_ids,
            feature_id_property=payload.feature_id_property,
        )
    else:
        raise HTTPException(status_code=422, detail="Provide org_unit_level or org_unit_ids")

    # `reducer` is a free-form alias that overrides both reducers. An unknown
    # value is a client error, so report it as 422 like the other input checks
    # rather than letting the enum's ValueError surface as a 500.
    reducer_alias: AggregationMethod | None = None
    if payload.reducer:
        try:
            reducer_alias = AggregationMethod(payload.reducer.lower())
        except ValueError as exc:
            allowed = ", ".join(method.value for method in AggregationMethod)
            raise HTTPException(
                status_code=422,
                detail=f"Unsupported reducer '{payload.reducer}'. Allowed values: {allowed}",
            ) from exc
    spatial_method = reducer_alias or payload.spatial_reducer
    temporal_method = reducer_alias or payload.temporal_reducer

    normalized = WorkflowExecuteRequest(
        dataset_id=dataset_id,
        start=start,
        end=end,
        overwrite=payload.overwrite,
        country_code=payload.country_code,
        feature_source=feature_source,
        temporal_aggregation=TemporalAggregationConfig(
            target_period_type=payload.temporal_resolution,
            method=temporal_method,
        ),
        spatial_aggregation=SpatialAggregationConfig(method=spatial_method),
        dhis2=Dhis2DataValueSetConfig(data_element_uid=payload.data_element),
    )

    # Warnings are emitted for every ignored field, whether or not it was supplied.
    warnings = [f"Input field '{field}' is currently accepted but not used in execution" for field in _IGNORED_FIELDS]
    return normalized, warnings
def aggregate_to_features(
    ds: xr.Dataset,
    *,
    variable: str,
    features: dict[str, Any],
    method: str,
    feature_id_property: str,
) -> list[dict[str, Any]]:
    """Aggregate one gridded variable into per-feature time series.

    For each feature, grid points inside its geometry are selected and reduced
    over the spatial dimensions with the DataArray method named by ``method``.

    Returns:
        One ``{"org_unit", "time", "value"}`` record per feature and time step.
        Features without geometry, features containing no grid point, and NaN
        results are skipped.
    """
    da = ds[variable]
    time_dim = get_time_dim(da)
    lon_dim, lat_dim = get_lon_lat_dims(da)
    # Default meshgrid indexing gives (n_lat, n_lon) arrays, matching the
    # (lat_dim, lon_dim) dims used for the mask below.
    lon_grid, lat_grid = np.meshgrid(da[lon_dim].values, da[lat_dim].values)

    output: list[dict[str, Any]] = []
    for feature in features.get("features", []):
        geometry = feature.get("geometry")
        if not geometry:
            # GeoJSON permits "geometry": null; there is nothing to mask
            # against, so skip the feature instead of crashing the whole run.
            continue
        mask = contains_xy(shape(geometry), lon_grid, lat_grid)
        if not np.any(mask):
            continue

        mask_da = xr.DataArray(
            mask,
            dims=(lat_dim, lon_dim),
            coords={lat_dim: da[lat_dim], lon_dim: da[lon_dim]},
        )
        reduced = getattr(da.where(mask_da), method)(dim=[lat_dim, lon_dim], skipna=True)
        org_unit = feature_id(feature, feature_id_property)
        for t, value in zip(reduced[time_dim].values, reduced.values, strict=True):
            if np.isnan(value):
                continue
            output.append(
                {
                    "org_unit": org_unit,
                    "time": t,
                    "value": float(value),
                }
            )
    return output
# Resample frequency alias used for each supported target period.
_PERIOD_TO_FREQ: dict[PeriodType, str] = {
    PeriodType.HOURLY: "1h",
    PeriodType.DAILY: "1D",
    PeriodType.MONTHLY: "MS",
    PeriodType.YEARLY: "YS",
}


def aggregate_temporal(ds: xr.Dataset, *, period_type: PeriodType, method: AggregationMethod) -> xr.Dataset:
    """Resample a dataset over the time dimension to the target period.

    Args:
        ds: Dataset to aggregate; its time dimension is resolved with
            ``get_time_dim``.
        period_type: Target period; must be a key of ``_PERIOD_TO_FREQ``.
        method: Reducer whose ``value`` names the resample reduction to call.

    Returns:
        The resampled dataset, with attributes kept.

    Raises:
        KeyError: If ``period_type`` has no entry in ``_PERIOD_TO_FREQ``.
    """
    time_dim = get_time_dim(ds)
    freq = _PERIOD_TO_FREQ[period_type]
    reducer = getattr(ds.resample({time_dim: freq}), method.value)
    if method.value in ("sum", "prod"):
        # Keep bins without any valid sample as NaN instead of letting the
        # reducer return its identity (0 or 1) as if it were real data.
        return cast(xr.Dataset, reducer(keep_attrs=True, min_count=1))
    return cast(xr.Dataset, reducer(keep_attrs=True))
"mean", + "dry_run": True, + "include_component_run_details": False, + } + + +def test_workflow_endpoint_exists_once() -> None: + workflow_routes = { + route.path + for route in app.routes + if isinstance(route, APIRoute) and route.path.startswith("/workflows") and "POST" in route.methods + } + assert workflow_routes == {"/workflows/dhis2-datavalue-set"} + + +def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClient) -> None: + response = client.get("/workflows") + assert response.status_code == 200 + body = response.json() + assert "workflows" in body + assert len(body["workflows"]) >= 2 + by_id = {item["workflow_id"]: item for item in body["workflows"]} + + default = by_id["dhis2_datavalue_set_v1"] + assert default["version"] == 1 + assert default["step_count"] == 5 + assert default["components"] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + + fast = by_id["dhis2_datavalue_set_without_temporal_aggregation_v1"] + assert fast["version"] == 1 + assert fast["step_count"] == 4 + assert fast["components"] == [ + "feature_source", + "download_dataset", + "spatial_aggregation", + "build_datavalueset", + ] + + +def test_components_catalog_endpoint_returns_five_components(client: TestClient) -> None: + response = client.get("/components") + assert response.status_code == 200 + items = response.json()["components"] + names = {item["name"] for item in items} + assert names == { + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + } + + +def test_workflow_endpoint_returns_response_shape(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + stub = WorkflowExecuteResponse( + status="completed", + run_id="run-123", + workflow_id="dhis2_datavalue_set_v1", + workflow_version=1, + dataset_id="chirps3_precipitation_daily", + bbox=[-13.3, 6.9, -10.1, 10.0], + feature_count=2, + value_count=4, + 
output_file="/tmp/data/chirps3_datavalueset.json", + run_log_file="/tmp/data/workflow_runs/run-123.json", + data_value_set={ + "dataValues": [ + { + "dataElement": "abc123def45", + "period": "202401", + "orgUnit": "OU_1", + "categoryOptionCombo": "HllvX50cXC0", + "attributeOptionCombo": "HllvX50cXC0", + "value": "12.3", + } + ] + }, + component_runs=[], + ) + monkeypatch.setattr( + "eo_api.workflows.routes.execute_workflow", + lambda payload, workflow_id="dhis2_datavalue_set_v1", include_component_run_details=False: stub, + ) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + body = response.json() + assert body["status"] == "completed" + assert body["run_id"] == "run-123" + assert body["workflow_id"] == "dhis2_datavalue_set_v1" + assert body["workflow_version"] == 1 + assert body["run_log_file"].endswith(".json") + assert "dataValues" in body["data_value_set"] + assert body["component_run_details_included"] is False + assert body["component_run_details_available"] is True + + +def test_workflow_endpoint_validates_required_fields(client: TestClient) -> None: + payload = _valid_public_payload() + payload.pop("org_unit_level") + + response = client.post("/workflows/dhis2-datavalue-set", json=payload) + assert response.status_code == 422 + + +def test_workflow_endpoint_accepts_simplified_payload(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + normalized = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3, "feature_id_property": "id"}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + stub = WorkflowExecuteResponse( + status="completed", + run_id="run-123", + 
workflow_id="dhis2_datavalue_set_v1", + workflow_version=1, + dataset_id="chirps3_precipitation_daily", + bbox=[-13.3, 6.9, -10.1, 10.0], + feature_count=2, + value_count=4, + output_file="/tmp/data/chirps3_datavalueset.json", + run_log_file="/tmp/data/workflow_runs/run-123.json", + data_value_set={"dataValues": []}, + component_runs=[], + ) + monkeypatch.setattr("eo_api.workflows.routes.normalize_simple_request", lambda payload: (normalized, [])) + monkeypatch.setattr( + "eo_api.workflows.routes.execute_workflow", + lambda payload, workflow_id="dhis2_datavalue_set_v1", include_component_run_details=False: stub, + ) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + assert response.json()["status"] == "completed" + + +def test_engine_orchestrates_components(monkeypatch: pytest.MonkeyPatch) -> None: + request = { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "country_code": "SLE", + "feature_source": { + "source_type": "geojson_file", + "geojson_path": "tests/data/sierra_leone_districts.geojson", + "feature_id_property": "id", + }, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + + dataset = {"id": "chirps3_precipitation_daily", "variable": "precip"} + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + + monkeypatch.setattr(engine, "get_dataset", lambda dataset_id: dataset) + + called: dict[str, Any] = {"downloaded": False} + + def _download_dataset_component(**kwargs: Any) -> None: + called["downloaded"] = True + assert kwargs["bbox"] == [0.0, 0.0, 1.0, 1.0] + assert kwargs["country_code"] == "SLE" + + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", 
"features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0.0, 0.0, 1.0, 1.0], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", _download_dataset_component) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow( + engine.WorkflowExecuteRequest.model_validate(request), + include_component_run_details=True, + ) + assert response.status == "completed" + assert response.run_id + assert response.value_count == 1 + assert response.run_log_file.endswith(".json") + assert len(response.component_runs) == 5 + assert [c.component for c in response.component_runs] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + assert response.component_run_details_included is True + assert response.component_run_details_available is True + assert called["downloaded"] is True + + +def test_engine_hides_component_details_by_default(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": 
["2024-01-01"], "lat": [0], "lon": [0]}, + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request) + assert response.component_runs == [] + assert response.component_run_details_included is False + assert response.component_run_details_available is True + + +def test_engine_returns_503_when_upstream_unreachable(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr(engine, "get_dataset", lambda dataset_id: {"id": "chirps3_precipitation_daily"}) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": 
[{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr( + engine.component_services, + "download_dataset_component", + lambda **kwargs: (_ for _ in ()).throw(RuntimeError("Failed to connect to server")), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request) + + assert exc_info.value.status_code == 503 + + +def test_mapper_uses_year_format_for_yearly_dataset() -> None: + normalized, _warnings = normalize_simple_request( + WorkflowRequest.model_validate( + { + "dataset_id": "worldpop_population_yearly", + "country_code": "SLE", + "start_year": 2015, + "end_year": 2026, + "org_unit_level": 2, + "data_element": "DE_UID", + "temporal_resolution": "yearly", + } + ) + ) + assert normalized.start == "2015" + assert normalized.end == "2026" + + +def test_mapper_reducer_alias_overrides_spatial_and_temporal_reducers() -> None: + normalized, _warnings = normalize_simple_request( + WorkflowRequest.model_validate( + { + "dataset_id": "worldpop_population_yearly", + "country_code": "SLE", + "start_year": 2015, + "end_year": 2026, + "org_unit_level": 2, + "data_element": "DE_UID", + "temporal_resolution": "yearly", + "reducer": "sum", + } + ) + ) + assert normalized.spatial_aggregation.method.value == "sum" + assert normalized.temporal_aggregation.method.value == "sum" + + +def test_mapper_uses_month_format_for_chirps_date_window() -> None: + normalized, _warnings = normalize_simple_request( + WorkflowRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-05-31", + "org_unit_level": 2, + "data_element": "DE_UID", + } + ) + ) + assert normalized.start == "2024-01" + assert normalized.end == "2024-05" + + +def test_default_workflow_definition_has_expected_steps() -> None: + workflow = load_workflow_definition() + assert 
workflow.workflow_id == "dhis2_datavalue_set_v1" + assert workflow.version == 1 + assert [step.component for step in workflow.steps] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + + +def test_engine_follows_declarative_workflow_order(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + {"component": "feature_source"}, + {"component": "download_dataset"}, + {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + 
engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request, include_component_run_details=True) + assert response.workflow_id == "dhis2_datavalue_set_v1" + assert response.workflow_version == 1 + assert [c.component for c in response.component_runs] == [ + "feature_source", + "download_dataset", + "spatial_aggregation", + "build_datavalueset", + ] + + +def test_engine_rejects_unknown_workflow_id(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request, workflow_id="not_allowlisted") + + assert exc_info.value.status_code == 422 From feed7959c77df53fa7874243979980d285138012 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Wed, 11 Mar 2026 23:34:19 +0100 Subject: [PATCH 02/15] Document workflow architecture and manual E2E testing --- README.md | 18 ++ docs/workflow-orchestration.md | 517 +++++++++++++++++++++++++++++++++ 2 files changed, 535 insertions(+) create mode 100644 docs/workflow-orchestration.md diff --git a/README.md b/README.md index ce64dec..224f48b 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,24 @@ Docs: http://127.0.0.1:8000/docs +Workflow (single payload contract): + +`POST /workflows/dhis2-datavalue-set` + 
+```json +{ + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-05-31", + "org_unit_level": 3, + "data_element": "DE_UID", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "dry_run": true +} +``` + OGC API http://127.0.0.1:8000/ogcapi diff --git a/docs/workflow-orchestration.md b/docs/workflow-orchestration.md new file mode 100644 index 0000000..22315c8 --- /dev/null +++ b/docs/workflow-orchestration.md @@ -0,0 +1,517 @@ +# Workflow Orchestration Design (Single Endpoint, Componentized Runtime) + +## Purpose + +This document describes the implemented approach for generating a DHIS2 DataValueSet from gridded EO datasets through one workflow endpoint and reusable components. + +It documents: + +1. What has been achieved. +2. The architecture and execution flow. +3. Public API contract and normalization rules. +4. Runtime metadata, observability, and error handling. +5. Current componentization strategy and extension path. + +--- + +## What Is Implemented + +The current implementation provides: + +1. One canonical workflow execution endpoint: + - `POST /workflows/dhis2-datavalue-set` +2. One public flat request payload contract (`WorkflowRequest`). +3. Internal normalization into a canonical execution model (`WorkflowExecuteRequest`). +4. A fixed generic orchestration chain with exactly 5 components: + - `feature_source` + - `download_dataset` + - `temporal_aggregation` + - `spatial_aggregation` + - `build_datavalueset` +5. Per-component runtime instrumentation (`WorkflowRuntime`) with timing, status, and summarized inputs/outputs. +6. Run-log persistence for both success and failure. +7. Structured error responses, including upstream connectivity failures. +8. Optional inclusion of detailed component run traces in API responses. +9. Discoverable standalone component endpoints under `/components` for direct execution and future orchestrator integration. +10. 
Declarative workflow assembly via YAML (`data/workflows/dhis2_datavalue_set.yaml`) executed by the workflow engine. + +--- + +## Final API Surface + +### Primary Workflow Endpoint + +- `POST /workflows/dhis2-datavalue-set` + +### Workflow Discovery Endpoint + +- `GET /workflows` (discovered workflow catalog from `data/workflows/*.yaml` with `workflow_id`, `version`, and component chain) + +### Component Discovery/Execution Endpoints + +- `GET /components` +- `POST /components/feature-source` +- `POST /components/download-dataset` +- `POST /components/temporal-aggregation` +- `POST /components/spatial-aggregation` +- `POST /components/build-datavalue-set` + +`/components/*` endpoints are for reusable task-level execution. The workflow endpoint remains the single end-to-end API for generating DHIS2 DataValueSet output. + +--- + +## Public Workflow Request Contract + +The workflow endpoint accepts one flat payload shape: + +```json +{ + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-05-31", + "org_unit_level": 2, + "data_element": "DE_UID", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "include_component_run_details": false +} +``` + +Important fields: + +1. `dataset_id` (required) +2. `workflow_id` (optional, default `dhis2_datavalue_set_v1`, must exist in discovered workflow YAMLs) +3. Time window (required as one of): + - `start_date` + `end_date`, or + - `start_year` + `end_year` +4. Spatial scope (required as one of): + - `org_unit_level`, or + - `org_unit_ids` +5. `data_element` (required) +6. `include_component_run_details` (optional, default `false`) + +Notes: + +1. `feature_id_property` defaults to `"id"` and controls which feature property maps to DHIS2 org unit ID in spatial aggregation/DataValueSet construction. +2. 
`country_code` is accepted in request and passed to dataset downloaders (instead of forcing `.env` only). +3. `reducer` is accepted as an alias and mapped to both temporal and spatial reducer when provided. + +--- + +## Normalization and Mapping Approach + +File: `src/eo_api/workflows/services/simple_mapper.py` + +Public flat payload is normalized to internal `WorkflowExecuteRequest` with component-ready nested configs: + +1. `feature_source` config: + - `org_unit_level` -> `source_type=dhis2_level` + - `org_unit_ids` -> `source_type=dhis2_ids` +2. `temporal_aggregation` config: + - `target_period_type` from `temporal_resolution` + - `method` from `temporal_reducer` (or `reducer` alias) +3. `spatial_aggregation` config: + - `method` from `spatial_reducer` (or `reducer` alias) +4. `dhis2` config: + - `data_element_uid` from `data_element` + +Time normalization depends on dataset registry metadata (`period_type`): + +1. Yearly datasets: + - normalize to `YYYY` +2. Hourly/Daily/Monthly datasets: + - normalize to month windows (`YYYY-MM`) for downloader compatibility +3. Fallback: + - pass date strings as provided + +This mapping keeps the public contract simple while preserving an extensible internal orchestration model. + +--- + +## Architecture + +### API Routing Layer + +Files: + +1. `src/eo_api/workflows/routes.py` +2. `src/eo_api/components/routes.py` +3. `src/eo_api/main.py` + +Responsibilities: + +1. Expose one workflow endpoint and reusable component endpoints. +2. Keep payload and response models explicit with Pydantic. +3. Delegate execution logic to service layers. + +### Workflow Engine Layer + +File: `src/eo_api/workflows/services/engine.py` + +Responsibilities: + +1. Validate dataset existence via registry. +2. Execute the 5 components in fixed order. +3. Collect runtime telemetry for each component. +4. Persist run logs on both success and error paths. +5. Return workflow result with optional component-run detail inclusion. 
+ +### Workflow Definition Layer + +Files: + +1. `src/eo_api/workflows/services/definitions.py` +2. `data/workflows/dhis2_datavalue_set.yaml` + +Responsibilities: + +1. Discover, load, and validate declarative workflow definitions from `data/workflows/*.yaml`. +2. Enforce supported component names. +3. Enforce terminal `build_datavalueset` step for this end-to-end workflow. +4. Enforce output-to-input compatibility across the full accumulated context (not just adjacent steps). +5. Drive runtime execution order from YAML instead of hardcoded sequence. + +### Reusable Component Service Layer + +File: `src/eo_api/components/services.py` + +Responsibilities: + +1. Provide discoverable component catalog metadata. +2. Implement component functions used by: + - workflow engine, and + - `/components/*` task endpoints. +3. Reuse existing EO API capabilities (`downloader`, `accessor`, temporal/spatial aggregators, DataValueSet builder). + +--- + +## Layering Rationale + +The repository uses three layers with different responsibilities: + +1. `data_xxx` services (`data_manager`, `data_accessor`, `data_registry`) + - Core domain capabilities (download, load/subset, dataset metadata). + - No workflow-specific orchestration state required. +2. `components/` + - Thin reusable wrappers around core capabilities. + - Standardized component contracts for discovery (`GET /components`) and direct task execution. + - Runtime-friendly boundaries for future orchestrators (Prefect/Airflow). +3. `workflows/` + - End-to-end orchestration, request normalization, workflow selection, runtime tracing, and run-log persistence. + - Declarative assembly from `data/workflows/*.yaml`. + +Example: + +1. `download_dataset` workflow/component step delegates actual download work to `src/eo_api/data_manager/services/downloader.py`. +2. The wrapper adds orchestration-level concerns (preflight, context wiring, component runtime metadata) without duplicating downloader logic. 
+ +This separation keeps core data services reusable and prevents workflow-specific concerns from leaking into the low-level data modules. + +--- + +## Component Chain (Exact Runtime Order) + +The default workflow (`dhis2_datavalue_set_v1`) executes exactly these components, in this order: + +1. `feature_source` +2. `download_dataset` +3. `temporal_aggregation` +4. `spatial_aggregation` +5. `build_datavalueset` + +Details: + +1. `feature_source` + - Resolves features from DHIS2 org unit level/ids or GeoJSON source config. + - Returns `FeatureCollection` and `bbox`. +2. `download_dataset` + - Runs connectivity preflight and downloads source data using `data_manager/services/downloader.py`. + - Supports request-supplied `country_code` where needed (e.g., WorldPop). +3. `temporal_aggregation` + - Loads/subsets data and performs period aggregation with selected reducer. +4. `spatial_aggregation` + - Aggregates gridded data over feature geometries. + - Produces normalized record rows (`org_unit`, `time`, `value`). +5. `build_datavalueset` + - Builds valid DHIS2 DataValueSet JSON from records. + - Serializes output to file and returns both payload and output path. + +`load_data` and `write_datavalueset` are intentionally not separate top-level components anymore; loading and writing are internalized within aggregation/build steps. + +Execution order is currently defined in: + +- `data/workflows/dhis2_datavalue_set.yaml` + +The default YAML remains the same 5-step sequence, but the engine now reads it declaratively. + +--- + +## Runtime Observability and Housekeeping + +File: `src/eo_api/workflows/services/runtime.py` + +For each component run, runtime captures: + +1. `component` +2. `status` +3. `started_at` +4. `ended_at` +5. `duration_ms` +6. `inputs` (summarized) +7. `outputs` (summarized) +8. `error` (on failure) + +Each workflow execution gets a unique `run_id`. + +### Response-Level Control of Run Details + +`include_component_run_details` controls response verbosity: + +1. 
If `false`: + - `component_runs: []` + - `component_run_details_included: false` + - `component_run_details_available: true` +2. If `true`: + - `component_runs` contains full per-component run records + - `component_run_details_included: true` + - `component_run_details_available: true` + +This keeps default responses clean while preserving debuggability when explicitly requested. + +--- + +## Run Logs + +File: `src/eo_api/workflows/services/run_logs.py` + +Workflow run logs are persisted under: + +- `<data-dir>/workflow_runs/` + +Persisted fields include: + +1. `run_id` +2. `status` (`completed` or `failed`) +3. normalized request payload +4. `component_runs` +5. output file path (when completed) +6. error details (when failed) + +--- + +## Error Handling Strategy + +1. `422` for request validation failures (Pydantic model constraints). +2. `404` when `dataset_id` does not exist in registry. +3. `503` for upstream connectivity issues detected during download/preflight: + - `error: "upstream_unreachable"` +4. `500` for other execution failures: + - `error: "workflow_execution_failed"` + +Failure responses include `failed_component` and `run_id` for traceability. + +--- + +## Achieved Behavior from Manual Verification + +Manual runs validated the following: + +1. WorldPop workflows now accept `country_code` from payload and execute without mandatory `.env` coupling. +2. Yearly dataset date normalization issues were resolved by period-aware mapping logic. +3. CHIRPS multi-month workflows execute correctly, with behavior improving as cache warms. +4. Workflow responses and run logs align with the 5-component chain. +5. Default response trimming works and detail flags remove ambiguity. + +Reference sample outputs: + +1. `docs/response/worldpop.json` +2. `docs/response/chirps3.json` +3. `docs/response/without_component_runs.json` +4. 
`docs/response/with_component_runs.json` + +--- + +## Testing and Quality Gates + +Primary tests: + +- `tests/test_workflows.py` + +Coverage includes: + +1. Single workflow endpoint behavior. +2. Payload validation and normalization paths. +3. Exact 5-component orchestration order. +4. Component detail include/exclude behavior. +5. Upstream connectivity error mapping. +6. Component catalog endpoint expectations. +7. Declarative workflow definition loading and default step validation. +8. Engine execution follows the definition-provided step order. + +Quality gates: + +1. `make lint` (ruff, mypy, pyright) +2. `uv run pytest -q` + +--- + +## Why This Approach + +This design intentionally balances: + +1. Simplicity for clients: + - one end-to-end endpoint and one public payload. +2. Generic dataset support: + - dataset-specific behavior comes from registry metadata and downloader wiring, not endpoint proliferation. +3. Reusability: + - component services are discoverable and executable independently. +4. Future orchestration readiness: + - component boundaries and run metadata are explicit, making Prefect/Airflow integration straightforward. + +--- + +## Sequence Diagram + +```mermaid +sequenceDiagram + autonumber + participant C as Client + participant R as /workflows route + participant M as simple_mapper + participant E as engine + participant RT as WorkflowRuntime + participant CS as components.services + participant RL as run_logs + + C->>R: POST /workflows/dhis2-datavalue-set (flat payload) + R->>M: normalize_simple_request(payload) + M-->>R: WorkflowExecuteRequest + R->>E: execute_workflow(request, include_component_run_details) + + E->>RT: run(feature_source) + RT->>CS: feature_source_component(...) + CS-->>RT: features, bbox + + E->>RT: run(download_dataset) + RT->>CS: download_dataset_component(...) + CS-->>RT: status + + E->>RT: run(temporal_aggregation) + RT->>CS: temporal_aggregation_component(...) 
+ CS-->>RT: aggregated dataset + + E->>RT: run(spatial_aggregation) + RT->>CS: spatial_aggregation_component(...) + CS-->>RT: records + + E->>RT: run(build_datavalueset) + RT->>CS: build_datavalueset_component(...) + CS-->>RT: data_value_set, output_file + + E->>RL: persist_run_log(completed|failed) + RL-->>E: run_log_file + E-->>R: WorkflowExecuteResponse + R-->>C: 200 response (trimmed or detailed component runs) +``` + +Failure path: + +1. Any component exception is captured by runtime on the failing step. +2. Engine persists failed run log with `run_id` and `failed_component`. +3. Engine returns structured error: + - `503` with `error=upstream_unreachable` for connectivity failures. + - `500` with `error=workflow_execution_failed` for all other failures. + +--- + +## Manual E2E Testing + +Use the following commands to validate discovery, execution, and error behavior end-to-end. + +1. Start API: + +```bash +uvicorn eo_api.main:app --reload +``` + +2. Verify discovered workflows: + +```bash +curl -s http://127.0.0.1:8000/workflows | jq +``` + +3. Run default 5-step workflow: + +```bash +curl -s -X POST "http://127.0.0.1:8000/workflows/dhis2-datavalue-set" \ + -H "Content-Type: application/json" \ + -d '{ + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-02-29", + "org_unit_level": 2, + "data_element": "DE_UID", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "include_component_run_details": true + }' | jq +``` + +Expected component order: + +1. `feature_source` +2. `download_dataset` +3. `temporal_aggregation` +4. `spatial_aggregation` +5. `build_datavalueset` + +4. 
Run 4-step workflow (without temporal aggregation): + +```bash +curl -s -X POST "http://127.0.0.1:8000/workflows/dhis2-datavalue-set" \ + -H "Content-Type: application/json" \ + -d '{ + "workflow_id": "dhis2_datavalue_set_without_temporal_aggregation_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-02-29", + "org_unit_level": 2, + "data_element": "DE_UID", + "spatial_reducer": "mean", + "include_component_run_details": true + }' | jq +``` + +Expected component order: + +1. `feature_source` +2. `download_dataset` +3. `spatial_aggregation` +4. `build_datavalueset` + +5. Negative test for unknown workflow: + +```bash +curl -s -X POST "http://127.0.0.1:8000/workflows/dhis2-datavalue-set" \ + -H "Content-Type: application/json" \ + -d '{ + "workflow_id": "does_not_exist", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 2, + "data_element": "DE_UID" + }' | jq +``` + +Expected result: `422` with allowed/discovered `workflow_id` values in error detail. + +--- + +## Next Technical Step + +Add a workflow governance model for multi-user environments: workflow metadata (owner/status), promotion states (draft/staging/prod), and optional signature/checksum validation before a discovered YAML can execute. 
From f30e84b2dc98d428e02862704820387392d3ce04 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Wed, 11 Mar 2026 23:40:10 +0100 Subject: [PATCH 03/15] Restore pygeoapi config and generated OpenAPI artifact for CI --- pygeoapi-config.yml | 40 +++++ pygeoapi-openapi.yml | 362 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 402 insertions(+) create mode 100644 pygeoapi-config.yml create mode 100644 pygeoapi-openapi.yml diff --git a/pygeoapi-config.yml b/pygeoapi-config.yml new file mode 100644 index 0000000..ca32423 --- /dev/null +++ b/pygeoapi-config.yml @@ -0,0 +1,40 @@ +server: + bind: + host: 0.0.0.0 + port: 5000 + url: http://127.0.0.1:8000/ogcapi + mimetype: application/json; charset=UTF-8 + encoding: utf-8 + languages: + - en-US + limits: + default_items: 20 + max_items: 50 + map: + url: https://tile.openstreetmap.org/{z}/{x}/{y}.png + attribution: OpenStreetMap + +metadata: + identification: + title: + en: DHIS2 EO API + description: + en: OGC API facade for EO services + keywords: + en: + - EO + - DHIS2 + terms_of_service: https://dhis2.org + url: https://dhis2.org + license: + name: CC-BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + provider: + name: DHIS2 EO API + url: https://dhis2.org + contact: + name: DHIS2 + position: Team + email: climate@dhis2.org + +resources: {} diff --git a/pygeoapi-openapi.yml b/pygeoapi-openapi.yml new file mode 100644 index 0000000..d70eca1 --- /dev/null +++ b/pygeoapi-openapi.yml @@ -0,0 +1,362 @@ +components: + parameters: + bbox: + description: Only features that have a geometry that intersects the bounding + box are selected.The bounding box is provided as four or six numbers, depending + on whether the coordinate reference system includes a vertical axis (height + or depth). 
+ explode: false + in: query + name: bbox + required: false + schema: + items: + type: number + maxItems: 6 + minItems: 4 + type: array + style: form + bbox-crs: + description: Indicates the coordinate reference system for the given bbox coordinates. + explode: false + in: query + name: bbox-crs + required: false + schema: + format: uri + type: string + style: form + bbox-crs-epsg: + description: Indicates the EPSG for the given bbox coordinates. + explode: false + in: query + name: bbox-crs + required: false + schema: + default: 4326 + type: integer + style: form + crs: + description: Indicates the coordinate reference system for the results. + explode: false + in: query + name: crs + required: false + schema: + format: uri + type: string + style: form + f: + description: The optional f parameter indicates the output format which the + server shall provide as part of the response document. The default format + is GeoJSON. + explode: false + in: query + name: f + required: false + schema: + default: json + enum: + - json + - html + - jsonld + type: string + style: form + lang: + description: The optional lang parameter instructs the server return a response + in a certain language, if supported. If the language is not among the available + values, the Accept-Language header language will be used if it is supported. + If the header is missing, the default server language is used. Note that providers + may only support a single language (or often no language at all), that can + be different from the server language. Language strings can be written in + a complex (e.g. "fr-CA,fr;q=0.9,en-US;q=0.8,en;q=0.7"), simple (e.g. "de") + or locale-like (e.g. "de-CH" or "fr_BE") fashion. + in: query + name: lang + required: false + schema: + default: en-US + enum: + - en-US + type: string + offset: + description: The optional offset parameter indicates the index within the result + set from which the server shall begin presenting results in the response document. 
The + first element has an index of 0 (default). + explode: false + in: query + name: offset + required: false + schema: + default: 0 + minimum: 0 + type: integer + style: form + resourceId: + description: Configuration resource identifier + in: path + name: resourceId + required: true + schema: + type: string + skipGeometry: + description: This option can be used to skip response geometries for each feature. + explode: false + in: query + name: skipGeometry + required: false + schema: + default: false + type: boolean + style: form + vendorSpecificParameters: + description: Additional "free-form" parameters that are not explicitly defined + in: query + name: vendorSpecificParameters + schema: + additionalProperties: true + type: object + style: form + responses: + '200': + description: successful operation + '204': + description: no content + Queryables: + content: + application/json: + schema: + $ref: '#/components/schemas/queryables' + description: successful queryables operation + default: + content: + application/json: + schema: + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/schemas/exception.yaml + description: Unexpected error + schemas: + queryable: + properties: + description: + description: a human-readable narrative describing the queryable + type: string + language: + default: en + description: the language used for the title and description + type: string + queryable: + description: the token that may be used in a CQL predicate + type: string + title: + description: a human readable title for the queryable + type: string + type: + description: the data type of the queryable + type: string + type-ref: + description: a reference to the formal definition of the type + format: url + type: string + required: + - queryable + - type + type: object + queryables: + properties: + queryables: + items: + $ref: '#/components/schemas/queryable' + type: array + required: + - queryables + type: object +info: + contact: + name: DHIS2 EO API + 
url: https://dhis2.org + x-ogc-serviceContact: + addresses: [] + emails: + - value: climate@dhis2.org + name: DHIS2 + position: Team + description: OGC API facade for EO services + license: + name: CC-BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + termsOfService: https://dhis2.org + title: DHIS2 EO API + version: 0.22.0 + x-keywords: + - EO + - DHIS2 +openapi: 3.0.2 +paths: + /: + get: + description: Landing page + operationId: getLandingPage + parameters: + - $ref: '#/components/parameters/f' + - $ref: '#/components/parameters/lang' + responses: + '200': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/LandingPage + '400': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/InvalidParameter + '500': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/ServerError + summary: Landing page + tags: + - server + /collections: + get: + description: Collections + operationId: getCollections + parameters: + - $ref: '#/components/parameters/f' + - $ref: '#/components/parameters/lang' + responses: + '200': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/LandingPage + '400': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/InvalidParameter + '500': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/ServerError + summary: Collections + tags: + - server + /conformance: + get: + description: API conformance definition + operationId: getConformanceDeclaration + parameters: + - $ref: '#/components/parameters/f' + - $ref: '#/components/parameters/lang' + responses: + '200': + $ref: 
https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/LandingPage + '400': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/InvalidParameter + '500': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/ServerError + summary: API conformance definition + tags: + - server + /jobs: + get: + description: Retrieve a list of jobs + operationId: getJobs + responses: + '200': + $ref: '#/components/responses/200' + '404': + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/responses/NotFound.yaml + default: + $ref: '#/components/responses/default' + summary: Retrieve jobs list + tags: + - jobs + /jobs/{jobId}: + delete: + description: Cancel / delete job + operationId: deleteJob + parameters: + - &id001 + description: job identifier + in: path + name: jobId + required: true + schema: + type: string + responses: + '204': + $ref: '#/components/responses/204' + '404': + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/responses/NotFound.yaml + default: + $ref: '#/components/responses/default' + summary: Cancel / delete job + tags: + - jobs + get: + description: Retrieve job details + operationId: getJob + parameters: + - *id001 + - $ref: '#/components/parameters/f' + responses: + '200': + $ref: '#/components/responses/200' + '404': + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/responses/NotFound.yaml + default: + $ref: '#/components/responses/default' + summary: Retrieve job details + tags: + - jobs + /jobs/{jobId}/results: + get: + description: Retrieve job results + operationId: getJobResults + parameters: + - *id001 + - $ref: '#/components/parameters/f' + responses: + '200': + $ref: '#/components/responses/200' + '404': + $ref: https://schemas.opengis.net/ogcapi/processes/part1/1.0/openapi/responses/NotFound.yaml + default: + $ref: 
'#/components/responses/default' + summary: Retrieve job results + tags: + - jobs + /openapi: + get: + description: This document + operationId: getOpenapi + parameters: + - $ref: '#/components/parameters/f' + - $ref: '#/components/parameters/lang' + - description: UI to render the OpenAPI document + explode: false + in: query + name: ui + required: false + schema: + default: swagger + enum: + - swagger + - redoc + type: string + style: form + responses: + '200': + $ref: '#/components/responses/200' + '400': + $ref: https://schemas.opengis.net/ogcapi/features/part1/1.0/openapi/ogcapi-features-1.yaml#/components/responses/InvalidParameter + default: + $ref: '#/components/responses/default' + summary: This document + tags: + - server +servers: +- description: OGC API facade for EO services + url: http://127.0.0.1:8000/ogcapi +tags: +- description: OGC API facade for EO services + externalDocs: + description: information + url: https://dhis2.org + name: server +- name: coverages +- name: edr +- name: records +- name: features +- name: maps +- name: processes +- name: jobs +- name: tiles +- name: stac + From f616ce53e9983b91f7edacc044e309e5cb8d11c3 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Thu, 12 Mar 2026 08:57:33 +0100 Subject: [PATCH 04/15] Harden workflow engine with step configs, contracts, and error codes --- data/workflows/dhis2_datavalue_set.yaml | 5 + ...alue_set_without_temporal_aggregation.yaml | 4 + docs/workflow-orchestration.md | 60 ++- src/eo_api/components/schemas.py | 5 + src/eo_api/components/services.py | 189 +++++++-- src/eo_api/workflows/routes.py | 1 + src/eo_api/workflows/schemas.py | 1 - src/eo_api/workflows/services/definitions.py | 18 +- src/eo_api/workflows/services/engine.py | 359 ++++++++++++++---- src/eo_api/workflows/services/run_logs.py | 6 + .../workflows/services/simple_mapper.py | 9 +- tests/test_workflows.py | 201 ++++++++-- 12 files changed, 691 insertions(+), 167 deletions(-) diff --git 
a/data/workflows/dhis2_datavalue_set.yaml b/data/workflows/dhis2_datavalue_set.yaml index 180da1d..8f5afd8 100644 --- a/data/workflows/dhis2_datavalue_set.yaml +++ b/data/workflows/dhis2_datavalue_set.yaml @@ -2,7 +2,12 @@ workflow_id: dhis2_datavalue_set_v1 version: 1 steps: - component: feature_source + version: v1 - component: download_dataset + version: v1 - component: temporal_aggregation + version: v1 - component: spatial_aggregation + version: v1 - component: build_datavalueset + version: v1 diff --git a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml index 6d1b1f0..c7baaf5 100644 --- a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml +++ b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml @@ -2,6 +2,10 @@ workflow_id: dhis2_datavalue_set_without_temporal_aggregation_v1 version: 1 steps: - component: feature_source + version: v1 - component: download_dataset + version: v1 - component: spatial_aggregation + version: v1 - component: build_datavalueset + version: v1 diff --git a/docs/workflow-orchestration.md b/docs/workflow-orchestration.md index 22315c8..97d26a1 100644 --- a/docs/workflow-orchestration.md +++ b/docs/workflow-orchestration.md @@ -34,6 +34,9 @@ The current implementation provides: 8. Optional inclusion of detailed component run traces in API responses. 9. Discoverable standalone component endpoints under `/components` for direct execution and future orchestrator integration. 10. Declarative workflow assembly via YAML (`data/workflows/dhis2_datavalue_set.yaml`) executed by the workflow engine. +11. Registry-driven component dispatch in engine (no component-specific `if/elif` chain). +12. Step-level YAML config support with strict validation and `$request.<field>` interpolation. +13. Stable workflow error contract with `error_code` and `failed_component_version`. --- @@ -96,7 +99,6 @@ Notes: 1.
`feature_id_property` defaults to `"id"` and controls which feature property maps to DHIS2 org unit ID in spatial aggregation/DataValueSet construction. 2. `country_code` is accepted in request and passed to dataset downloaders (instead of forcing `.env` only). -3. `reducer` is accepted as an alias and mapped to both temporal and spatial reducer when provided. --- @@ -111,9 +113,9 @@ Public flat payload is normalized to internal `WorkflowExecuteRequest` with comp - `org_unit_ids` -> `source_type=dhis2_ids` 2. `temporal_aggregation` config: - `target_period_type` from `temporal_resolution` - - `method` from `temporal_reducer` (or `reducer` alias) + - `method` from `temporal_reducer` 3. `spatial_aggregation` config: - - `method` from `spatial_reducer` (or `reducer` alias) + - `method` from `spatial_reducer` 4. `dhis2` config: - `data_element_uid` from `data_element` @@ -169,9 +171,10 @@ Responsibilities: 1. Discover, load, and validate declarative workflow definitions from `data/workflows/*.yaml`. 2. Enforce supported component names. -3. Enforce terminal `build_datavalueset` step for this end-to-end workflow. -4. Enforce output-to-input compatibility across the full accumulated context (not just adjacent steps). -5. Drive runtime execution order from YAML instead of hardcoded sequence. +3. Enforce supported component versions (currently `v1`) and validate per-step `config`. +4. Enforce terminal `build_datavalueset` step for this end-to-end workflow. +5. Enforce output-to-input compatibility across the full accumulated context (not just adjacent steps). +6. Drive runtime execution order from YAML through a registry-dispatch model. ### Reusable Component Service Layer @@ -238,13 +241,17 @@ Details: - Builds valid DHIS2 DataValueSet JSON from records. - Serializes output to file and returns both payload and output path. 
-`load_data` and `write_datavalueset` are intentionally not separate top-level components anymore; loading and writing are internalized within aggregation/build steps. - -Execution order is currently defined in: +Execution order and step metadata are currently defined in: - `data/workflows/dhis2_datavalue_set.yaml` -The default YAML remains the same 5-step sequence, but the engine now reads it declaratively. +Workflow step schema now supports: + +1. `component` +2. `version` (default `v1`) +3. `config` (default `{}`) + +The default YAML remains the same 5-step sequence, but the engine reads it declaratively and dispatches components through a registry map. --- @@ -298,38 +305,29 @@ Persisted fields include: 4. `component_runs` 5. output file path (when completed) 6. error details (when failed) +7. `error_code` (when failed) +8. `failed_component` (when failed) +9. `failed_component_version` (when failed) --- ## Error Handling Strategy -1. `422` for request validation failures (Pydantic model constraints). +1. `422` for request/definition/config validation failures. 2. `404` when `dataset_id` does not exist in registry. -3. `503` for upstream connectivity issues detected during download/preflight: +3. `503` for upstream connectivity failures: - `error: "upstream_unreachable"` + - `error_code: "UPSTREAM_UNREACHABLE"` 4. `500` for other execution failures: - `error: "workflow_execution_failed"` + - `error_code: "EXECUTION_FAILED"` (or other stable mapped codes) -Failure responses include `failed_component` and `run_id` for traceability. - ---- - -## Achieved Behavior from Manual Verification - -Manual runs validated the following: - -1. WorldPop workflows now accept `country_code` from payload and execute without mandatory `.env` coupling. -2. Yearly dataset date normalization issues were resolved by period-aware mapping logic. -3. CHIRPS multi-month workflows execute correctly, with behavior improving as cache warms. -4. 
Workflow responses and run logs align with the 5-component chain. -5. Default response trimming works and detail flags remove ambiguity. - -Reference sample outputs: +Failure responses include: -1. `docs/response/worldpop.json` -2. `docs/response/chirps3.json` -3. `docs/response/without_component_runs.json` -4. `docs/response/with_component_runs.json` +1. `error_code` +2. `failed_component` +3. `failed_component_version` +4. `run_id` --- diff --git a/src/eo_api/components/schemas.py b/src/eo_api/components/schemas.py index 80c772e..a0920dc 100644 --- a/src/eo_api/components/schemas.py +++ b/src/eo_api/components/schemas.py @@ -18,9 +18,14 @@ class ComponentDefinition(BaseModel): """Component metadata for discovery.""" name: str + version: str = "v1" description: str inputs: list[str] outputs: list[str] + input_schema: dict[str, Any] = Field(default_factory=dict) + config_schema: dict[str, Any] = Field(default_factory=dict) + output_schema: dict[str, Any] = Field(default_factory=dict) + error_codes: list[str] = Field(default_factory=list) class ComponentCatalogResponse(BaseModel): diff --git a/src/eo_api/components/services.py b/src/eo_api/components/services.py index 2e99133..8bd0e1d 100644 --- a/src/eo_api/components/services.py +++ b/src/eo_api/components/services.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Any +from typing import Any, Final import xarray as xr from fastapi import HTTPException @@ -23,41 +23,164 @@ from ..workflows.services.temporal import aggregate_temporal from .schemas import ComponentDefinition +_ERROR_CODES_V1: Final[list[str]] = [ + "INPUT_VALIDATION_FAILED", + "CONFIG_VALIDATION_FAILED", + "OUTPUT_VALIDATION_FAILED", + "UPSTREAM_UNREACHABLE", + "EXECUTION_FAILED", +] + +_COMPONENT_REGISTRY: Final[dict[str, ComponentDefinition]] = { + "feature_source@v1": ComponentDefinition( + name="feature_source", + version="v1", + description="Resolve feature source and compute bbox.", + inputs=["feature_source"], + 
outputs=["features", "bbox"], + input_schema={ + "type": "object", + "properties": {"feature_source": {"type": "object"}}, + "required": ["feature_source"], + }, + config_schema={"type": "object", "properties": {}, "additionalProperties": False}, + output_schema={ + "type": "object", + "properties": { + "features": {"type": "object"}, + "bbox": {"type": "array", "items": {"type": "number"}, "minItems": 4, "maxItems": 4}, + }, + "required": ["features", "bbox"], + }, + error_codes=_ERROR_CODES_V1, + ), + "download_dataset@v1": ComponentDefinition( + name="download_dataset", + version="v1", + description="Download dataset files for period and bbox.", + inputs=["dataset_id", "start", "end", "overwrite", "country_code", "bbox"], + outputs=["status"], + input_schema={ + "type": "object", + "properties": { + "dataset_id": {"type": "string"}, + "start": {"type": "string"}, + "end": {"type": "string"}, + "overwrite": {"type": "boolean"}, + "country_code": {"type": ["string", "null"]}, + "bbox": {"type": "array", "items": {"type": "number"}, "minItems": 4, "maxItems": 4}, + }, + "required": ["dataset_id", "start", "end", "overwrite", "bbox"], + }, + config_schema={ + "type": "object", + "properties": { + "overwrite": {"type": "boolean"}, + "country_code": {"type": ["string", "null"]}, + }, + "additionalProperties": False, + }, + output_schema={"type": "object", "properties": {"status": {"type": "string"}}}, + error_codes=_ERROR_CODES_V1, + ), + "temporal_aggregation@v1": ComponentDefinition( + name="temporal_aggregation", + version="v1", + description="Aggregate dataset over time dimension.", + inputs=["dataset_id", "start", "end", "target_period_type", "method", "bbox"], + outputs=["dataset"], + input_schema={ + "type": "object", + "properties": { + "dataset_id": {"type": "string"}, + "start": {"type": "string"}, + "end": {"type": "string"}, + "target_period_type": {"type": "string"}, + "method": {"type": "string"}, + "bbox": {"type": ["array", "null"], "items": {"type": 
"number"}}, + }, + "required": ["dataset_id", "start", "end", "target_period_type", "method"], + }, + config_schema={ + "type": "object", + "properties": { + "target_period_type": {"type": "string"}, + "method": {"type": "string"}, + }, + "additionalProperties": False, + }, + output_schema={"type": "object", "properties": {"dataset": {"type": "object"}}}, + error_codes=_ERROR_CODES_V1, + ), + "spatial_aggregation@v1": ComponentDefinition( + name="spatial_aggregation", + version="v1", + description="Aggregate gridded dataset to features.", + inputs=["dataset_id", "start", "end", "feature_source", "method"], + outputs=["records"], + input_schema={ + "type": "object", + "properties": { + "dataset_id": {"type": "string"}, + "start": {"type": "string"}, + "end": {"type": "string"}, + "feature_source": {"type": "object"}, + "method": {"type": "string"}, + }, + "required": ["dataset_id", "start", "end", "feature_source", "method"], + }, + config_schema={ + "type": "object", + "properties": { + "method": {"type": "string"}, + "feature_id_property": {"type": "string"}, + }, + "additionalProperties": False, + }, + output_schema={"type": "object", "properties": {"records": {"type": "array"}}}, + error_codes=_ERROR_CODES_V1, + ), + "build_datavalueset@v1": ComponentDefinition( + name="build_datavalueset", + version="v1", + description="Build and serialize DHIS2 DataValueSet JSON.", + inputs=["dataset_id", "period_type", "records", "dhis2"], + outputs=["data_value_set", "output_file"], + input_schema={ + "type": "object", + "properties": { + "dataset_id": {"type": "string"}, + "period_type": {"type": "string"}, + "records": {"type": "array"}, + "dhis2": {"type": "object"}, + }, + "required": ["dataset_id", "period_type", "records", "dhis2"], + }, + config_schema={ + "type": "object", + "properties": { + "period_type": {"type": "string"}, + }, + "additionalProperties": False, + }, + output_schema={ + "type": "object", + "properties": {"data_value_set": {"type": "object"}, 
"output_file": {"type": "string"}}, + "required": ["data_value_set", "output_file"], + }, + error_codes=_ERROR_CODES_V1, + ), +} + def component_catalog() -> list[ComponentDefinition]: """Return all discoverable component definitions.""" - return [ - ComponentDefinition( - name="feature_source", - description="Resolve feature source and compute bbox.", - inputs=["feature_source"], - outputs=["features", "bbox"], - ), - ComponentDefinition( - name="download_dataset", - description="Download dataset files for period and bbox.", - inputs=["dataset_id", "start", "end", "overwrite", "country_code", "bbox"], - outputs=["status"], - ), - ComponentDefinition( - name="temporal_aggregation", - description="Aggregate dataset over time dimension.", - inputs=["dataset_id", "start", "end", "target_period_type", "method", "bbox"], - outputs=["dataset"], - ), - ComponentDefinition( - name="spatial_aggregation", - description="Aggregate gridded dataset to features.", - inputs=["dataset_id", "start", "end", "feature_source", "method"], - outputs=["records"], - ), - ComponentDefinition( - name="build_datavalueset", - description="Build and serialize DHIS2 DataValueSet JSON.", - inputs=["dataset_id", "period_type", "records", "dhis2"], - outputs=["data_value_set", "output_file"], - ), - ] + return list(_COMPONENT_REGISTRY.values()) + + +def component_registry() -> dict[str, ComponentDefinition]: + """Return registry entries keyed by component@version.""" + return dict(_COMPONENT_REGISTRY) def feature_source_component(config: FeatureSourceConfig) -> tuple[dict[str, Any], list[float]]: diff --git a/src/eo_api/workflows/routes.py b/src/eo_api/workflows/routes.py index da9d271..6716c51 100644 --- a/src/eo_api/workflows/routes.py +++ b/src/eo_api/workflows/routes.py @@ -37,5 +37,6 @@ def run_dhis2_datavalue_set_workflow(payload: WorkflowRequest) -> WorkflowExecut return execute_workflow( request, workflow_id=payload.workflow_id, + request_params=payload.model_dump(), 
include_component_run_details=payload.include_component_run_details, ) diff --git a/src/eo_api/workflows/schemas.py b/src/eo_api/workflows/schemas.py index 7e48c5f..cfd0645 100644 --- a/src/eo_api/workflows/schemas.py +++ b/src/eo_api/workflows/schemas.py @@ -161,7 +161,6 @@ class WorkflowRequest(BaseModel): flavor: str | None = None country_code: str | None = None output_format: str | None = None - reducer: str | None = None include_component_run_details: bool = False @model_validator(mode="after") diff --git a/src/eo_api/workflows/services/definitions.py b/src/eo_api/workflows/services/definitions.py index 9876406..a05965c 100644 --- a/src/eo_api/workflows/services/definitions.py +++ b/src/eo_api/workflows/services/definitions.py @@ -3,10 +3,10 @@ from __future__ import annotations from pathlib import Path -from typing import Final, Literal +from typing import Any, Final, Literal import yaml -from pydantic import BaseModel, model_validator +from pydantic import BaseModel, Field, model_validator ComponentName = Literal[ "feature_source", @@ -17,6 +17,7 @@ ] SUPPORTED_COMPONENTS: Final[set[str]] = set(ComponentName.__args__) # type: ignore[attr-defined] +SUPPORTED_COMPONENT_VERSIONS: Final[dict[str, set[str]]] = {component: {"v1"} for component in SUPPORTED_COMPONENTS} COMPONENT_INPUTS: Final[dict[str, set[str]]] = { "feature_source": set(), @@ -43,6 +44,19 @@ class WorkflowStep(BaseModel): """One component step in a declarative workflow definition.""" component: ComponentName + version: str = "v1" + config: dict[str, Any] = Field(default_factory=dict) + + @model_validator(mode="after") + def validate_component_version(self) -> "WorkflowStep": + """Ensure component@version exists in the registered component catalog.""" + supported_versions = SUPPORTED_COMPONENT_VERSIONS.get(self.component, set()) + if self.version not in supported_versions: + known = ", ".join(sorted(supported_versions)) or "" + raise ValueError( + f"Unsupported component version 
'{self.component}@{self.version}'. Supported versions: {known}" + ) + return self class WorkflowDefinition(BaseModel): diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py index 9e7da9e..d8e1ad7 100644 --- a/src/eo_api/workflows/services/engine.py +++ b/src/eo_api/workflows/services/engine.py @@ -2,22 +2,44 @@ from __future__ import annotations +from collections.abc import Callable from typing import Any from fastapi import HTTPException +from pydantic import BaseModel, ConfigDict, ValidationError from ...components import services as component_services from ...data_registry.services.datasets import get_dataset -from ..schemas import WorkflowExecuteRequest, WorkflowExecuteResponse +from ..schemas import AggregationMethod, PeriodType, WorkflowExecuteRequest, WorkflowExecuteResponse from .definitions import WorkflowDefinition, load_workflow_definition from .run_logs import persist_run_log from .runtime import WorkflowRuntime +class WorkflowComponentError(RuntimeError): + """Typed component failure with stable error code and component context.""" + + def __init__( + self, + *, + error_code: str, + message: str, + component: str, + component_version: str, + status_code: int, + ) -> None: + super().__init__(message) + self.error_code = error_code + self.component = component + self.component_version = component_version + self.status_code = status_code + + def execute_workflow( request: WorkflowExecuteRequest, *, workflow_id: str = "dhis2_datavalue_set_v1", + request_params: dict[str, Any] | None = None, include_component_run_details: bool = False, ) -> WorkflowExecuteResponse: """Execute the feature->download->aggregate->DataValueSet workflow.""" @@ -38,6 +60,7 @@ def execute_workflow( workflow=workflow, runtime=runtime, request=request, + request_params=request_params, dataset=dataset, context=context, ) @@ -69,6 +92,29 @@ def execute_workflow( component_run_details_included=include_component_run_details, 
component_run_details_available=True, ) + except WorkflowComponentError as exc: + persist_run_log( + run_id=runtime.run_id, + request=request, + component_runs=runtime.component_runs, + status="failed", + error=str(exc), + error_code=exc.error_code, + failed_component=exc.component, + failed_component_version=exc.component_version, + ) + error = "upstream_unreachable" if exc.error_code == "UPSTREAM_UNREACHABLE" else "workflow_execution_failed" + raise HTTPException( + status_code=exc.status_code, + detail={ + "error": error, + "error_code": exc.error_code, + "message": str(exc), + "failed_component": exc.component, + "failed_component_version": exc.component_version, + "run_id": runtime.run_id, + }, + ) from exc except HTTPException: persist_run_log( run_id=runtime.run_id, @@ -85,24 +131,17 @@ def execute_workflow( component_runs=runtime.component_runs, status="failed", error=str(exc), + error_code="EXECUTION_FAILED", ) last_component = runtime.component_runs[-1].component if runtime.component_runs else "unknown" - if _is_upstream_connectivity_error(exc): - raise HTTPException( - status_code=503, - detail={ - "error": "upstream_unreachable", - "message": "Could not reach upstream data source. 
Check network/proxy and retry.", - "failed_component": last_component, - "run_id": runtime.run_id, - }, - ) from exc raise HTTPException( status_code=500, detail={ "error": "workflow_execution_failed", + "error_code": "EXECUTION_FAILED", "message": str(exc), "failed_component": last_component, + "failed_component_version": "unknown", "run_id": runtime.run_id, }, ) from exc @@ -127,77 +166,180 @@ def _execute_workflow_steps( workflow: WorkflowDefinition, runtime: WorkflowRuntime, request: WorkflowExecuteRequest, + request_params: dict[str, Any] | None, dataset: dict[str, Any], context: dict[str, Any], ) -> None: """Execute workflow components using declarative YAML step order.""" + executors: dict[str, StepExecutor] = { + "feature_source": _run_feature_source, + "download_dataset": _run_download_dataset, + "temporal_aggregation": _run_temporal_aggregation, + "spatial_aggregation": _run_spatial_aggregation, + "build_datavalueset": _run_build_datavalueset, + } + for step in workflow.steps: - if step.component == "feature_source": - features, bbox = runtime.run( - "feature_source", - component_services.feature_source_component, - config=request.feature_source, + executor = executors.get(step.component) + if executor is None: + raise WorkflowComponentError( + error_code="INPUT_VALIDATION_FAILED", + message=f"Unsupported workflow component '{step.component}'", + component=step.component, + component_version=step.version, + status_code=422, ) - context["features"] = features - context["bbox"] = bbox - continue - - if step.component == "download_dataset": - runtime.run( - "download_dataset", - component_services.download_dataset_component, - dataset=dataset, - start=request.start, - end=request.end, - overwrite=request.overwrite, - country_code=request.country_code, - bbox=_require_context(context, "bbox"), - ) - continue + try: + step_config = _resolve_step_config(step.config, request_params or {}) + _validate_step_config(step.component, step.version, step_config) + 
except ValueError as exc: + raise WorkflowComponentError( + error_code="CONFIG_VALIDATION_FAILED", + message=str(exc), + component=step.component, + component_version=step.version, + status_code=422, + ) from exc - if step.component == "temporal_aggregation": - temporal_ds = runtime.run( - "temporal_aggregation", - component_services.temporal_aggregation_component, + try: + updates = executor( + runtime=runtime, + request=request, dataset=dataset, - start=request.start, - end=request.end, - bbox=_require_context(context, "bbox"), - target_period_type=request.temporal_aggregation.target_period_type, - method=request.temporal_aggregation.method, + context=context, + step_config=step_config, ) - context["temporal_dataset"] = temporal_ds - continue + except Exception as exc: + if _is_upstream_connectivity_error(exc): + raise WorkflowComponentError( + error_code="UPSTREAM_UNREACHABLE", + message="Could not reach upstream data source. Check network/proxy and retry.", + component=step.component, + component_version=step.version, + status_code=503, + ) from exc + raise WorkflowComponentError( + error_code="EXECUTION_FAILED", + message=str(exc), + component=step.component, + component_version=step.version, + status_code=500, + ) from exc - if step.component == "spatial_aggregation": - records = runtime.run( - "spatial_aggregation", - component_services.spatial_aggregation_component, - dataset=dataset, - start=request.start, - end=request.end, - bbox=_require_context(context, "bbox"), - features=_require_context(context, "features"), - method=request.spatial_aggregation.method, - feature_id_property=request.dhis2.org_unit_property, - ) - context["records"] = records - continue - - if step.component == "build_datavalueset": - data_value_set, output_file = runtime.run( - "build_datavalueset", - component_services.build_datavalueset_component, - records=_require_context(context, "records"), - dataset_id=request.dataset_id, - 
period_type=request.temporal_aggregation.target_period_type, - dhis2=request.dhis2, - ) - context["data_value_set"] = data_value_set - context["output_file"] = output_file - continue + context.update(updates) + + +type StepExecutor = Callable[..., dict[str, Any]] - raise RuntimeError(f"Unsupported workflow component '{step.component}'") + +def _run_feature_source( + *, + runtime: WorkflowRuntime, + request: WorkflowExecuteRequest, + dataset: dict[str, Any], + context: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + del dataset, context, step_config + features, bbox = runtime.run( + "feature_source", + component_services.feature_source_component, + config=request.feature_source, + ) + return {"features": features, "bbox": bbox} + + +def _run_download_dataset( + *, + runtime: WorkflowRuntime, + request: WorkflowExecuteRequest, + dataset: dict[str, Any], + context: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + overwrite = bool(step_config.get("overwrite", request.overwrite)) + country_code = step_config.get("country_code", request.country_code) + runtime.run( + "download_dataset", + component_services.download_dataset_component, + dataset=dataset, + start=request.start, + end=request.end, + overwrite=overwrite, + country_code=country_code, + bbox=_require_context(context, "bbox"), + ) + return {} + + +def _run_temporal_aggregation( + *, + runtime: WorkflowRuntime, + request: WorkflowExecuteRequest, + dataset: dict[str, Any], + context: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + target_period_type = PeriodType( + str(step_config.get("target_period_type", request.temporal_aggregation.target_period_type)) + ) + method = AggregationMethod(str(step_config.get("method", request.temporal_aggregation.method))) + temporal_ds = runtime.run( + "temporal_aggregation", + component_services.temporal_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + 
bbox=_require_context(context, "bbox"), + target_period_type=target_period_type, + method=method, + ) + return {"temporal_dataset": temporal_ds} + + +def _run_spatial_aggregation( + *, + runtime: WorkflowRuntime, + request: WorkflowExecuteRequest, + dataset: dict[str, Any], + context: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + method = AggregationMethod(str(step_config.get("method", request.spatial_aggregation.method))) + feature_id_property = str(step_config.get("feature_id_property", request.dhis2.org_unit_property)) + records = runtime.run( + "spatial_aggregation", + component_services.spatial_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + bbox=_require_context(context, "bbox"), + features=_require_context(context, "features"), + method=method, + feature_id_property=feature_id_property, + ) + return {"records": records} + + +def _run_build_datavalueset( + *, + runtime: WorkflowRuntime, + request: WorkflowExecuteRequest, + dataset: dict[str, Any], + context: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + del dataset + period_type = PeriodType(str(step_config.get("period_type", request.temporal_aggregation.target_period_type))) + data_value_set, output_file = runtime.run( + "build_datavalueset", + component_services.build_datavalueset_component, + records=_require_context(context, "records"), + dataset_id=request.dataset_id, + period_type=period_type, + dhis2=request.dhis2, + ) + return {"data_value_set": data_value_set, "output_file": output_file} def _require_context(context: dict[str, Any], key: str) -> Any: @@ -205,3 +347,78 @@ def _require_context(context: dict[str, Any], key: str) -> Any: if key not in context: raise RuntimeError(f"Workflow definition missing prerequisite for '{key}'") return context[key] + + +def _resolve_step_config(config: dict[str, Any], request_params: dict[str, Any]) -> dict[str, Any]: + """Resolve $request. 
tokens in step config.""" + resolved: dict[str, Any] = {} + for key, value in config.items(): + resolved[key] = _resolve_value(value, request_params) + return resolved + + +def _resolve_value(value: Any, request_params: dict[str, Any]) -> Any: + """Resolve a config value recursively.""" + if isinstance(value, str) and value.startswith("$request."): + field = value.removeprefix("$request.") + if field not in request_params: + raise ValueError(f"Unknown request field in config token: {value}") + return request_params[field] + if isinstance(value, dict): + return {k: _resolve_value(v, request_params) for k, v in value.items()} + if isinstance(value, list): + return [_resolve_value(v, request_params) for v in value] + return value + + +class _FeatureSourceStepConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + +class _DownloadDatasetStepConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + overwrite: bool | None = None + country_code: str | None = None + + +class _TemporalAggregationStepConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + target_period_type: PeriodType | None = None + method: AggregationMethod | None = None + + +class _SpatialAggregationStepConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + method: AggregationMethod | None = None + feature_id_property: str | None = None + + +class _BuildDataValueSetStepConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + period_type: PeriodType | None = None + + +_STEP_CONFIG_MODELS: dict[str, type[BaseModel]] = { + "feature_source": _FeatureSourceStepConfig, + "download_dataset": _DownloadDatasetStepConfig, + "temporal_aggregation": _TemporalAggregationStepConfig, + "spatial_aggregation": _SpatialAggregationStepConfig, + "build_datavalueset": _BuildDataValueSetStepConfig, +} + + +def _validate_step_config(component: str, version: str, config: dict[str, Any]) -> None: + """Validate step config with strict Pydantic models.""" + if version != "v1": + 
raise ValueError(f"Unsupported component version for config validation: {component}@{version}") + model = _STEP_CONFIG_MODELS.get(component) + if model is None: + raise ValueError(f"No config schema registered for component '{component}'") + try: + model.model_validate(config) + except ValidationError as exc: + raise ValueError(f"Invalid config for component '{component}@{version}': {exc}") from exc diff --git a/src/eo_api/workflows/services/run_logs.py b/src/eo_api/workflows/services/run_logs.py index ea4d375..aa9fef2 100644 --- a/src/eo_api/workflows/services/run_logs.py +++ b/src/eo_api/workflows/services/run_logs.py @@ -18,6 +18,9 @@ def persist_run_log( status: str, output_file: str | None = None, error: str | None = None, + error_code: str | None = None, + failed_component: str | None = None, + failed_component_version: str | None = None, ) -> str: """Write workflow run metadata to disk and return file path.""" logs_dir = DOWNLOAD_DIR / "workflow_runs" @@ -32,6 +35,9 @@ def persist_run_log( "component_runs": [run.model_dump(mode="json") for run in component_runs], "output_file": output_file, "error": error, + "error_code": error_code, + "failed_component": failed_component, + "failed_component_version": failed_component_version, } path.write_text(json.dumps(payload, indent=2), encoding="utf-8") return str(path) diff --git a/src/eo_api/workflows/services/simple_mapper.py b/src/eo_api/workflows/services/simple_mapper.py index 74b8758..e8c016f 100644 --- a/src/eo_api/workflows/services/simple_mapper.py +++ b/src/eo_api/workflows/services/simple_mapper.py @@ -6,7 +6,6 @@ from ...data_registry.services.datasets import get_dataset from ..schemas import ( - AggregationMethod, Dhis2DataValueSetConfig, FeatureSourceConfig, FeatureSourceType, @@ -63,10 +62,6 @@ def normalize_simple_request(payload: WorkflowRequest) -> tuple[WorkflowExecuteR else: raise HTTPException(status_code=422, detail="Provide org_unit_level or org_unit_ids") - reducer_alias = 
AggregationMethod(inputs.reducer.lower()) if inputs.reducer else None - spatial_method = reducer_alias or inputs.spatial_reducer - temporal_method = reducer_alias or inputs.temporal_reducer - normalized = WorkflowExecuteRequest( dataset_id=dataset_id, start=start, @@ -76,9 +71,9 @@ def normalize_simple_request(payload: WorkflowRequest) -> tuple[WorkflowExecuteR feature_source=feature_source, temporal_aggregation=TemporalAggregationConfig( target_period_type=inputs.temporal_resolution, - method=temporal_method, + method=inputs.temporal_reducer, ), - spatial_aggregation=SpatialAggregationConfig(method=spatial_method), + spatial_aggregation=SpatialAggregationConfig(method=inputs.spatial_reducer), dhis2=Dhis2DataValueSetConfig(data_element_uid=inputs.data_element), ) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index da16352..4056629 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import Any, cast import pytest import xarray as xr @@ -82,6 +82,12 @@ def test_components_catalog_endpoint_returns_five_components(client: TestClient) "spatial_aggregation", "build_datavalueset", } + for item in items: + assert item["version"] == "v1" + assert isinstance(item["input_schema"], dict) + assert isinstance(item["config_schema"], dict) + assert isinstance(item["output_schema"], dict) + assert "EXECUTION_FAILED" in item["error_codes"] def test_workflow_endpoint_returns_response_shape(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: @@ -110,9 +116,19 @@ def test_workflow_endpoint_returns_response_shape(client: TestClient, monkeypatc }, component_runs=[], ) + + def _execute_stub( + payload: Any, + workflow_id: str = "dhis2_datavalue_set_v1", + request_params: dict[str, Any] | None = None, + include_component_run_details: bool = False, + ) -> WorkflowExecuteResponse: + del payload, workflow_id, request_params, include_component_run_details + return 
stub + monkeypatch.setattr( "eo_api.workflows.routes.execute_workflow", - lambda payload, workflow_id="dhis2_datavalue_set_v1", include_component_run_details=False: stub, + _execute_stub, ) response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) @@ -162,10 +178,20 @@ def test_workflow_endpoint_accepts_simplified_payload(client: TestClient, monkey data_value_set={"dataValues": []}, component_runs=[], ) + + def _execute_stub( + payload: Any, + workflow_id: str = "dhis2_datavalue_set_v1", + request_params: dict[str, Any] | None = None, + include_component_run_details: bool = False, + ) -> WorkflowExecuteResponse: + del payload, workflow_id, request_params, include_component_run_details + return stub + monkeypatch.setattr("eo_api.workflows.routes.normalize_simple_request", lambda payload: (normalized, [])) monkeypatch.setattr( "eo_api.workflows.routes.execute_workflow", - lambda payload, workflow_id="dhis2_datavalue_set_v1", include_component_run_details=False: stub, + _execute_stub, ) response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) @@ -328,6 +354,11 @@ def test_engine_returns_503_when_upstream_unreachable(monkeypatch: pytest.Monkey engine.execute_workflow(request) assert exc_info.value.status_code == 503 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error"] == "upstream_unreachable" + assert detail["error_code"] == "UPSTREAM_UNREACHABLE" + assert detail["failed_component"] == "download_dataset" + assert detail["failed_component_version"] == "v1" def test_mapper_uses_year_format_for_yearly_dataset() -> None: @@ -348,25 +379,6 @@ def test_mapper_uses_year_format_for_yearly_dataset() -> None: assert normalized.end == "2026" -def test_mapper_reducer_alias_overrides_spatial_and_temporal_reducers() -> None: - normalized, _warnings = normalize_simple_request( - WorkflowRequest.model_validate( - { - "dataset_id": "worldpop_population_yearly", - "country_code": "SLE", - 
"start_year": 2015, - "end_year": 2026, - "org_unit_level": 2, - "data_element": "DE_UID", - "temporal_resolution": "yearly", - "reducer": "sum", - } - ) - ) - assert normalized.spatial_aggregation.method.value == "sum" - assert normalized.temporal_aggregation.method.value == "sum" - - def test_mapper_uses_month_format_for_chirps_date_window() -> None: normalized, _warnings = normalize_simple_request( WorkflowRequest.model_validate( @@ -488,3 +500,148 @@ def test_engine_rejects_unknown_workflow_id(monkeypatch: pytest.MonkeyPatch) -> engine.execute_workflow(request, workflow_id="not_allowlisted") assert exc_info.value.status_code == 422 + + +def test_engine_resolves_step_config_from_request_params(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 2, + "steps": [ + {"component": "feature_source"}, + {"component": "download_dataset"}, + { + "component": "temporal_aggregation", + "config": { + "method": "$request.temporal_reducer", + "target_period_type": "$request.temporal_resolution", + }, + }, + {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + 
"feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + + def _temporal_component(**kwargs: Any) -> xr.Dataset: + assert kwargs["method"].value == "max" + assert kwargs["target_period_type"].value == "monthly" + return ds + + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", _temporal_component) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow( + request, + request_params={"temporal_reducer": "max", "temporal_resolution": "monthly"}, + ) + assert response.status == "completed" + + +def test_engine_rejects_invalid_step_config(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 2, + "steps": [ + {"component": "feature_source"}, + {"component": "download_dataset"}, + {"component": "temporal_aggregation", "config": {"invalid_key": 1}}, + {"component": 
"spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + persisted: dict[str, Any] = {} + + def _persist_run_log(**kwargs: Any) -> str: + persisted.update(kwargs) + return "/tmp/data/workflow_runs/run.json" + + monkeypatch.setattr(engine, "persist_run_log", _persist_run_log) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request) + + assert exc_info.value.status_code == 422 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error"] == "workflow_execution_failed" + assert detail["error_code"] == "CONFIG_VALIDATION_FAILED" + assert detail["failed_component"] == "temporal_aggregation" + assert detail["failed_component_version"] == "v1" + assert persisted["error_code"] == "CONFIG_VALIDATION_FAILED" + assert persisted["failed_component"] == "temporal_aggregation" + assert persisted["failed_component_version"] == "v1" From 6965a84ab0152d7b4119ec77f98743db2ae67b58 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Thu, 12 Mar 2026 13:36:46 +0100 Subject: [PATCH 05/15] feat: finalize workflow assembly validation and mixed local/remote orchestration --- docs/workflow-orchestration.md | 172 +++++-- src/eo_api/components/routes.py | 31 +- src/eo_api/components/schemas.py | 12 +- src/eo_api/components/services.py | 58 ++- src/eo_api/workflows/routes.py | 87 +++- src/eo_api/workflows/schemas.py | 54 +++ src/eo_api/workflows/services/engine.py | 494 +++++++++++++++++--- src/eo_api/workflows/services/spatial.py | 9 +- tests/test_workflows.py | 550 
++++++++++++++++++++++- 9 files changed, 1312 insertions(+), 155 deletions(-) diff --git a/docs/workflow-orchestration.md b/docs/workflow-orchestration.md index 97d26a1..279f0c8 100644 --- a/docs/workflow-orchestration.md +++ b/docs/workflow-orchestration.md @@ -20,7 +20,7 @@ The current implementation provides: 1. One canonical workflow execution endpoint: - `POST /workflows/dhis2-datavalue-set` -2. One public flat request payload contract (`WorkflowRequest`). +2. One public wrapped request payload contract (`{"request": WorkflowRequest}`). 3. Internal normalization into a canonical execution model (`WorkflowExecuteRequest`). 4. A fixed generic orchestration chain with exactly 5 components: - `feature_source` @@ -45,6 +45,8 @@ The current implementation provides: ### Primary Workflow Endpoint - `POST /workflows/dhis2-datavalue-set` +- `POST /workflows/execute` (inline assembly execution: post `workflow.steps` + `request` payload) +- `POST /workflows/validate` (validate discovered/inline workflow + request compatibility without execution) ### Workflow Discovery Endpoint @@ -52,7 +54,8 @@ The current implementation provides: ### Component Discovery/Execution Endpoints -- `GET /components` +- `GET /components` (public catalog; hides internal orchestration-only config schema) +- `GET /components?include_internal=true` (internal/debug catalog including component config schema) - `POST /components/feature-source` - `POST /components/download-dataset` - `POST /components/temporal-aggregation` @@ -65,20 +68,22 @@ The current implementation provides: ## Public Workflow Request Contract -The workflow endpoint accepts one flat payload shape: +The workflow endpoint accepts one wrapped payload shape: ```json { - "workflow_id": "dhis2_datavalue_set_v1", - "dataset_id": "chirps3_precipitation_daily", - "start_date": "2024-01-01", - "end_date": "2024-05-31", - "org_unit_level": 2, - "data_element": "DE_UID", - "temporal_resolution": "monthly", - "temporal_reducer": "sum", - 
"spatial_reducer": "mean", - "include_component_run_details": false + "request": { + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-05-31", + "org_unit_level": 2, + "data_element": "DE_UID", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "include_component_run_details": false + } } ``` @@ -106,7 +111,7 @@ Notes: File: `src/eo_api/workflows/services/simple_mapper.py` -Public flat payload is normalized to internal `WorkflowExecuteRequest` with component-ready nested configs: +Public wrapped payload (`request`) is normalized to internal `WorkflowExecuteRequest` with component-ready nested configs: 1. `feature_source` config: - `org_unit_level` -> `source_type=dhis2_level` @@ -251,6 +256,50 @@ Workflow step schema now supports: 2. `version` (default `v1`) 3. `config` (default `{}`) +### Remote Component Execution + +All five components support either local (default) or remote API execution. + +Common step config options: + +1. `execution_mode`: `local` or `remote` (default `local`) +2. `remote_url`: required when `execution_mode=remote` (expects component-compatible POST endpoint) +3. `remote_timeout_sec`: request timeout (default `30`) +4. `remote_retries`: number of attempts (default `1`) +5. `remote_retry_delay_sec`: delay between attempts in seconds (default `1`) +6. 
Component-specific options remain available (for example `overwrite`, `country_code`, `method`, `period_type`) + +Example: + +```yaml +steps: + - component: feature_source + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/feature-source" + - component: download_dataset + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/download-dataset" + - component: temporal_aggregation + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/temporal-aggregation" + - component: spatial_aggregation + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/spatial-aggregation" + - component: build_datavalueset + version: v1 + config: + execution_mode: remote + remote_url: "http://component-host/components/build-datavalue-set" +``` + The default YAML remains the same 5-step sequence, but the engine reads it declaratively and dispatches components through a registry map. 
--- @@ -446,16 +495,18 @@ curl -s http://127.0.0.1:8000/workflows | jq curl -s -X POST "http://127.0.0.1:8000/workflows/dhis2-datavalue-set" \ -H "Content-Type: application/json" \ -d '{ - "workflow_id": "dhis2_datavalue_set_v1", - "dataset_id": "chirps3_precipitation_daily", - "start_date": "2024-01-01", - "end_date": "2024-02-29", - "org_unit_level": 2, - "data_element": "DE_UID", - "temporal_resolution": "monthly", - "temporal_reducer": "sum", - "spatial_reducer": "mean", - "include_component_run_details": true + "request": { + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-02-29", + "org_unit_level": 2, + "data_element": "DE_UID", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "include_component_run_details": true + } }' | jq ``` @@ -473,14 +524,16 @@ Expected component order: curl -s -X POST "http://127.0.0.1:8000/workflows/dhis2-datavalue-set" \ -H "Content-Type: application/json" \ -d '{ - "workflow_id": "dhis2_datavalue_set_without_temporal_aggregation_v1", - "dataset_id": "chirps3_precipitation_daily", - "start_date": "2024-01-01", - "end_date": "2024-02-29", - "org_unit_level": 2, - "data_element": "DE_UID", - "spatial_reducer": "mean", - "include_component_run_details": true + "request": { + "workflow_id": "dhis2_datavalue_set_without_temporal_aggregation_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-02-29", + "org_unit_level": 2, + "data_element": "DE_UID", + "spatial_reducer": "mean", + "include_component_run_details": true + } }' | jq ``` @@ -497,19 +550,60 @@ Expected component order: curl -s -X POST "http://127.0.0.1:8000/workflows/dhis2-datavalue-set" \ -H "Content-Type: application/json" \ -d '{ - "workflow_id": "does_not_exist", - "dataset_id": "chirps3_precipitation_daily", - "start_date": "2024-01-01", - "end_date": "2024-01-31", - "org_unit_level": 2, 
- "data_element": "DE_UID" + "request": { + "workflow_id": "does_not_exist", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 2, + "data_element": "DE_UID" + } }' | jq ``` Expected result: `422` with allowed/discovered `workflow_id` values in error detail. +6. Validate inline assembly (no execution): + +```bash +curl -s -X POST "http://127.0.0.1:8000/workflows/validate" \ + -H "Content-Type: application/json" \ + -d '{ + "workflow": { + "workflow_id": "adhoc_validate_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1", "config": {}}, + {"component": "download_dataset", "version": "v1", "config": {}}, + {"component": "spatial_aggregation", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}} + ] + }, + "request": { + "workflow_id": "adhoc_validate_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 2, + "data_element": "DE_UID" + } + }' | jq +``` + +Expected result: `200` with `valid: true`, resolved step configs, and no execution side effects. + --- ## Next Technical Step -Add a workflow governance model for multi-user environments: workflow metadata (owner/status), promotion states (draft/staging/prod), and optional signature/checksum validation before a discovered YAML can execute. +Prioritize orchestration-tool readiness (Prefect/Airflow wrappers over the current workflow service) before any OGC-first migration. + +Rationale: + +1. Delivers immediate operational value (scheduling, retries, long-running reliability) with minimal API churn. +2. Reuses existing componentization, dispatcher, and run metadata. +3. Avoids a high-risk architecture pivot while the current workflow contract is stabilizing. 
+ +For detailed option synthesis and implementation scope, see: + +- `docs/internal/roadmap_v2.md` (Post-V2 Decision Synthesis) diff --git a/src/eo_api/components/routes.py b/src/eo_api/components/routes.py index 73519c2..3754535 100644 --- a/src/eo_api/components/routes.py +++ b/src/eo_api/components/routes.py @@ -2,7 +2,10 @@ from __future__ import annotations -from fastapi import APIRouter +from typing import Any + +import numpy as np +from fastapi import APIRouter, Query from ..data_manager.services.constants import BBOX from . import services @@ -23,10 +26,24 @@ router = APIRouter() -@router.get("/components", response_model=ComponentCatalogResponse) -def list_components() -> ComponentCatalogResponse: +def _to_jsonable_scalar(value: Any) -> Any: + """Convert numpy scalars/datetimes to JSON-safe native values.""" + if isinstance(value, np.datetime64): + return np.datetime_as_string(value, unit="s") + if isinstance(value, np.generic): + return value.item() + return value + + +def _json_safe_records(records: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Ensure record rows are JSON-serializable.""" + return [{key: _to_jsonable_scalar(value) for key, value in record.items()} for record in records] + + +@router.get("/components", response_model=ComponentCatalogResponse, response_model_exclude_none=True) +def list_components(include_internal: bool = Query(default=False)) -> ComponentCatalogResponse: """List all discoverable reusable components.""" - return ComponentCatalogResponse(components=services.component_catalog()) + return ComponentCatalogResponse(components=services.component_catalog(include_internal=include_internal)) @router.post("/components/feature-source", response_model=FeatureSourceRunResponse) @@ -94,10 +111,12 @@ def run_spatial_aggregation(payload: SpatialAggregationRunRequest) -> SpatialAgg method=payload.method, feature_id_property=payload.feature_id_property, ) + json_records = _json_safe_records(records) return 
SpatialAggregationRunResponse( dataset_id=payload.dataset_id, - record_count=len(records), - preview=records[: payload.max_preview_rows], + record_count=len(json_records), + preview=json_records[: payload.max_preview_rows], + records=json_records if payload.include_records else None, ) diff --git a/src/eo_api/components/schemas.py b/src/eo_api/components/schemas.py index a0920dc..1c2d684 100644 --- a/src/eo_api/components/schemas.py +++ b/src/eo_api/components/schemas.py @@ -14,6 +14,13 @@ ) +class ComponentEndpoint(BaseModel): + """HTTP endpoint metadata for a component.""" + + path: str + method: str + + class ComponentDefinition(BaseModel): """Component metadata for discovery.""" @@ -23,9 +30,10 @@ class ComponentDefinition(BaseModel): inputs: list[str] outputs: list[str] input_schema: dict[str, Any] = Field(default_factory=dict) - config_schema: dict[str, Any] = Field(default_factory=dict) + config_schema: dict[str, Any] | None = None output_schema: dict[str, Any] = Field(default_factory=dict) error_codes: list[str] = Field(default_factory=list) + endpoint: ComponentEndpoint class ComponentCatalogResponse(BaseModel): @@ -98,6 +106,7 @@ class SpatialAggregationRunRequest(BaseModel): method: AggregationMethod = AggregationMethod.MEAN bbox: list[float] | None = None feature_id_property: str = "id" + include_records: bool = False max_preview_rows: int = 20 @@ -107,6 +116,7 @@ class SpatialAggregationRunResponse(BaseModel): dataset_id: str record_count: int preview: list[dict[str, Any]] + records: list[dict[str, Any]] | None = None class BuildDataValueSetRunRequest(BaseModel): diff --git a/src/eo_api/components/services.py b/src/eo_api/components/services.py index 8bd0e1d..a546051 100644 --- a/src/eo_api/components/services.py +++ b/src/eo_api/components/services.py @@ -21,7 +21,7 @@ from ..workflows.services.preflight import check_upstream_connectivity from ..workflows.services.spatial import aggregate_to_features from ..workflows.services.temporal import 
aggregate_temporal -from .schemas import ComponentDefinition +from .schemas import ComponentDefinition, ComponentEndpoint _ERROR_CODES_V1: Final[list[str]] = [ "INPUT_VALIDATION_FAILED", @@ -43,7 +43,17 @@ "properties": {"feature_source": {"type": "object"}}, "required": ["feature_source"], }, - config_schema={"type": "object", "properties": {}, "additionalProperties": False}, + config_schema={ + "type": "object", + "properties": { + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + "remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, + }, + "additionalProperties": False, + }, output_schema={ "type": "object", "properties": { @@ -53,6 +63,7 @@ "required": ["features", "bbox"], }, error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/feature-source", method="POST"), ), "download_dataset@v1": ComponentDefinition( name="download_dataset", @@ -75,13 +86,17 @@ config_schema={ "type": "object", "properties": { - "overwrite": {"type": "boolean"}, - "country_code": {"type": ["string", "null"]}, + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + "remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, }, "additionalProperties": False, }, output_schema={"type": "object", "properties": {"status": {"type": "string"}}}, error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/download-dataset", method="POST"), ), "temporal_aggregation@v1": ComponentDefinition( name="temporal_aggregation", @@ -104,13 +119,17 @@ config_schema={ "type": "object", "properties": { - "target_period_type": {"type": "string"}, - "method": {"type": "string"}, + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + 
"remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, }, "additionalProperties": False, }, output_schema={"type": "object", "properties": {"dataset": {"type": "object"}}}, error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/temporal-aggregation", method="POST"), ), "spatial_aggregation@v1": ComponentDefinition( name="spatial_aggregation", @@ -132,13 +151,17 @@ config_schema={ "type": "object", "properties": { - "method": {"type": "string"}, - "feature_id_property": {"type": "string"}, + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + "remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, }, "additionalProperties": False, }, output_schema={"type": "object", "properties": {"records": {"type": "array"}}}, error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/spatial-aggregation", method="POST"), ), "build_datavalueset@v1": ComponentDefinition( name="build_datavalueset", @@ -159,7 +182,11 @@ config_schema={ "type": "object", "properties": { - "period_type": {"type": "string"}, + "execution_mode": {"type": "string", "enum": ["local", "remote"]}, + "remote_url": {"type": ["string", "null"]}, + "remote_timeout_sec": {"type": "number"}, + "remote_retries": {"type": "integer"}, + "remote_retry_delay_sec": {"type": "number"}, }, "additionalProperties": False, }, @@ -169,13 +196,20 @@ "required": ["data_value_set", "output_file"], }, error_codes=_ERROR_CODES_V1, + endpoint=ComponentEndpoint(path="/components/build-datavalue-set", method="POST"), ), } -def component_catalog() -> list[ComponentDefinition]: - """Return all discoverable component definitions.""" - return list(_COMPONENT_REGISTRY.values()) +def component_catalog(*, include_internal: bool = False) -> list[ComponentDefinition]: + """Return discoverable 
component definitions. + + By default, internal orchestration-only metadata (config_schema) is hidden. + """ + components = list(_COMPONENT_REGISTRY.values()) + if include_internal: + return components + return [component.model_copy(update={"config_schema": None}) for component in components] def component_registry() -> dict[str, ComponentDefinition]: diff --git a/src/eo_api/workflows/routes.py b/src/eo_api/workflows/routes.py index 6716c51..8650f54 100644 --- a/src/eo_api/workflows/routes.py +++ b/src/eo_api/workflows/routes.py @@ -2,9 +2,18 @@ from fastapi import APIRouter, HTTPException -from .schemas import WorkflowCatalogItem, WorkflowCatalogResponse, WorkflowExecuteResponse, WorkflowRequest -from .services.definitions import list_workflow_definitions -from .services.engine import execute_workflow +from .schemas import ( + WorkflowAssemblyExecuteRequest, + WorkflowCatalogItem, + WorkflowCatalogResponse, + WorkflowExecuteEnvelopeRequest, + WorkflowExecuteResponse, + WorkflowValidateRequest, + WorkflowValidateResponse, + WorkflowValidateStep, +) +from .services.definitions import list_workflow_definitions, load_workflow_definition +from .services.engine import execute_workflow, validate_workflow_steps from .services.simple_mapper import normalize_simple_request router = APIRouter() @@ -31,12 +40,74 @@ def list_workflows() -> WorkflowCatalogResponse: @router.post("/dhis2-datavalue-set", response_model=WorkflowExecuteResponse) -def run_dhis2_datavalue_set_workflow(payload: WorkflowRequest) -> WorkflowExecuteResponse: +def run_dhis2_datavalue_set_workflow(payload: WorkflowExecuteEnvelopeRequest) -> WorkflowExecuteResponse: """Run workflow from a single flat request payload.""" - request, _warnings = normalize_simple_request(payload) + request, _warnings = normalize_simple_request(payload.request) return execute_workflow( request, - workflow_id=payload.workflow_id, - request_params=payload.model_dump(), - 
include_component_run_details=payload.include_component_run_details, + workflow_id=payload.request.workflow_id, + request_params=payload.request.model_dump(), + include_component_run_details=payload.request.include_component_run_details, + ) + + +@router.post("/execute", response_model=WorkflowExecuteResponse) +def run_inline_assembled_workflow(payload: WorkflowAssemblyExecuteRequest) -> WorkflowExecuteResponse: + """Run an inline assembled workflow definition from one flat request payload.""" + request, _warnings = normalize_simple_request(payload.request) + return execute_workflow( + request, + workflow_id=payload.workflow.workflow_id, + workflow_definition=payload.workflow, + request_params=payload.request.model_dump(exclude_none=True), + include_component_run_details=payload.request.include_component_run_details, + ) + + +@router.post("/validate", response_model=WorkflowValidateResponse) +def validate_workflow_assembly(payload: WorkflowValidateRequest) -> WorkflowValidateResponse: + """Validate workflow assembly without executing any component.""" + warnings: list[str] = [] + errors: list[str] = [] + + try: + if payload.workflow is not None: + workflow = payload.workflow + else: + workflow = load_workflow_definition(payload.workflow_id or "") + except ValueError as exc: + return WorkflowValidateResponse( + valid=False, + workflow_id=payload.workflow_id or "unknown", + workflow_version=0, + step_count=0, + components=[], + warnings=warnings, + errors=[str(exc)], + ) + + request_params: dict[str, object] = {} + if payload.request is not None: + _request, map_warnings = normalize_simple_request(payload.request) + warnings.extend(map_warnings) + request_params = payload.request.model_dump(exclude_none=True) + + try: + resolved_steps = [ + WorkflowValidateStep.model_validate(step) + for step in validate_workflow_steps(workflow=workflow, request_params=request_params) + ] + except ValueError as exc: + errors.append(str(exc)) + resolved_steps = [] + + return 
WorkflowValidateResponse( + valid=not errors, + workflow_id=workflow.workflow_id, + workflow_version=workflow.version, + step_count=len(workflow.steps), + components=[step.component for step in workflow.steps], + resolved_steps=resolved_steps, + warnings=warnings, + errors=errors, ) diff --git a/src/eo_api/workflows/schemas.py b/src/eo_api/workflows/schemas.py index cfd0645..09072dc 100644 --- a/src/eo_api/workflows/schemas.py +++ b/src/eo_api/workflows/schemas.py @@ -5,6 +5,8 @@ from pydantic import BaseModel, Field, model_validator +from .services.definitions import WorkflowDefinition + class FeatureSourceType(StrEnum): """Supported feature source backends.""" @@ -173,3 +175,55 @@ def validate_time_window(self) -> "WorkflowRequest": if self.org_unit_level is None and not self.org_unit_ids: raise ValueError("Provide org_unit_level or org_unit_ids") return self + + +class WorkflowExecuteEnvelopeRequest(BaseModel): + """Envelope for workflow execution input payload.""" + + request: WorkflowRequest + + +class WorkflowAssemblyExecuteRequest(BaseModel): + """Inline workflow assembly + wrapped public workflow input.""" + + request: WorkflowRequest + workflow: WorkflowDefinition + + +class WorkflowValidateRequest(BaseModel): + """Validation request for discovered or inline workflow assembly.""" + + workflow_id: str | None = None + workflow: WorkflowDefinition | None = None + request: WorkflowRequest | None = None + + @model_validator(mode="after") + def validate_workflow_source(self) -> "WorkflowValidateRequest": + """Require exactly one workflow source.""" + if (self.workflow_id is None and self.workflow is None) or ( + self.workflow_id is not None and self.workflow is not None + ): + raise ValueError("Provide exactly one of workflow_id or workflow") + return self + + +class WorkflowValidateStep(BaseModel): + """Resolved workflow step metadata from validation.""" + + index: int + component: str + version: str + resolved_config: dict[str, Any] + + +class 
WorkflowValidateResponse(BaseModel): + """Validation result for a workflow assembly.""" + + valid: bool + workflow_id: str + workflow_version: int + step_count: int + components: list[str] + resolved_steps: list[WorkflowValidateStep] = Field(default_factory=list) + warnings: list[str] = Field(default_factory=list) + errors: list[str] = Field(default_factory=list) diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py index d8e1ad7..5f91716 100644 --- a/src/eo_api/workflows/services/engine.py +++ b/src/eo_api/workflows/services/engine.py @@ -2,15 +2,17 @@ from __future__ import annotations +import time from collections.abc import Callable from typing import Any +import httpx from fastapi import HTTPException from pydantic import BaseModel, ConfigDict, ValidationError from ...components import services as component_services from ...data_registry.services.datasets import get_dataset -from ..schemas import AggregationMethod, PeriodType, WorkflowExecuteRequest, WorkflowExecuteResponse +from ..schemas import WorkflowExecuteRequest, WorkflowExecuteResponse from .definitions import WorkflowDefinition, load_workflow_definition from .run_logs import persist_run_log from .runtime import WorkflowRuntime @@ -39,6 +41,7 @@ def execute_workflow( request: WorkflowExecuteRequest, *, workflow_id: str = "dhis2_datavalue_set_v1", + workflow_definition: WorkflowDefinition | None = None, request_params: dict[str, Any] | None = None, include_component_run_details: bool = False, ) -> WorkflowExecuteResponse: @@ -52,10 +55,13 @@ def execute_workflow( context: dict[str, Any] = {} try: - try: - workflow = load_workflow_definition(workflow_id) - except ValueError as exc: - raise HTTPException(status_code=422, detail=str(exc)) from exc + if workflow_definition is not None: + workflow = workflow_definition + else: + try: + workflow = load_workflow_definition(workflow_id) + except ValueError as exc: + raise HTTPException(status_code=422, detail=str(exc)) 
from exc _execute_workflow_steps( workflow=workflow, runtime=runtime, @@ -229,6 +235,31 @@ def _execute_workflow_steps( context.update(updates) +def validate_workflow_steps( + *, + workflow: WorkflowDefinition, + request_params: dict[str, Any] | None = None, +) -> list[dict[str, Any]]: + """Resolve and validate step configs without executing components.""" + resolved_steps: list[dict[str, Any]] = [] + params = request_params or {} + for index, step in enumerate(workflow.steps): + try: + resolved_config = _resolve_step_config(step.config, params) + _validate_step_config(step.component, step.version, resolved_config) + except ValueError as exc: + raise ValueError(f"Step {index + 1} ({step.component}@{step.version}) validation failed: {exc}") from exc + resolved_steps.append( + { + "index": index + 1, + "component": step.component, + "version": step.version, + "resolved_config": resolved_config, + } + ) + return resolved_steps + + type StepExecutor = Callable[..., dict[str, Any]] @@ -240,12 +271,24 @@ def _run_feature_source( context: dict[str, Any], step_config: dict[str, Any], ) -> dict[str, Any]: - del dataset, context, step_config - features, bbox = runtime.run( - "feature_source", - component_services.feature_source_component, - config=request.feature_source, - ) + del dataset, context + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode == "remote": + features, bbox = runtime.run( + "feature_source", + _invoke_remote_feature_source_component, + remote_url=str(step_config["remote_url"]), + feature_source=request.feature_source.model_dump(mode="json"), + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + else: + features, bbox = runtime.run( + "feature_source", + component_services.feature_source_component, + config=request.feature_source, + ) return {"features": features, "bbox": 
bbox} @@ -257,18 +300,45 @@ def _run_download_dataset( context: dict[str, Any], step_config: dict[str, Any], ) -> dict[str, Any]: - overwrite = bool(step_config.get("overwrite", request.overwrite)) - country_code = step_config.get("country_code", request.country_code) - runtime.run( - "download_dataset", - component_services.download_dataset_component, - dataset=dataset, - start=request.start, - end=request.end, - overwrite=overwrite, - country_code=country_code, - bbox=_require_context(context, "bbox"), - ) + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode not in {"local", "remote"}: + raise ValueError("download_dataset.execution_mode must be 'local' or 'remote'") + + overwrite = request.overwrite + country_code = request.country_code + bbox = _require_context(context, "bbox") + if execution_mode == "remote": + remote_url = step_config.get("remote_url") + if not isinstance(remote_url, str) or not remote_url: + raise ValueError("download_dataset remote mode requires non-empty 'remote_url'") + remote_timeout = float(step_config.get("remote_timeout_sec", 30.0)) + remote_retries = int(step_config.get("remote_retries", 1)) + remote_retry_delay_sec = float(step_config.get("remote_retry_delay_sec", 1.0)) + runtime.run( + "download_dataset", + _invoke_remote_download_component, + remote_url=remote_url, + dataset_id=request.dataset_id, + start=request.start, + end=request.end, + overwrite=overwrite, + country_code=country_code, + bbox=bbox, + timeout_sec=remote_timeout, + retries=remote_retries, + retry_delay_sec=remote_retry_delay_sec, + ) + else: + runtime.run( + "download_dataset", + component_services.download_dataset_component, + dataset=dataset, + start=request.start, + end=request.end, + overwrite=overwrite, + country_code=country_code, + bbox=bbox, + ) return {} @@ -280,20 +350,35 @@ def _run_temporal_aggregation( context: dict[str, Any], step_config: dict[str, Any], ) -> dict[str, Any]: - target_period_type = 
PeriodType( - str(step_config.get("target_period_type", request.temporal_aggregation.target_period_type)) - ) - method = AggregationMethod(str(step_config.get("method", request.temporal_aggregation.method))) - temporal_ds = runtime.run( - "temporal_aggregation", - component_services.temporal_aggregation_component, - dataset=dataset, - start=request.start, - end=request.end, - bbox=_require_context(context, "bbox"), - target_period_type=target_period_type, - method=method, - ) + target_period_type = request.temporal_aggregation.target_period_type + method = request.temporal_aggregation.method + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode == "remote": + temporal_ds = runtime.run( + "temporal_aggregation", + _invoke_remote_temporal_aggregation_component, + remote_url=str(step_config["remote_url"]), + dataset_id=request.dataset_id, + start=request.start, + end=request.end, + bbox=_require_context(context, "bbox"), + target_period_type=target_period_type.value, + method=method.value, + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + else: + temporal_ds = runtime.run( + "temporal_aggregation", + component_services.temporal_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + bbox=_require_context(context, "bbox"), + target_period_type=target_period_type, + method=method, + ) return {"temporal_dataset": temporal_ds} @@ -305,19 +390,37 @@ def _run_spatial_aggregation( context: dict[str, Any], step_config: dict[str, Any], ) -> dict[str, Any]: - method = AggregationMethod(str(step_config.get("method", request.spatial_aggregation.method))) - feature_id_property = str(step_config.get("feature_id_property", request.dhis2.org_unit_property)) - records = runtime.run( - "spatial_aggregation", - component_services.spatial_aggregation_component, - 
dataset=dataset, - start=request.start, - end=request.end, - bbox=_require_context(context, "bbox"), - features=_require_context(context, "features"), - method=method, - feature_id_property=feature_id_property, - ) + method = request.spatial_aggregation.method + feature_id_property = request.dhis2.org_unit_property + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode == "remote": + records = runtime.run( + "spatial_aggregation", + _invoke_remote_spatial_aggregation_component, + remote_url=str(step_config["remote_url"]), + dataset_id=request.dataset_id, + start=request.start, + end=request.end, + bbox=_require_context(context, "bbox"), + feature_source=request.feature_source.model_dump(mode="json"), + method=method.value, + feature_id_property=feature_id_property, + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + else: + records = runtime.run( + "spatial_aggregation", + component_services.spatial_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + bbox=_require_context(context, "bbox"), + features=_require_context(context, "features"), + method=method, + feature_id_property=feature_id_property, + ) return {"records": records} @@ -330,15 +433,30 @@ def _run_build_datavalueset( step_config: dict[str, Any], ) -> dict[str, Any]: del dataset - period_type = PeriodType(str(step_config.get("period_type", request.temporal_aggregation.target_period_type))) - data_value_set, output_file = runtime.run( - "build_datavalueset", - component_services.build_datavalueset_component, - records=_require_context(context, "records"), - dataset_id=request.dataset_id, - period_type=period_type, - dhis2=request.dhis2, - ) + period_type = request.temporal_aggregation.target_period_type + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if 
execution_mode == "remote": + data_value_set, output_file = runtime.run( + "build_datavalueset", + _invoke_remote_build_datavalueset_component, + remote_url=str(step_config["remote_url"]), + dataset_id=request.dataset_id, + period_type=period_type.value, + records=_require_context(context, "records"), + dhis2=request.dhis2.model_dump(mode="json"), + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + else: + data_value_set, output_file = runtime.run( + "build_datavalueset", + component_services.build_datavalueset_component, + records=_require_context(context, "records"), + dataset_id=request.dataset_id, + period_type=period_type, + dhis2=request.dhis2, + ) return {"data_value_set": data_value_set, "output_file": output_file} @@ -374,32 +492,51 @@ def _resolve_value(value: Any, request_params: dict[str, Any]) -> Any: class _FeatureSourceStepConfig(BaseModel): model_config = ConfigDict(extra="forbid") + execution_mode: str = "local" + remote_url: str | None = None + remote_timeout_sec: float = 30.0 + remote_retries: int = 1 + remote_retry_delay_sec: float = 1.0 + class _DownloadDatasetStepConfig(BaseModel): model_config = ConfigDict(extra="forbid") - overwrite: bool | None = None - country_code: str | None = None + execution_mode: str = "local" + remote_url: str | None = None + remote_timeout_sec: float = 30.0 + remote_retries: int = 1 + remote_retry_delay_sec: float = 1.0 class _TemporalAggregationStepConfig(BaseModel): model_config = ConfigDict(extra="forbid") - target_period_type: PeriodType | None = None - method: AggregationMethod | None = None + execution_mode: str = "local" + remote_url: str | None = None + remote_timeout_sec: float = 30.0 + remote_retries: int = 1 + remote_retry_delay_sec: float = 1.0 class _SpatialAggregationStepConfig(BaseModel): model_config = ConfigDict(extra="forbid") - method: AggregationMethod 
| None = None - feature_id_property: str | None = None + execution_mode: str = "local" + remote_url: str | None = None + remote_timeout_sec: float = 30.0 + remote_retries: int = 1 + remote_retry_delay_sec: float = 1.0 class _BuildDataValueSetStepConfig(BaseModel): model_config = ConfigDict(extra="forbid") - period_type: PeriodType | None = None + execution_mode: str = "local" + remote_url: str | None = None + remote_timeout_sec: float = 30.0 + remote_retries: int = 1 + remote_retry_delay_sec: float = 1.0 _STEP_CONFIG_MODELS: dict[str, type[BaseModel]] = { @@ -419,6 +556,229 @@ def _validate_step_config(component: str, version: str, config: dict[str, Any]) if model is None: raise ValueError(f"No config schema registered for component '{component}'") try: - model.model_validate(config) + validated = model.model_validate(config) except ValidationError as exc: raise ValueError(f"Invalid config for component '{component}@{version}': {exc}") from exc + mode = str(getattr(validated, "execution_mode", "local")).lower() + if mode not in {"local", "remote"}: + raise ValueError( + f"Invalid config for component '{component}@{version}': execution_mode must be local or remote" + ) + remote_url = getattr(validated, "remote_url", None) + remote_timeout_sec = getattr(validated, "remote_timeout_sec", 30.0) + remote_retries = getattr(validated, "remote_retries", 1) + remote_retry_delay_sec = getattr(validated, "remote_retry_delay_sec", 1.0) + + has_remote_config = bool( + (isinstance(remote_url, str) and remote_url.strip()) + or float(remote_timeout_sec) != 30.0 + or int(remote_retries) != 1 + or float(remote_retry_delay_sec) != 1.0 + ) + + if mode == "local" and has_remote_config: + raise ValueError( + f"Invalid config for component '{component}@{version}': " + "remote_url/remote_timeout_sec/remote_retries/remote_retry_delay_sec are only allowed in remote mode" + ) + if mode == "remote": + if not isinstance(remote_url, str) or not remote_url.strip(): + raise ValueError( + f"Invalid 
config for component '{component}@{version}': remote_url is required for remote mode" + ) + + +def _invoke_remote_download_component( + *, + remote_url: str, + dataset_id: str, + start: str, + end: str, + overwrite: bool, + country_code: str | None, + bbox: list[float], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> None: + """Invoke remote download component endpoint with retry/timeout.""" + payload = { + "dataset_id": dataset_id, + "start": start, + "end": end, + "overwrite": overwrite, + "country_code": country_code, + "bbox": bbox, + } + attempts = max(1, retries) + last_exc: Exception | None = None + for attempt in range(1, attempts + 1): + try: + with httpx.Client(timeout=timeout_sec) as client: + response = client.post(remote_url, json=payload) + response.raise_for_status() + return + except Exception as exc: + last_exc = exc + if attempt < attempts: + time.sleep(max(0.0, retry_delay_sec)) + if last_exc is None: + raise RuntimeError("Remote download invocation failed without exception context") + raise last_exc + + +def _invoke_remote_feature_source_component( + *, + remote_url: str, + feature_source: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> tuple[dict[str, Any], list[float]]: + """Invoke remote feature-source component endpoint.""" + payload = { + "feature_source": feature_source, + "include_features": True, + } + result = _post_remote_json( + remote_url=remote_url, + payload=payload, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + features = result.get("features") + bbox = result.get("bbox") + if not isinstance(features, dict) or not isinstance(bbox, list): + raise RuntimeError("Remote feature_source response missing features/bbox") + return features, [float(x) for x in bbox] + + +def _invoke_remote_temporal_aggregation_component( + *, + remote_url: str, + dataset_id: str, + start: str, + end: str, + bbox: list[float], + target_period_type: str, + 
method: str, + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> dict[str, Any]: + """Invoke remote temporal-aggregation component endpoint.""" + payload = { + "dataset_id": dataset_id, + "start": start, + "end": end, + "bbox": bbox, + "target_period_type": target_period_type, + "method": method, + } + return _post_remote_json( + remote_url=remote_url, + payload=payload, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + + +def _invoke_remote_spatial_aggregation_component( + *, + remote_url: str, + dataset_id: str, + start: str, + end: str, + bbox: list[float], + feature_source: dict[str, Any], + method: str, + feature_id_property: str, + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> list[dict[str, Any]]: + """Invoke remote spatial-aggregation component endpoint.""" + payload = { + "dataset_id": dataset_id, + "start": start, + "end": end, + "feature_source": feature_source, + "method": method, + "bbox": bbox, + "feature_id_property": feature_id_property, + "include_records": True, + } + result = _post_remote_json( + remote_url=remote_url, + payload=payload, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + records = result.get("records") + if not isinstance(records, list): + raise RuntimeError("Remote spatial_aggregation response missing records") + return records + + +def _invoke_remote_build_datavalueset_component( + *, + remote_url: str, + dataset_id: str, + period_type: str, + records: list[dict[str, Any]], + dhis2: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> tuple[dict[str, Any], str]: + """Invoke remote build-datavalue-set component endpoint.""" + payload = { + "dataset_id": dataset_id, + "period_type": period_type, + "records": records, + "dhis2": dhis2, + } + result = _post_remote_json( + remote_url=remote_url, + payload=payload, + timeout_sec=timeout_sec, + retries=retries, + 
retry_delay_sec=retry_delay_sec, + ) + data_value_set = result.get("data_value_set") + output_file = result.get("output_file") + if not isinstance(data_value_set, dict) or not isinstance(output_file, str): + raise RuntimeError("Remote build_datavalueset response missing data_value_set/output_file") + return data_value_set, output_file + + +def _post_remote_json( + *, + remote_url: str, + payload: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> dict[str, Any]: + """POST JSON to remote component endpoint with retry and return JSON body.""" + attempts = max(1, retries) + last_exc: Exception | None = None + for attempt in range(1, attempts + 1): + try: + with httpx.Client(timeout=timeout_sec) as client: + response = client.post(remote_url, json=payload) + response.raise_for_status() + body = response.json() + if not isinstance(body, dict): + raise RuntimeError("Remote component returned non-object JSON response") + return body + except Exception as exc: + last_exc = exc + if attempt < attempts: + time.sleep(max(0.0, retry_delay_sec)) + if last_exc is None: + raise RuntimeError("Remote component invocation failed without exception context") + raise last_exc diff --git a/src/eo_api/workflows/services/spatial.py b/src/eo_api/workflows/services/spatial.py index 668141e..8bf3f05 100644 --- a/src/eo_api/workflows/services/spatial.py +++ b/src/eo_api/workflows/services/spatial.py @@ -46,10 +46,17 @@ def aggregate_to_features( for t, value in zip(reduced[time_dim].values, reduced.values, strict=True): if np.isnan(value): continue + # Keep component outputs JSON-safe for direct API exposure and remote execution. 
+ if isinstance(t, np.datetime64): + time_value: Any = np.datetime_as_string(t, unit="s") + elif isinstance(t, np.generic): + time_value = t.item() + else: + time_value = t output.append( { "org_unit": org_unit, - "time": t, + "time": time_value, "value": float(value), } ) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 4056629..27577db 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -2,6 +2,7 @@ from typing import Any, cast +import numpy as np import pytest import xarray as xr from fastapi import HTTPException @@ -17,17 +18,19 @@ def _valid_public_payload() -> dict[str, Any]: return { - "workflow_id": "dhis2_datavalue_set_v1", - "dataset_id": "chirps3_precipitation_daily", - "start_date": "2024-01-01", - "end_date": "2024-01-31", - "org_unit_level": 3, - "data_element": "abc123def45", - "temporal_resolution": "monthly", - "temporal_reducer": "sum", - "spatial_reducer": "mean", - "dry_run": True, - "include_component_run_details": False, + "request": { + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "dry_run": True, + "include_component_run_details": False, + } } @@ -37,7 +40,7 @@ def test_workflow_endpoint_exists_once() -> None: for route in app.routes if isinstance(route, APIRoute) and route.path.startswith("/workflows") and "POST" in route.methods } - assert workflow_routes == {"/workflows/dhis2-datavalue-set"} + assert workflow_routes == {"/workflows/dhis2-datavalue-set", "/workflows/execute", "/workflows/validate"} def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClient) -> None: @@ -85,9 +88,20 @@ def test_components_catalog_endpoint_returns_five_components(client: TestClient) for item in items: assert item["version"] == "v1" assert 
isinstance(item["input_schema"], dict) - assert isinstance(item["config_schema"], dict) + assert "config_schema" not in item assert isinstance(item["output_schema"], dict) assert "EXECUTION_FAILED" in item["error_codes"] + assert item["endpoint"]["method"] == "POST" + assert item["endpoint"]["path"].startswith("/components/") + + +def test_components_catalog_include_internal_includes_config_schema(client: TestClient) -> None: + response = client.get("/components?include_internal=true") + assert response.status_code == 200 + items = response.json()["components"] + assert len(items) >= 5 + for item in items: + assert isinstance(item["config_schema"], dict) def test_workflow_endpoint_returns_response_shape(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: @@ -146,7 +160,7 @@ def _execute_stub( def test_workflow_endpoint_validates_required_fields(client: TestClient) -> None: payload = _valid_public_payload() - payload.pop("org_unit_level") + payload["request"].pop("org_unit_level") response = client.post("/workflows/dhis2-datavalue-set", json=payload) assert response.status_code == 422 @@ -199,6 +213,205 @@ def _execute_stub( assert response.json()["status"] == "completed" +def test_inline_workflow_execute_endpoint_accepts_assembly(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + stub = WorkflowExecuteResponse( + status="completed", + run_id="run-assembly-123", + workflow_id="adhoc_dhis2_v1", + workflow_version=1, + dataset_id="chirps3_precipitation_daily", + bbox=[-13.3, 6.9, -10.1, 10.0], + feature_count=2, + value_count=4, + output_file="/tmp/data/chirps3_datavalueset.json", + run_log_file="/tmp/data/workflow_runs/run-assembly-123.json", + data_value_set={"dataValues": []}, + component_runs=[], + ) + + def _execute_stub( + payload: Any, + workflow_id: str = "dhis2_datavalue_set_v1", + workflow_definition: WorkflowDefinition | None = None, + request_params: dict[str, Any] | None = None, + include_component_run_details: bool = False, + ) -> 
WorkflowExecuteResponse: + del payload, request_params, include_component_run_details + assert workflow_id == "adhoc_dhis2_v1" + assert workflow_definition is not None + assert workflow_definition.workflow_id == "adhoc_dhis2_v1" + assert len(workflow_definition.steps) == 4 + return stub + + monkeypatch.setattr("eo_api.workflows.routes.execute_workflow", _execute_stub) + + response = client.post( + "/workflows/execute", + json={ + "workflow": { + "workflow_id": "adhoc_dhis2_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1", "config": {}}, + {"component": "download_dataset", "version": "v1", "config": {}}, + {"component": "spatial_aggregation", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}}, + ], + }, + "request": { + "workflow_id": "adhoc_dhis2_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "include_component_run_details": False, + }, + }, + ) + assert response.status_code == 200 + assert response.json()["workflow_id"] == "adhoc_dhis2_v1" + + +def test_inline_workflow_execute_endpoint_rejects_bad_component_chain(client: TestClient) -> None: + response = client.post( + "/workflows/execute", + json={ + "workflow": { + "workflow_id": "bad_adhoc_v1", + "version": 1, + "steps": [ + {"component": "download_dataset", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}}, + ], + }, + "request": { + "workflow_id": "bad_adhoc_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + }, + }, + ) + assert response.status_code == 422 + + +def test_workflow_validate_endpoint_accepts_valid_inline_workflow(client: TestClient) -> 
None: + response = client.post( + "/workflows/validate", + json={ + "workflow": { + "workflow_id": "adhoc_validate_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1", "config": {}}, + {"component": "download_dataset", "version": "v1", "config": {}}, + {"component": "spatial_aggregation", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}}, + ], + }, + "request": { + "workflow_id": "adhoc_validate_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + }, + }, + ) + assert response.status_code == 200 + body = response.json() + assert body["valid"] is True + assert body["workflow_id"] == "adhoc_validate_v1" + assert body["step_count"] == 4 + assert len(body["resolved_steps"]) == 4 + assert body["errors"] == [] + + +def test_workflow_validate_endpoint_rejects_runtime_knobs_in_step_config(client: TestClient) -> None: + response = client.post( + "/workflows/validate", + json={ + "workflow": { + "workflow_id": "adhoc_invalid_config_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1", "config": {}}, + {"component": "download_dataset", "version": "v1", "config": {"overwrite": True}}, + {"component": "spatial_aggregation", "version": "v1", "config": {}}, + {"component": "build_datavalueset", "version": "v1", "config": {}}, + ], + }, + "request": { + "workflow_id": "adhoc_invalid_config_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + }, + }, + ) + assert response.status_code == 200 + body = response.json() + assert body["valid"] is False + assert body["resolved_steps"] == [] + assert len(body["errors"]) == 1 + assert "validation failed" in body["errors"][0].lower() + + +def test_workflow_validate_endpoint_unknown_workflow_id(client: 
TestClient) -> None: + response = client.post("/workflows/validate", json={"workflow_id": "does_not_exist"}) + assert response.status_code == 200 + body = response.json() + assert body["valid"] is False + assert body["step_count"] == 0 + assert len(body["errors"]) == 1 + assert "Unknown workflow_id" in body["errors"][0] + + +def test_component_spatial_aggregation_serializes_numpy_datetime64( + client: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setattr( + "eo_api.components.routes.services.require_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip"}, + ) + monkeypatch.setattr( + "eo_api.components.routes.services.feature_source_component", + lambda feature_source: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0.0, 0.0, 1.0, 1.0], + ), + ) + monkeypatch.setattr( + "eo_api.components.routes.services.spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": np.datetime64("2024-01-01"), "value": 10.0}], + ) + + response = client.post( + "/components/spatial-aggregation", + json={ + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01", + "end": "2024-01", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 2}, + "method": "mean", + "include_records": True, + }, + ) + assert response.status_code == 200 + body = response.json() + assert body["record_count"] == 1 + assert body["records"][0]["time"] == "2024-01-01T00:00:00" + + def test_engine_orchestrates_components(monkeypatch: pytest.MonkeyPatch) -> None: request = { "dataset_id": "chirps3_precipitation_daily", @@ -528,13 +741,13 @@ def test_engine_resolves_step_config_from_request_params(monkeypatch: pytest.Mon "version": 2, "steps": [ {"component": "feature_source"}, - {"component": "download_dataset"}, + { + "component": "download_dataset", + "config": {"execution_mode": "$request.download_execution_mode"}, + }, { "component": "temporal_aggregation", - "config": { - 
"method": "$request.temporal_reducer", - "target_period_type": "$request.temporal_resolution", - }, + "config": {}, }, {"component": "spatial_aggregation"}, {"component": "build_datavalueset"}, @@ -558,7 +771,7 @@ def test_engine_resolves_step_config_from_request_params(monkeypatch: pytest.Mon monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) def _temporal_component(**kwargs: Any) -> xr.Dataset: - assert kwargs["method"].value == "max" + assert kwargs["method"].value == "sum" assert kwargs["target_period_type"].value == "monthly" return ds @@ -577,7 +790,7 @@ def _temporal_component(**kwargs: Any) -> xr.Dataset: response = engine.execute_workflow( request, - request_params={"temporal_reducer": "max", "temporal_resolution": "monthly"}, + request_params={"download_execution_mode": "local"}, ) assert response.status == "completed" @@ -645,3 +858,298 @@ def _persist_run_log(**kwargs: Any) -> str: assert persisted["error_code"] == "CONFIG_VALIDATION_FAILED" assert persisted["failed_component"] == "temporal_aggregation" assert persisted["failed_component_version"] == "v1" + + +def test_engine_download_dataset_remote_mode_uses_remote_adapter(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + {"component": "feature_source"}, + { + "component": "download_dataset", + "config": { + "execution_mode": "remote", + "remote_url": 
"http://component-host/components/download-dataset", + "remote_retries": 2, + "remote_timeout_sec": 9, + }, + }, + {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + remote_called: dict[str, Any] = {} + + def _remote_adapter(**kwargs: Any) -> None: + remote_called.update(kwargs) + + monkeypatch.setattr(engine, "_invoke_remote_download_component", _remote_adapter) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request) + assert response.status == "completed" + assert remote_called["remote_url"] == "http://component-host/components/download-dataset" + assert remote_called["dataset_id"] == "chirps3_precipitation_daily" + + +def test_engine_rejects_remote_download_without_remote_url(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + 
"load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + {"component": "feature_source"}, + {"component": "download_dataset", "config": {"execution_mode": "remote"}}, + {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request) + + assert exc_info.value.status_code == 422 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error_code"] == "CONFIG_VALIDATION_FAILED" + assert detail["failed_component"] == "download_dataset" + + +def test_engine_rejects_remote_fields_in_local_mode(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + {"component": "feature_source"}, + { + "component": "download_dataset", + "config": { + "execution_mode": "local", + "remote_url": "http://should-not-be-here/components/download-dataset", + }, + }, 
+ {"component": "spatial_aggregation"}, + {"component": "build_datavalueset"}, + ], + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request) + + assert exc_info.value.status_code == 422 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error_code"] == "CONFIG_VALIDATION_FAILED" + assert detail["failed_component"] == "download_dataset" + + +def test_engine_supports_remote_mode_for_all_components(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + monkeypatch.setattr( + engine, + "load_workflow_definition", + lambda workflow_id: WorkflowDefinition.model_validate( + { + "workflow_id": workflow_id, + "version": 1, + "steps": [ + { + "component": "feature_source", + "config": {"execution_mode": "remote", "remote_url": "http://x/components/feature-source"}, + }, + { + "component": "download_dataset", + "config": { + "execution_mode": "remote", + "remote_url": "http://x/components/download-dataset", + }, + }, + { + "component": "temporal_aggregation", + "config": { + "execution_mode": "remote", + "remote_url": "http://x/components/temporal-aggregation", + }, + }, + { + 
"component": "spatial_aggregation", + "config": { + "execution_mode": "remote", + "remote_url": "http://x/components/spatial-aggregation", + }, + }, + { + "component": "build_datavalueset", + "config": { + "execution_mode": "remote", + "remote_url": "http://x/components/build-datavalue-set", + }, + }, + ], + } + ), + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + + called: dict[str, bool] = { + "feature": False, + "download": False, + "temporal": False, + "spatial": False, + "build": False, + } + + monkeypatch.setattr( + engine, + "_invoke_remote_feature_source_component", + lambda **kwargs: ( + called.__setitem__("feature", True), + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + )[1:], + ) + monkeypatch.setattr( + engine, + "_invoke_remote_download_component", + lambda **kwargs: called.__setitem__("download", True), + ) + monkeypatch.setattr( + engine, + "_invoke_remote_temporal_aggregation_component", + lambda **kwargs: (called.__setitem__("temporal", True), {"sizes": {"time": 1}, "dims": ["time"]})[1], + ) + monkeypatch.setattr( + engine, + "_invoke_remote_spatial_aggregation_component", + lambda **kwargs: ( + called.__setitem__("spatial", True), + [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + )[1], + ) + monkeypatch.setattr( + engine, + "_invoke_remote_build_datavalueset_component", + lambda **kwargs: ( + called.__setitem__("build", True), + ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + )[1], + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request) + assert response.status == "completed" + assert all(called.values()) From be70b2dd9eed2f73acdbf82c241cdb7e6b90793d Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Wed, 18 Mar 2026 11:17:01 +0100 Subject: [PATCH 06/15] Add 
publication-driven OGC workflow outputs --- data/workflows/dhis2_datavalue_set.yaml | 5 + ...alue_set_without_temporal_aggregation.yaml | 3 + src/eo_api/components/services.py | 7 +- src/eo_api/main.py | 10 + src/eo_api/ogc/__init__.py | 1 + src/eo_api/ogc/routes.py | 227 ++++++ src/eo_api/ogc_api/__init__.py | 5 + src/eo_api/publications/__init__.py | 1 + src/eo_api/publications/generated_routes.py | 29 + src/eo_api/publications/pygeoapi.py | 226 ++++++ src/eo_api/publications/routes.py | 37 + src/eo_api/publications/schemas.py | 59 ++ src/eo_api/publications/services.py | 180 +++++ src/eo_api/startup.py | 15 + src/eo_api/workflows/routes.py | 84 ++- src/eo_api/workflows/schemas.py | 65 ++ src/eo_api/workflows/services/definitions.py | 32 +- src/eo_api/workflows/services/engine.py | 128 +++- src/eo_api/workflows/services/job_store.py | 280 ++++++++ .../workflows/services/publication_assets.py | 79 +++ src/eo_api/workflows/services/runtime.py | 4 +- tests/conftest.py | 19 + tests/test_workflows.py | 669 +++++++++++++++++- 23 files changed, 2134 insertions(+), 31 deletions(-) create mode 100644 src/eo_api/ogc/__init__.py create mode 100644 src/eo_api/ogc/routes.py create mode 100644 src/eo_api/ogc_api/__init__.py create mode 100644 src/eo_api/publications/__init__.py create mode 100644 src/eo_api/publications/generated_routes.py create mode 100644 src/eo_api/publications/pygeoapi.py create mode 100644 src/eo_api/publications/routes.py create mode 100644 src/eo_api/publications/schemas.py create mode 100644 src/eo_api/publications/services.py create mode 100644 src/eo_api/workflows/services/job_store.py create mode 100644 src/eo_api/workflows/services/publication_assets.py diff --git a/data/workflows/dhis2_datavalue_set.yaml b/data/workflows/dhis2_datavalue_set.yaml index 8f5afd8..da50d5d 100644 --- a/data/workflows/dhis2_datavalue_set.yaml +++ b/data/workflows/dhis2_datavalue_set.yaml @@ -1,5 +1,10 @@ workflow_id: dhis2_datavalue_set_v1 version: 1 +publication: + 
publishable: true + strategy: on_success + intent: feature_collection + exposure: ogc steps: - component: feature_source version: v1 diff --git a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml index c7baaf5..9beb8e0 100644 --- a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml +++ b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml @@ -1,5 +1,8 @@ workflow_id: dhis2_datavalue_set_without_temporal_aggregation_v1 version: 1 +publication: + publishable: false + exposure: registry_only steps: - component: feature_source version: v1 diff --git a/src/eo_api/components/services.py b/src/eo_api/components/services.py index a546051..5539218 100644 --- a/src/eo_api/components/services.py +++ b/src/eo_api/components/services.py @@ -267,9 +267,14 @@ def spatial_aggregation_component( features: dict[str, Any], method: AggregationMethod, feature_id_property: str, + aggregated_dataset: xr.Dataset | None = None, ) -> list[dict[str, Any]]: """Load dataset and aggregate spatially to provided features.""" - ds = get_data(dataset=dataset, start=start, end=end, bbox=bbox) + ds = ( + aggregated_dataset + if aggregated_dataset is not None + else get_data(dataset=dataset, start=start, end=end, bbox=bbox) + ) return aggregate_to_features( ds=ds, variable=dataset["variable"], diff --git a/src/eo_api/main.py b/src/eo_api/main.py index 0ab8abb..86ce4c7 100644 --- a/src/eo_api/main.py +++ b/src/eo_api/main.py @@ -2,9 +2,14 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles import eo_api.startup # noqa: F401 # pyright: ignore[reportUnusedImport] from eo_api import components, data_accessor, data_manager, data_registry, system, workflows +from eo_api.ogc import routes as ogc_routes +from eo_api.ogc_api import ogc_api_app +from eo_api.publications import generated_routes as 
publication_generated_routes +from eo_api.publications import routes as publication_routes app = FastAPI() @@ -21,4 +26,9 @@ app.include_router(data_manager.routes.router, prefix="/manage", tags=["Data manager"]) app.include_router(data_accessor.routes.router, prefix="/retrieve", tags=["Data retrieval"]) app.include_router(workflows.routes.router, prefix="/workflows", tags=["Workflows"]) +app.include_router(publication_routes.router, prefix="/publications", tags=["Publications"]) +app.include_router(publication_generated_routes.router, prefix="/publications", tags=["Publications"]) app.include_router(components.routes.router, tags=["Components"]) +app.include_router(ogc_routes.router, prefix="/ogcapi", tags=["OGC API"]) +app.mount("/data", StaticFiles(directory="data/downloads"), name="Data") +app.mount("/ogcapi", ogc_api_app) diff --git a/src/eo_api/ogc/__init__.py b/src/eo_api/ogc/__init__.py new file mode 100644 index 0000000..a3635b0 --- /dev/null +++ b/src/eo_api/ogc/__init__.py @@ -0,0 +1 @@ +"""OGC adapter routes package.""" diff --git a/src/eo_api/ogc/routes.py b/src/eo_api/ogc/routes.py new file mode 100644 index 0000000..f67726c --- /dev/null +++ b/src/eo_api/ogc/routes.py @@ -0,0 +1,227 @@ +"""Thin OGC API adapter routes over the native workflow engine.""" + +from __future__ import annotations + +import uuid +from typing import Any + +from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request, Response + +from ..publications.schemas import PublishedResourceExposure +from ..publications.services import collection_id_for_resource, get_published_resource +from ..workflows.schemas import WorkflowExecuteEnvelopeRequest, WorkflowJobStatus +from ..workflows.services.definitions import load_workflow_definition +from ..workflows.services.engine import execute_workflow +from ..workflows.services.job_store import get_job, get_job_result, initialize_job, list_jobs +from ..workflows.services.simple_mapper import normalize_simple_request + +router 
= APIRouter() + +_PROCESS_ID = "generic-dhis2-workflow" +_PROCESS_TITLE = "Generic DHIS2 workflow" + + +@router.get("/processes") +def list_processes(request: Request) -> dict[str, Any]: + """List exposed OGC processes.""" + return { + "processes": [ + { + "id": _PROCESS_ID, + "title": _PROCESS_TITLE, + "description": "Execute the generic DHIS2 EO workflow and persist a native job record.", + "jobControlOptions": ["sync-execute", "async-execute"], + "outputTransmission": ["value", "reference"], + "links": [ + { + "rel": "self", + "type": "application/json", + "href": str(request.url_for("describe_ogc_process", process_id=_PROCESS_ID)), + } + ], + } + ] + } + + +@router.get("/processes/{process_id}", name="describe_ogc_process") +def describe_process(process_id: str, request: Request) -> dict[str, Any]: + """Describe the single exposed generic workflow process.""" + _require_process(process_id) + return { + "id": _PROCESS_ID, + "title": _PROCESS_TITLE, + "description": "OGC-facing adapter over the reusable native workflow engine.", + "jobControlOptions": ["sync-execute", "async-execute"], + "outputTransmission": ["value", "reference"], + "links": [ + { + "rel": "execute", + "type": "application/json", + "href": str(request.url_for("execute_ogc_process", process_id=_PROCESS_ID)), + } + ], + } + + +@router.post("/processes/{process_id}/execution", name="execute_ogc_process") +def execute_process( + process_id: str, + payload: WorkflowExecuteEnvelopeRequest, + request: Request, + response: Response, + background_tasks: BackgroundTasks, + prefer: str | None = Header(default=None), +) -> dict[str, Any]: + """Execute the generic workflow synchronously or submit it asynchronously.""" + _require_process(process_id) + normalized, _warnings = normalize_simple_request(payload.request) + + if prefer is not None and "respond-async" in prefer.lower(): + job_id = str(uuid.uuid4()) + workflow = load_workflow_definition(payload.request.workflow_id) + initialize_job( + job_id=job_id, 
+ request=normalized, + request_payload=payload.request.model_dump(exclude_none=True), + workflow=workflow, + workflow_definition_source="catalog", + workflow_id=payload.request.workflow_id, + workflow_version=workflow.version, + status=WorkflowJobStatus.ACCEPTED, + process_id=_PROCESS_ID, + ) + background_tasks.add_task( + _run_async_workflow_job, + job_id, + normalized, + payload.request.workflow_id, + payload.request.model_dump(exclude_none=True), + payload.request.include_component_run_details, + ) + job_url = str(request.url_for("get_ogc_job", job_id=job_id)) + results_url = str(request.url_for("get_ogc_job_results", job_id=job_id)) + response.status_code = 202 + response.headers["Location"] = job_url + return { + "jobID": job_id, + "status": WorkflowJobStatus.ACCEPTED, + "location": job_url, + "jobUrl": job_url, + "resultsUrl": results_url, + } + + result = execute_workflow( + normalized, + workflow_id=payload.request.workflow_id, + request_params=payload.request.model_dump(exclude_none=True), + include_component_run_details=payload.request.include_component_run_details, + workflow_definition_source="catalog", + ) + job_url = str(request.url_for("get_ogc_job", job_id=result.run_id)) + results_url = str(request.url_for("get_ogc_job_results", job_id=result.run_id)) + publication = get_published_resource(f"workflow-output-{result.run_id}") + links: list[dict[str, Any]] = [ + {"rel": "monitor", "type": "application/json", "href": job_url}, + {"rel": "results", "type": "application/json", "href": results_url}, + ] + if publication is not None and publication.exposure == PublishedResourceExposure.OGC: + collection_id = collection_id_for_resource(publication) + links.append( + { + "rel": "collection", + "type": "application/json", + "href": _collection_href(request, collection_id), + } + ) + return { + "jobID": result.run_id, + "processID": _PROCESS_ID, + "status": WorkflowJobStatus.SUCCESSFUL, + "outputs": result.model_dump(mode="json"), + "links": links, + } + + 
+@router.get("/jobs") +def list_ogc_jobs(process_id: str | None = None) -> dict[str, Any]: + """List OGC-visible jobs backed by the native job store.""" + jobs = list_jobs(process_id=process_id, status=None) + return {"jobs": [job.model_dump(mode="json") for job in jobs]} + + +@router.get("/jobs/{job_id}", name="get_ogc_job") +def get_ogc_job(job_id: str, request: Request) -> dict[str, Any]: + """Fetch one OGC job view from the native job store.""" + job = get_job(job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + publication = get_published_resource(f"workflow-output-{job.job_id}") + links: list[dict[str, Any]] = [ + { + "rel": "self", + "type": "application/json", + "href": str(request.url_for("get_ogc_job", job_id=job.job_id)), + }, + { + "rel": "results", + "type": "application/json", + "href": str(request.url_for("get_ogc_job_results", job_id=job.job_id)), + }, + ] + if publication is not None and publication.exposure == PublishedResourceExposure.OGC: + links.append( + { + "rel": "collection", + "type": "application/json", + "href": _collection_href(request, collection_id_for_resource(publication)), + } + ) + return { + "jobID": job.job_id, + "processID": job.process_id, + "status": job.status, + "created": job.created_at, + "updated": job.updated_at, + "links": links, + } + + +@router.get("/jobs/{job_id}/results", name="get_ogc_job_results") +def get_ogc_job_results(job_id: str) -> dict[str, Any]: + """Return persisted results for a completed OGC job.""" + job = get_job(job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + result = get_job_result(job_id) + if result is None: + raise HTTPException(status_code=409, detail={"jobID": job_id, "status": job.status}) + return result + + +def _require_process(process_id: str) -> None: + if process_id != _PROCESS_ID: + raise HTTPException(status_code=404, detail=f"Unknown process_id '{process_id}'") + + +def 
_run_async_workflow_job( + job_id: str, + normalized_request: Any, + workflow_id: str, + request_params: dict[str, Any], + include_component_run_details: bool, +) -> None: + try: + execute_workflow( + normalized_request, + workflow_id=workflow_id, + request_params=request_params, + include_component_run_details=include_component_run_details, + run_id=job_id, + ) + except HTTPException: + return + + +def _collection_href(request: Request, collection_id: str) -> str: + return str(request.base_url).rstrip("/") + f"/ogcapi/collections/{collection_id}" diff --git a/src/eo_api/ogc_api/__init__.py b/src/eo_api/ogc_api/__init__.py new file mode 100644 index 0000000..0e7d977 --- /dev/null +++ b/src/eo_api/ogc_api/__init__.py @@ -0,0 +1,5 @@ +"""Mounted pygeoapi application.""" + +from pygeoapi.starlette_app import APP as ogc_api_app + +__all__ = ["ogc_api_app"] diff --git a/src/eo_api/publications/__init__.py b/src/eo_api/publications/__init__.py new file mode 100644 index 0000000..392d955 --- /dev/null +++ b/src/eo_api/publications/__init__.py @@ -0,0 +1 @@ +"""Publication registry package.""" diff --git a/src/eo_api/publications/generated_routes.py b/src/eo_api/publications/generated_routes.py new file mode 100644 index 0000000..dbd9533 --- /dev/null +++ b/src/eo_api/publications/generated_routes.py @@ -0,0 +1,29 @@ +"""Routes exposing generated pygeoapi documents from publication truth.""" + +from fastapi import APIRouter + +from .pygeoapi import build_pygeoapi_config, build_pygeoapi_openapi, write_generated_pygeoapi_documents + +router = APIRouter() + + +@router.get("/pygeoapi/config") +def get_generated_pygeoapi_config() -> dict[str, object]: + """Return generated pygeoapi config from backend publication truth.""" + return build_pygeoapi_config() + + +@router.get("/pygeoapi/openapi") +def get_generated_pygeoapi_openapi() -> dict[str, object]: + """Return generated pygeoapi OpenAPI projection from backend publication truth.""" + return build_pygeoapi_openapi() + + 
+@router.post("/pygeoapi/materialize")
+def materialize_generated_pygeoapi_documents() -> dict[str, str]:
+    """Write generated pygeoapi documents to disk for runtime wiring."""
+    config_path, openapi_path = write_generated_pygeoapi_documents()
+    return {
+        "config_path": str(config_path),
+        "openapi_path": str(openapi_path),
+    }
diff --git a/src/eo_api/publications/pygeoapi.py b/src/eo_api/publications/pygeoapi.py
new file mode 100644
index 0000000..894b7b9
--- /dev/null
+++ b/src/eo_api/publications/pygeoapi.py
@@ -0,0 +1,246 @@
+"""Generate pygeoapi-facing documents from backend publication state."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from ..data_manager.services.downloader import DOWNLOAD_DIR, get_zarr_path
+from ..data_registry.services.datasets import get_dataset
+from .schemas import PublishedResource, PublishedResourceExposure, PublishedResourceKind
+from .services import collection_id_for_resource, ensure_source_dataset_publications, list_published_resources
+
+_DEFAULT_SERVER_URL = "http://127.0.0.1:8000/ogcapi"
+
+
+def build_pygeoapi_config(*, server_url: str = _DEFAULT_SERVER_URL) -> dict[str, Any]:
+    """Build a minimal pygeoapi config from published resources.
+
+    ``server_url`` is written to ``server.url`` and is also the base used to
+    absolutize relative resource links, so the two always agree.
+    """
+    ensure_source_dataset_publications()
+    resources = _iter_pygeoapi_resources()
+    return {
+        "server": {
+            "bind": {"host": "0.0.0.0", "port": 5000},
+            "url": server_url,
+            "mimetype": "application/json; charset=UTF-8",
+            "encoding": "utf-8",
+            "languages": ["en-US"],
+            "limits": {"default_items": 20, "max_items": 50},
+            "map": {
+                "url": "https://tile.openstreetmap.org/{z}/{x}/{y}.png",
+                "attribution": "OpenStreetMap",
+            },
+            "gzip": True,
+        },
+        "logging": {"level": "ERROR"},
+        "metadata": {
+            "identification": {
+                "title": {"en": "DHIS2 EO API"},
+                "description": {"en": "Generated pygeoapi publication config from backend publication truth"},
+                "keywords": {"en": ["EO", "DHIS2", "OGC"]},
+                "terms_of_service": "https://dhis2.org",
+                "url": "https://dhis2.org",
+            },
+            "license": {
+                "name": "CC-BY 4.0",
+                "url": "https://creativecommons.org/licenses/by/4.0/",
+            },
+            "provider": {"name": "DHIS2 EO API", "url": "https://dhis2.org"},
+            "contact": {"name": "DHIS2", "position": "Team", "email": "climate@dhis2.org"},
+        },
+        "resources": {
+            collection_id_for_resource(resource): _build_pygeoapi_resource(resource, server_url=server_url)
+            for resource in resources
+        },
+    }
+
+
+def build_pygeoapi_openapi(*, server_url: str = _DEFAULT_SERVER_URL) -> dict[str, Any]:
+    """Build a minimal OpenAPI projection that reflects generated collection resources."""
+    config = build_pygeoapi_config(server_url=server_url)
+    resources = config["resources"]
+    return {
+        "openapi": "3.0.2",
+        "info": {
+            "title": "DHIS2 EO API",
+            "description": "Generated pygeoapi OpenAPI projection from backend publication truth",
+            "version": "0.1.0",
+        },
+        "servers": [{"url": server_url}],
+        "paths": {
+            "/collections": {
+                "get": {
+                    "summary": "Collections",
+                    "operationId": "getCollections",
+                    "responses": {"200": {"description": "successful operation"}},
+                }
+            },
+            **{
+                f"/collections/{resource_id}": {
+                    "get": {
+                        "summary": f"Collection {resource_id}",
+                        "operationId": f"getCollection_{resource_id.replace('-', '_')}",
+                        "responses": {"200": {"description": "successful operation"}},
+                    }
+                }
+                for resource_id in resources
+            },
+        },
+        "x-generated-resources": list(resources.keys()),
+    }
+
+
+def write_generated_pygeoapi_documents(*, server_url: str = _DEFAULT_SERVER_URL) -> tuple[Path, Path]:
+    """Persist generated pygeoapi config and OpenAPI documents to disk."""
+    output_dir = DOWNLOAD_DIR / "pygeoapi"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    config_path = output_dir / "pygeoapi-config.generated.yml"
+    openapi_path = output_dir / "pygeoapi-openapi.generated.yml"
+    config_text = yaml.safe_dump(build_pygeoapi_config(server_url=server_url), sort_keys=False)
+    config_path.write_text(config_text, encoding="utf-8")
+    openapi_path.write_text(
+        yaml.safe_dump(build_pygeoapi_openapi(server_url=server_url), sort_keys=False),
+        encoding="utf-8",
+    )
+    return config_path, openapi_path
+
+
+def _build_pygeoapi_resource(resource: PublishedResource, *, server_url: str = _DEFAULT_SERVER_URL) -> dict[str, Any]:
+    """Project one published resource onto a pygeoapi collection definition."""
+    provider = _build_provider(resource)
+    return {
+        "type": "collection",
+        "title": {"en": resource.title},
+        "description": {"en": resource.description},
+        "keywords": _keywords_for_resource(resource),
+        "links": _pygeoapi_links(resource, server_url=server_url),
+        "extents": {
+            "spatial": {"bbox": _bbox_for_resource(resource)},
+            "temporal": _temporal_extent_for_resource(resource),
+        },
+        "providers": [provider],
+        "metadata": {
+            "resource_id": resource.resource_id,
+            "resource_class": str(resource.resource_class),
+            "dataset_id": resource.dataset_id,
+            "workflow_id": resource.workflow_id,
+            "job_id": resource.job_id,
+            "kind": str(resource.kind),
+            # Free-form publication metadata is merged last, so it wins on key clashes.
+            **resource.metadata,
+        },
+    }
+
+
+def _bbox_for_resource(resource: PublishedResource) -> list[list[float]]:
+    """Return the resource bbox from metadata, or the whole globe when none is recorded."""
+    bbox = resource.metadata.get("bbox")
+    if isinstance(bbox, list) and bbox:
+        return [bbox]
+    return [[-180.0, -90.0, 180.0, 90.0]]
+
+
+def _temporal_extent_for_resource(resource: PublishedResource) -> dict[str, str | None]:
+    """Derive the temporal extent block from resource metadata."""
+    metadata = resource.metadata
+    start = metadata.get("time_start")
+    end = metadata.get("time_end")
+    if start is not None or end is not None:
+        return {
+            "begin": str(start) if start is not None else None,
+            "end": str(end) if end is not None else None,
+        }
+    # NOTE(review): ``period_type`` is emitted verbatim as both bounds; confirm pygeoapi accepts it there.
+    period_type = metadata.get("period_type")
+    if period_type is not None:
+        value = str(period_type)
+        return {"begin": value, "end": value}
+    return {"begin": None, "end": None}
+
+
+def _keywords_for_resource(resource: PublishedResource) -> list[str]:
+    """Collect keywords: fixed tags plus the resource class, kind, and any dataset/workflow ids."""
+    keywords = ["EO", "DHIS2", str(resource.resource_class), str(resource.kind)]
+    if resource.dataset_id is not None:
+        keywords.append(resource.dataset_id)
+    if resource.workflow_id is not None:
+        keywords.append(resource.workflow_id)
+    return keywords
+
+
+def _build_provider(resource: PublishedResource) -> dict[str, Any]:
+    """Build the pygeoapi provider block, raising ValueError when the resource cannot be mapped."""
+    if resource.kind == PublishedResourceKind.COVERAGE:
+        dataset = get_dataset(str(resource.dataset_id))
+        if dataset is None:
+            raise ValueError(f"Unknown dataset_id '{resource.dataset_id}' for resource '{resource.resource_id}'")
+        zarr_path = get_zarr_path(dataset)
+        if zarr_path is None:
+            raise ValueError(f"No zarr cache available for dataset '{resource.dataset_id}'")
+        return {
+            "name": "xarray",
+            "type": "coverage",
+            "data": str(zarr_path),
+            "default": True,
+        }
+
+    if resource.kind == PublishedResourceKind.FEATURE_COLLECTION and resource.path is not None:
+        suffix = Path(resource.path).suffix.lower()
+        if suffix == ".geojson":
+            return {
+                "name": "GeoJSON",
+                "type": "feature",
+                "data": resource.path,
+                "id_field": "id",
+                "default": True,
+            }
+
+    raise ValueError(f"Resource '{resource.resource_id}' is not yet mappable to a pygeoapi provider")
+
+
+def _iter_pygeoapi_resources() -> list[PublishedResource]:
+    """List OGC-exposed resources that can be mapped to a pygeoapi provider."""
+    resources: list[PublishedResource] = []
+    for resource in list_published_resources(exposure=PublishedResourceExposure.OGC):
+        try:
+            _build_provider(resource)
+        except ValueError:
+            # Not mappable: leave it out of the generated documents instead of failing them.
+            continue
+        resources.append(resource)
+    return resources
+
+
+def _pygeoapi_links(resource: PublishedResource, *, server_url: str = _DEFAULT_SERVER_URL) -> list[dict[str, str]]:
+    """Convert stored resource links to pygeoapi link objects, skipping links without an href."""
+    links: list[dict[str, str]] = []
+    for link in resource.links:
+        href = str(link.get("href", ""))
+        rel = str(link.get("rel", "related"))
+        if href == "":
+            continue
+        links.append(
+            {
+                "type": "application/json",
+                "rel": rel,
+                "title": rel.replace("-", " ").title(),
+                "href": _absolute_ogc_href(href, server_url=server_url),
+            }
+        )
+    return links
+
+
+def _absolute_ogc_href(href: str, *, server_url: str = _DEFAULT_SERVER_URL) -> str:
+    """Resolve a registry href against the host that serves ``server_url``.
+
+    Stored hrefs are rooted at the application (``/ogcapi/...``, ``/workflows/...``), so the
+    ``/ogcapi`` mount suffix is stripped from ``server_url`` before joining.
+    """
+    if href.startswith(("http://", "https://")):
+        return href
+    return f"{server_url.rstrip('/').removesuffix('/ogcapi')}{href}"
diff --git a/src/eo_api/publications/routes.py b/src/eo_api/publications/routes.py
new file mode 100644
index 0000000..de228d8
--- /dev/null
+++
b/src/eo_api/publications/routes.py
@@ -0,0 +1,37 @@
+"""Routes for backend-owned publication state."""
+
+from fastapi import APIRouter, HTTPException
+
+from .schemas import PublishedResource, PublishedResourceClass, PublishedResourceExposure, PublishedResourceListResponse
+from .services import ensure_source_dataset_publications, get_published_resource, list_published_resources
+
+router = APIRouter()
+
+
+@router.get("", response_model=PublishedResourceListResponse)
+def list_publications(
+    resource_class: PublishedResourceClass | None = None,
+    dataset_id: str | None = None,
+    workflow_id: str | None = None,
+    exposure: PublishedResourceExposure | None = None,
+) -> PublishedResourceListResponse:
+    """List backend-owned published resources."""
+    ensure_source_dataset_publications()
+    return PublishedResourceListResponse(
+        resources=list_published_resources(
+            resource_class=resource_class,
+            dataset_id=dataset_id,
+            workflow_id=workflow_id,
+            exposure=exposure,
+        )
+    )
+
+
+@router.get("/{resource_id}", response_model=PublishedResource)
+def get_publication(resource_id: str) -> PublishedResource:
+    """Get one published resource."""
+    ensure_source_dataset_publications()
+    resource = get_published_resource(resource_id)
+    if resource is None:
+        raise HTTPException(status_code=404, detail=f"Unknown resource_id '{resource_id}'")
+    return resource
diff --git a/src/eo_api/publications/schemas.py b/src/eo_api/publications/schemas.py
new file mode 100644
index 0000000..240ddc7
--- /dev/null
+++ b/src/eo_api/publications/schemas.py
@@ -0,0 +1,59 @@
+"""Schemas for backend-owned published resources."""
+
+from __future__ import annotations
+
+from enum import StrEnum
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class PublishedResourceClass(StrEnum):
+    """High-level publication origin."""
+
+    SOURCE = "source"
+    DERIVED = "derived"
+
+
+class PublishedResourceKind(StrEnum):
+    """Supported OGC-facing resource kinds."""
+
+    COLLECTION = "collection"
+    COVERAGE = "coverage"
+    FEATURE_COLLECTION = "feature_collection"
+    TILESET = "tileset"
+
+
+class PublishedResourceExposure(StrEnum):
+    """Whether a registered resource should be surfaced via OGC."""
+
+    REGISTRY_ONLY = "registry_only"
+    OGC = "ogc"
+
+
+class PublishedResource(BaseModel):
+    """Backend-owned publication state for one discoverable resource."""
+
+    resource_id: str
+    resource_class: PublishedResourceClass
+    kind: PublishedResourceKind
+    title: str
+    description: str
+    dataset_id: str | None = None
+    workflow_id: str | None = None
+    job_id: str | None = None
+    run_id: str | None = None
+    path: str | None = None
+    ogc_path: str | None = None
+    asset_format: str | None = None
+    exposure: PublishedResourceExposure = PublishedResourceExposure.REGISTRY_ONLY
+    created_at: str
+    updated_at: str
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    links: list[dict[str, Any]] = Field(default_factory=list)
+
+
+class PublishedResourceListResponse(BaseModel):
+    """List of published resources."""
+
+    resources: list[PublishedResource]
diff --git a/src/eo_api/publications/services.py b/src/eo_api/publications/services.py
new file mode 100644
index 0000000..59db4cf
--- /dev/null
+++ b/src/eo_api/publications/services.py
@@ -0,0 +1,197 @@
+"""Disk-backed published resource registry."""
+
+from __future__ import annotations
+
+import datetime as dt
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from ..data_manager.services.downloader import DOWNLOAD_DIR
+from ..data_registry.services.datasets import list_datasets
+from .schemas import PublishedResource, PublishedResourceClass, PublishedResourceExposure, PublishedResourceKind
+
+if TYPE_CHECKING:
+    from ..workflows.schemas import WorkflowExecuteResponse
+
+
+def ensure_source_dataset_publications() -> list[PublishedResource]:
+    """Seed published source dataset resources from the dataset registry.
+
+    NOTE(review): every call rewrites each record and refreshes ``updated_at``; this also runs from
+    read-only routes, so confirm the churn is intended.
+    """
+    resources: list[PublishedResource] = []
+    for dataset in list_datasets():
+        resource_id = f"dataset-{dataset['id']}"
+        existing = get_published_resource(resource_id)
+        timestamp = _utc_now()
+        record = PublishedResource(
+            resource_id=resource_id,
+            resource_class=PublishedResourceClass.SOURCE,
+            kind=PublishedResourceKind.COVERAGE,
+            title=str(dataset.get("name") or dataset["id"]),
+            description=f"Source dataset: {dataset.get('source') or dataset['id']}",
+            dataset_id=str(dataset["id"]),
+            path=None,
+            ogc_path=f"/ogcapi/collections/{dataset['id']}",
+            asset_format="zarr",
+            exposure=PublishedResourceExposure.OGC,
+            created_at=existing.created_at if existing is not None else timestamp,
+            updated_at=timestamp,
+            metadata={
+                "dataset_id": dataset["id"],
+                "variable": dataset.get("variable"),
+                "period_type": dataset.get("period_type"),
+                "source": dataset.get("source"),
+                "source_url": dataset.get("source_url"),
+                "resolution": dataset.get("resolution"),
+                "units": dataset.get("units"),
+            },
+            links=[
+                {
+                    "rel": "collection",
+                    "href": f"/ogcapi/collections/{dataset['id']}",
+                }
+            ],
+        )
+        _write_resource(record)
+        resources.append(record)
+    return resources
+
+
+def register_workflow_output_publication(
+    *,
+    response: WorkflowExecuteResponse,
+    exposure: PublishedResourceExposure,
+    published_path: str | None = None,
+    asset_format: str | None = None,
+) -> PublishedResource:
+    """Register a completed workflow output as a published derived resource."""
+    resource_id = f"workflow-output-{response.run_id}"
+    existing = get_published_resource(resource_id)
+    timestamp = _utc_now()
+    record = PublishedResource(
+        resource_id=resource_id,
+        resource_class=PublishedResourceClass.DERIVED,
+        kind=PublishedResourceKind.FEATURE_COLLECTION,
+        title=f"{response.workflow_id} output for {response.dataset_id}",
+        description="Derived workflow output registered for OGC publication.",
+        dataset_id=response.dataset_id,
+        workflow_id=response.workflow_id,
+        job_id=response.run_id,
+        run_id=response.run_id,
+        path=published_path or response.output_file,
+        ogc_path=f"/ogcapi/collections/{resource_id}",
+        asset_format=asset_format or "datavalueset-json",
+        exposure=exposure,
+        created_at=existing.created_at if existing is not None else timestamp,
+        updated_at=timestamp,
+        metadata={
+            "workflow_id": response.workflow_id,
+            "workflow_version": response.workflow_version,
+            "dataset_id": response.dataset_id,
+            "feature_count": response.feature_count,
+            "value_count": response.value_count,
+            "bbox": response.bbox,
+            "native_output_file": response.output_file,
+        },
+        links=[
+            {"rel": "job", "href": f"/workflows/jobs/{response.run_id}"},
+            {"rel": "job-result", "href": f"/workflows/jobs/{response.run_id}/result"},
+            {"rel": "collection", "href": f"/ogcapi/collections/{resource_id}"},
+        ],
+    )
+    _write_resource(record)
+    return record
+
+
+def list_published_resources(
+    *,
+    resource_class: PublishedResourceClass | None = None,
+    dataset_id: str | None = None,
+    workflow_id: str | None = None,
+    exposure: PublishedResourceExposure | None = None,
+) -> list[PublishedResource]:
+    """List persisted published resources."""
+    resources: list[PublishedResource] = []
+    for path in _resources_dir().glob("*.json"):
+        resources.append(PublishedResource.model_validate_json(path.read_text(encoding="utf-8")))
+    resources.sort(key=lambda item: item.created_at, reverse=True)
+    if resource_class is not None:
+        resources = [item for item in resources if item.resource_class == resource_class]
+    if dataset_id is not None:
+        resources = [item for item in resources if item.dataset_id == dataset_id]
+    if workflow_id is not None:
+        resources = [item for item in resources if item.workflow_id == workflow_id]
+    if exposure is not None:
+        resources = [item for item in resources if item.exposure == exposure]
+    return resources
+
+
+def get_published_resource(resource_id: str) -> PublishedResource | None:
+    """Fetch a single published resource, or ``None`` when no record is stored for it."""
+    try:
+        payload = _resource_path(resource_id).read_text(encoding="utf-8")
+    except FileNotFoundError:
+        # Covers both "never registered" and "removed between lookup and read".
+        return None
+    return PublishedResource.model_validate_json(payload)
+
+
+def delete_published_resource(resource_id: str) -> PublishedResource | None:
+    """Delete one persisted published resource if it exists."""
+    resource = get_published_resource(resource_id)
+    if resource is None:
+        return None
+    # missing_ok: another caller may already have removed the file.
+    _resource_path(resource_id).unlink(missing_ok=True)
+    return resource
+
+
+def get_published_resource_by_collection_id(collection_id: str) -> PublishedResource | None:
+    """Resolve an OGC collection identifier to a published resource."""
+    ensure_source_dataset_publications()
+    for resource in list_published_resources(exposure=PublishedResourceExposure.OGC):
+        if _collection_id_for_resource(resource) == collection_id:
+            return resource
+    return None
+
+
+def collection_id_for_resource(resource: PublishedResource) -> str:
+    """Return the OGC collection identifier for a published resource."""
+    return _collection_id_for_resource(resource)
+
+
+def _write_resource(resource: PublishedResource) -> None:
+    """Persist one resource record without exposing partial writes.
+
+    The JSON goes to a sibling ``*.json.tmp`` file (ignored by the ``*.json`` listing glob) and is
+    then renamed over the target, so readers see either the old or the new record.
+    """
+    _resources_dir().mkdir(parents=True, exist_ok=True)
+    target = _resource_path(resource.resource_id)
+    staging = target.with_name(f"{target.name}.tmp")
+    staging.write_text(resource.model_dump_json(indent=2), encoding="utf-8")
+    staging.replace(target)
+
+
+def _resource_path(resource_id: str) -> Path:
+    """Return the JSON file path that stores ``resource_id``."""
+    return _resources_dir() / f"{resource_id}.json"
+
+
+def _resources_dir() -> Path:
+    """Return the registry directory under the download root."""
+    return DOWNLOAD_DIR / "published_resources"
+
+
+def _utc_now() -> str:
+    """Return the current UTC time as an ISO 8601 string."""
+    return dt.datetime.now(dt.timezone.utc).isoformat()
+
+
+def _collection_id_for_resource(resource: PublishedResource) -> str:
+    """Source datasets are addressed by dataset id; every other resource by its resource id."""
+    if resource.resource_class == PublishedResourceClass.SOURCE and resource.dataset_id is not None:
+        return resource.dataset_id
+    return resource.resource_id
diff --git a/src/eo_api/startup.py b/src/eo_api/startup.py
index 5c33ffa..8b48d7c 100644
--- a/src/eo_api/startup.py
+++ b/src/eo_api/startup.py
@@ -5,6 +5,7 @@ """ import logging +import os from dotenv import load_dotenv # noqa: E402 @@ -20,3 +21,17 @@ handler.setFormatter(logging.Formatter("%(asctime)s
%(levelname)s %(name)s - %(message)s")) eo_logger.addHandler(handler) eo_logger.propagate = False + + +def _configure_generated_pygeoapi() -> None: + """Materialize publication-driven pygeoapi documents before pygeoapi import.""" + from eo_api.publications.pygeoapi import write_generated_pygeoapi_documents + + server_url = os.environ.get("PYGEOAPI_SERVER_URL", "http://127.0.0.1:8000/ogcapi") + config_path, openapi_path = write_generated_pygeoapi_documents(server_url=server_url) + os.environ["PYGEOAPI_CONFIG"] = str(config_path) + os.environ["PYGEOAPI_OPENAPI"] = str(openapi_path) + eo_logger.info("Configured generated pygeoapi documents: %s %s", config_path, openapi_path) + + +_configure_generated_pygeoapi() diff --git a/src/eo_api/workflows/routes.py b/src/eo_api/workflows/routes.py index 8650f54..43bb65a 100644 --- a/src/eo_api/workflows/routes.py +++ b/src/eo_api/workflows/routes.py @@ -1,19 +1,27 @@ -"""API routes for workflow discovery and execution.""" +"""API routes for workflow discovery, execution, and native job access.""" -from fastapi import APIRouter, HTTPException +from typing import Any +from fastapi import APIRouter, HTTPException, Request + +from ..publications.schemas import PublishedResourceExposure +from ..publications.services import collection_id_for_resource, get_published_resource from .schemas import ( WorkflowAssemblyExecuteRequest, WorkflowCatalogItem, WorkflowCatalogResponse, WorkflowExecuteEnvelopeRequest, WorkflowExecuteResponse, + WorkflowJobListResponse, + WorkflowJobRecord, + WorkflowJobStatus, WorkflowValidateRequest, WorkflowValidateResponse, WorkflowValidateStep, ) from .services.definitions import list_workflow_definitions, load_workflow_definition from .services.engine import execute_workflow, validate_workflow_steps +from .services.job_store import delete_job, get_job, get_job_result, get_job_trace, list_jobs from .services.simple_mapper import normalize_simple_request router = APIRouter() @@ -31,6 +39,11 @@ def 
list_workflows() -> WorkflowCatalogResponse: WorkflowCatalogItem( workflow_id=definition.workflow_id, version=definition.version, + publication_publishable=definition.publication.publishable, + publication_intent=(str(definition.publication.intent) if definition.publication.publishable else None), + publication_exposure=( + str(definition.publication.exposure) if definition.publication.publishable else None + ), step_count=len(definition.steps), components=[step.component for step in definition.steps], ) @@ -39,6 +52,71 @@ def list_workflows() -> WorkflowCatalogResponse: ) +@router.get("/jobs", response_model=WorkflowJobListResponse) +def list_workflow_jobs( + process_id: str | None = None, + status: WorkflowJobStatus | None = None, +) -> WorkflowJobListResponse: + """List persisted workflow jobs.""" + return WorkflowJobListResponse(jobs=list_jobs(process_id=process_id, status=status)) + + +@router.get("/jobs/{job_id}", response_model=WorkflowJobRecord) +def get_workflow_job(job_id: str, request: Request) -> WorkflowJobRecord: + """Fetch one persisted workflow job.""" + job = get_job(job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + links: list[dict[str, str]] = [ + {"rel": "self", "href": str(request.url_for("get_workflow_job", job_id=job_id))}, + {"rel": "result", "href": str(request.url_for("get_workflow_job_result", job_id=job_id))}, + {"rel": "trace", "href": str(request.url_for("get_workflow_job_trace", job_id=job_id))}, + ] + publication = get_published_resource(f"workflow-output-{job_id}") + if publication is not None and publication.exposure == PublishedResourceExposure.OGC: + collection_id = collection_id_for_resource(publication) + links.append( + { + "rel": "collection", + "href": f"{str(request.base_url).rstrip('/')}/ogcapi/collections/{collection_id}", + } + ) + return job.model_copy(update={"links": links}) + + +@router.get("/jobs/{job_id}/result") +def get_workflow_job_result(job_id: str) -> 
dict[str, Any]: + """Fetch persisted workflow results for a completed job.""" + job = get_job(job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + result = get_job_result(job_id) + if result is None: + raise HTTPException(status_code=409, detail={"job_id": job_id, "status": job.status}) + return result + + +@router.get("/jobs/{job_id}/trace") +def get_workflow_job_trace(job_id: str) -> dict[str, Any]: + """Fetch persisted workflow trace for a completed or failed job.""" + job = get_job(job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + trace = get_job_trace(job_id) + if trace is None: + raise HTTPException(status_code=409, detail={"job_id": job_id, "status": job.status}) + return trace + + +@router.delete("/jobs/{job_id}") +def delete_workflow_job(job_id: str) -> dict[str, Any]: + """Delete one workflow job and cascade run-owned derived artifacts.""" + deleted = delete_job(job_id) + if deleted is None: + raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + return deleted + + @router.post("/dhis2-datavalue-set", response_model=WorkflowExecuteResponse) def run_dhis2_datavalue_set_workflow(payload: WorkflowExecuteEnvelopeRequest) -> WorkflowExecuteResponse: """Run workflow from a single flat request payload.""" @@ -48,6 +126,7 @@ def run_dhis2_datavalue_set_workflow(payload: WorkflowExecuteEnvelopeRequest) -> workflow_id=payload.request.workflow_id, request_params=payload.request.model_dump(), include_component_run_details=payload.request.include_component_run_details, + workflow_definition_source="catalog", ) @@ -61,6 +140,7 @@ def run_inline_assembled_workflow(payload: WorkflowAssemblyExecuteRequest) -> Wo workflow_definition=payload.workflow, request_params=payload.request.model_dump(exclude_none=True), include_component_run_details=payload.request.include_component_run_details, + workflow_definition_source="inline", ) diff --git 
a/src/eo_api/workflows/schemas.py b/src/eo_api/workflows/schemas.py index 09072dc..72e574d 100644 --- a/src/eo_api/workflows/schemas.py +++ b/src/eo_api/workflows/schemas.py @@ -126,11 +126,76 @@ class WorkflowExecuteResponse(BaseModel): component_run_details_available: bool = True +class WorkflowJobStatus(StrEnum): + """Native workflow job lifecycle states.""" + + ACCEPTED = "accepted" + RUNNING = "running" + SUCCESSFUL = "successful" + FAILED = "failed" + DISMISSED = "dismissed" + + +class WorkflowJobOrchestrationStep(BaseModel): + """Compact summary of one workflow step.""" + + component: str + version: str + execution_mode: str | None = None + + +class WorkflowJobOrchestration(BaseModel): + """Compact summary of workflow orchestration.""" + + definition_source: str + step_count: int + components: list[str] + steps: list[WorkflowJobOrchestrationStep] + + +class WorkflowJobRecord(BaseModel): + """Persisted workflow job metadata.""" + + job_id: str + process_id: str + workflow_id: str + workflow_version: int + dataset_id: str + status: WorkflowJobStatus + created_at: str + updated_at: str + request: dict[str, Any] + orchestration: WorkflowJobOrchestration + run_log_file: str | None = None + output_file: str | None = None + error: str | None = None + error_code: str | None = None + failed_component: str | None = None + failed_component_version: str | None = None + links: list[dict[str, Any]] = Field(default_factory=list) + + +class WorkflowJobStoredRecord(WorkflowJobRecord): + """Persisted workflow job metadata including internal result payload.""" + + run_id: str + result: dict[str, Any] | None = None + + +class WorkflowJobListResponse(BaseModel): + """List of persisted workflow jobs.""" + + jobs: list[WorkflowJobRecord] + + class WorkflowCatalogItem(BaseModel): """Discoverable workflow definition summary.""" workflow_id: str version: int + publication_publishable: bool + publication_intent: str | None = None + publication_exposure: str | None = None step_count: 
int components: list[str] diff --git a/src/eo_api/workflows/services/definitions.py b/src/eo_api/workflows/services/definitions.py index a05965c..56ff715 100644 --- a/src/eo_api/workflows/services/definitions.py +++ b/src/eo_api/workflows/services/definitions.py @@ -6,7 +6,9 @@ from typing import Any, Final, Literal import yaml -from pydantic import BaseModel, Field, model_validator +from pydantic import AliasChoices, BaseModel, Field, model_validator + +from ...publications.schemas import PublishedResourceExposure, PublishedResourceKind ComponentName = Literal[ "feature_source", @@ -59,11 +61,39 @@ def validate_component_version(self) -> "WorkflowStep": return self +class WorkflowPublicationPolicy(BaseModel): + """Publication policy for workflow outputs.""" + + publishable: bool = Field(default=False, validation_alias=AliasChoices("publishable", "enabled")) + strategy: Literal["on_success", "manual"] = Field( + default="on_success", + validation_alias=AliasChoices("strategy", "publish_strategy"), + ) + intent: PublishedResourceKind = Field( + default=PublishedResourceKind.FEATURE_COLLECTION, + validation_alias=AliasChoices("intent", "resource_kind"), + ) + exposure: PublishedResourceExposure = PublishedResourceExposure.REGISTRY_ONLY + required_output_file_suffixes: list[str] = Field(default_factory=list) + + @model_validator(mode="after") + def validate_publication_policy(self) -> "WorkflowPublicationPolicy": + """Restrict workflow-driven publication to currently supported resource types.""" + if self.publishable and self.intent != PublishedResourceKind.FEATURE_COLLECTION: + raise ValueError("Workflow publication currently supports only intent='feature_collection'") + normalized_suffixes = [] + for suffix in self.required_output_file_suffixes: + normalized_suffixes.append(suffix if suffix.startswith(".") else f".{suffix}") + self.required_output_file_suffixes = normalized_suffixes + return self + + class WorkflowDefinition(BaseModel): """Declarative workflow 
definition.""" workflow_id: str version: int = 1 + publication: WorkflowPublicationPolicy = Field(default_factory=WorkflowPublicationPolicy) steps: list[WorkflowStep] @model_validator(mode="after") diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py index 5f91716..bf00407 100644 --- a/src/eo_api/workflows/services/engine.py +++ b/src/eo_api/workflows/services/engine.py @@ -2,9 +2,11 @@ from __future__ import annotations +import os import time from collections.abc import Callable -from typing import Any +from pathlib import Path +from typing import Any, Literal import httpx from fastapi import HTTPException @@ -12,8 +14,11 @@ from ...components import services as component_services from ...data_registry.services.datasets import get_dataset -from ..schemas import WorkflowExecuteRequest, WorkflowExecuteResponse -from .definitions import WorkflowDefinition, load_workflow_definition +from ...publications.services import register_workflow_output_publication +from ..schemas import WorkflowExecuteRequest, WorkflowExecuteResponse, WorkflowJobStatus +from .definitions import WorkflowDefinition, WorkflowPublicationPolicy, load_workflow_definition +from .job_store import initialize_job, mark_job_failed, mark_job_running, mark_job_success +from .publication_assets import build_feature_collection_asset from .run_logs import persist_run_log from .runtime import WorkflowRuntime @@ -44,9 +49,12 @@ def execute_workflow( workflow_definition: WorkflowDefinition | None = None, request_params: dict[str, Any] | None = None, include_component_run_details: bool = False, + run_id: str | None = None, + workflow_definition_source: Literal["catalog", "inline"] = "catalog", ) -> WorkflowExecuteResponse: """Execute the feature->download->aggregate->DataValueSet workflow.""" - runtime = WorkflowRuntime() + runtime = WorkflowRuntime(run_id=run_id) + workflow: WorkflowDefinition | None = None dataset = get_dataset(request.dataset_id) if dataset is None: @@ 
-62,6 +70,18 @@ def execute_workflow( workflow = load_workflow_definition(workflow_id) except ValueError as exc: raise HTTPException(status_code=422, detail=str(exc)) from exc + + initialize_job( + job_id=runtime.run_id, + request=request, + request_payload=request_params, + workflow=workflow, + workflow_definition_source=workflow_definition_source, + workflow_id=workflow.workflow_id, + workflow_version=workflow.version, + status=WorkflowJobStatus.RUNNING, + ) + mark_job_running(runtime.run_id) _execute_workflow_steps( workflow=workflow, runtime=runtime, @@ -82,7 +102,7 @@ def execute_workflow( output_file=output_file, ) - return WorkflowExecuteResponse( + response = WorkflowExecuteResponse( status="completed", run_id=runtime.run_id, workflow_id=workflow.workflow_id, @@ -98,8 +118,27 @@ def execute_workflow( component_run_details_included=include_component_run_details, component_run_details_available=True, ) + mark_job_success(job_id=runtime.run_id, response=response) + if _should_publish_workflow_output( + response=response, + publication=workflow.publication, + workflow_definition_source=workflow_definition_source, + ): + publication_path, publication_asset_format = _build_publication_artifact( + response=response, + request=request, + publication=workflow.publication, + context=context, + ) + register_workflow_output_publication( + response=response, + exposure=workflow.publication.exposure, + published_path=publication_path, + asset_format=publication_asset_format, + ) + return response except WorkflowComponentError as exc: - persist_run_log( + run_log_file = persist_run_log( run_id=runtime.run_id, request=request, component_runs=runtime.component_runs, @@ -109,6 +148,15 @@ def execute_workflow( failed_component=exc.component, failed_component_version=exc.component_version, ) + if workflow is not None: + mark_job_failed( + job_id=runtime.run_id, + error=str(exc), + error_code=exc.error_code, + failed_component=exc.component, + 
failed_component_version=exc.component_version, + run_log_file=run_log_file, + ) error = "upstream_unreachable" if exc.error_code == "UPSTREAM_UNREACHABLE" else "workflow_execution_failed" raise HTTPException( status_code=exc.status_code, @@ -122,16 +170,18 @@ def execute_workflow( }, ) from exc except HTTPException: - persist_run_log( + run_log_file = persist_run_log( run_id=runtime.run_id, request=request, component_runs=runtime.component_runs, status="failed", error="http_exception", ) + if workflow is not None: + mark_job_failed(job_id=runtime.run_id, error="http_exception", run_log_file=run_log_file) raise except Exception as exc: - persist_run_log( + run_log_file = persist_run_log( run_id=runtime.run_id, request=request, component_runs=runtime.component_runs, @@ -139,6 +189,13 @@ def execute_workflow( error=str(exc), error_code="EXECUTION_FAILED", ) + if workflow is not None: + mark_job_failed( + job_id=runtime.run_id, + error=str(exc), + error_code="EXECUTION_FAILED", + run_log_file=run_log_file, + ) last_component = runtime.component_runs[-1].component if runtime.component_runs else "unknown" raise HTTPException( status_code=500, @@ -153,6 +210,54 @@ def execute_workflow( ) from exc +def _should_publish_workflow_output( + *, + response: WorkflowExecuteResponse, + publication: WorkflowPublicationPolicy, + workflow_definition_source: Literal["catalog", "inline"], +) -> bool: + """Apply workflow-level publication policy to a concrete workflow output.""" + if not publication.publishable: + return False + if publication.strategy != "on_success": + return False + if not _server_allows_workflow_publication(workflow_definition_source=workflow_definition_source): + return False + if publication.required_output_file_suffixes: + suffix = Path(response.output_file).suffix.lower() + return suffix in publication.required_output_file_suffixes + return True + + +def _server_allows_workflow_publication(*, workflow_definition_source: Literal["catalog", "inline"]) -> bool: + 
"""Apply server-side guardrails to workflow-driven publication.""" + if workflow_definition_source == "catalog": + return True + return os.environ.get("EO_API_ALLOW_INLINE_WORKFLOW_PUBLICATION", "").lower() in {"1", "true", "yes"} + + +def _build_publication_artifact( + *, + response: WorkflowExecuteResponse, + request: WorkflowExecuteRequest, + publication: WorkflowPublicationPolicy, + context: dict[str, Any], +) -> tuple[str, str]: + """Build the publication-facing artifact for a publishable workflow output.""" + if publication.intent.value == "feature_collection": + features = _require_context(context, "features") + records = _require_context(context, "records") + path = build_feature_collection_asset( + dataset_id=response.dataset_id, + features=features, + records=records, + period_type=request.temporal_aggregation.target_period_type, + feature_id_property=request.feature_source.feature_id_property, + ) + return path, "geojson" + return response.output_file, "datavalueset-json" + + def _is_upstream_connectivity_error(exc: Exception) -> bool: message = str(exc).lower() patterns = ( @@ -393,7 +498,13 @@ def _run_spatial_aggregation( method = request.spatial_aggregation.method feature_id_property = request.dhis2.org_unit_property execution_mode = str(step_config.get("execution_mode", "local")).lower() + temporal_dataset = context.get("temporal_dataset") if execution_mode == "remote": + if temporal_dataset is not None: + raise ValueError( + "remote spatial_aggregation does not yet support workflow temporal_aggregation output; " + "use local spatial_aggregation for temporally aggregated workflows" + ) records = runtime.run( "spatial_aggregation", _invoke_remote_spatial_aggregation_component, @@ -420,6 +531,7 @@ def _run_spatial_aggregation( features=_require_context(context, "features"), method=method, feature_id_property=feature_id_property, + aggregated_dataset=temporal_dataset, ) return {"records": records} diff --git a/src/eo_api/workflows/services/job_store.py 
b/src/eo_api/workflows/services/job_store.py new file mode 100644 index 0000000..0295364 --- /dev/null +++ b/src/eo_api/workflows/services/job_store.py @@ -0,0 +1,280 @@ +"""Disk-backed workflow job persistence.""" + +from __future__ import annotations + +import datetime as dt +import json +from pathlib import Path +from typing import Any, cast + +from ...data_manager.services.downloader import DOWNLOAD_DIR +from ...publications.pygeoapi import write_generated_pygeoapi_documents +from ...publications.services import delete_published_resource +from ..schemas import ( + WorkflowExecuteRequest, + WorkflowExecuteResponse, + WorkflowJobOrchestration, + WorkflowJobOrchestrationStep, + WorkflowJobRecord, + WorkflowJobStatus, + WorkflowJobStoredRecord, +) +from .definitions import WorkflowDefinition + +_DEFAULT_PROCESS_ID = "generic-dhis2-workflow" + + +def initialize_job( + *, + job_id: str, + request: WorkflowExecuteRequest, + request_payload: dict[str, Any] | None, + workflow: WorkflowDefinition, + workflow_definition_source: str, + workflow_id: str, + workflow_version: int, + status: WorkflowJobStatus = WorkflowJobStatus.RUNNING, + process_id: str = _DEFAULT_PROCESS_ID, +) -> WorkflowJobRecord: + """Create or replace a persisted job record.""" + existing = get_stored_job(job_id) + timestamp = _utc_now() + record = WorkflowJobStoredRecord( + job_id=job_id, + run_id=job_id, + process_id=process_id, + workflow_id=workflow_id, + workflow_version=workflow_version, + dataset_id=request.dataset_id, + status=status, + created_at=existing.created_at if existing is not None else timestamp, + updated_at=timestamp, + request=request_payload if request_payload is not None else request.model_dump(mode="json"), + orchestration=_build_orchestration_summary( + workflow=workflow, + workflow_definition_source=workflow_definition_source, + ), + run_log_file=existing.run_log_file if existing is not None else None, + output_file=existing.output_file if existing is not None else None, + 
result=existing.result if existing is not None else None, + error=existing.error if existing is not None else None, + error_code=existing.error_code if existing is not None else None, + failed_component=existing.failed_component if existing is not None else None, + failed_component_version=existing.failed_component_version if existing is not None else None, + ) + _write_job(record) + return record + + +def mark_job_running(job_id: str) -> WorkflowJobRecord: + """Transition an existing job to running.""" + record = _require_job(job_id) + updated = record.model_copy(update={"status": WorkflowJobStatus.RUNNING, "updated_at": _utc_now()}) + _write_job(updated) + return updated + + +def mark_job_success( + *, + job_id: str, + response: WorkflowExecuteResponse, +) -> WorkflowJobRecord: + """Persist successful job completion details.""" + record = _require_job(job_id) + updated = record.model_copy( + update={ + "status": WorkflowJobStatus.SUCCESSFUL, + "updated_at": _utc_now(), + "run_log_file": response.run_log_file, + "output_file": response.output_file, + "result": response.model_dump(mode="json"), + "error": None, + "error_code": None, + "failed_component": None, + "failed_component_version": None, + } + ) + _write_job(updated) + return updated + + +def mark_job_failed( + *, + job_id: str, + error: str, + error_code: str | None = None, + failed_component: str | None = None, + failed_component_version: str | None = None, + run_log_file: str | None = None, +) -> WorkflowJobRecord: + """Persist failed job details.""" + record = _require_job(job_id) + updated = record.model_copy( + update={ + "status": WorkflowJobStatus.FAILED, + "updated_at": _utc_now(), + "run_log_file": run_log_file or record.run_log_file, + "error": error, + "error_code": error_code, + "failed_component": failed_component, + "failed_component_version": failed_component_version, + "result": None, + } + ) + _write_job(updated) + return updated + + +def get_job(job_id: str) -> WorkflowJobRecord | None: + 
"""Load one persisted job if it exists.""" + record = get_stored_job(job_id) + if record is None: + return None + return _to_public_job_record(record) + + +def get_stored_job(job_id: str) -> WorkflowJobStoredRecord | None: + """Load one persisted job including internal result payload if it exists.""" + path = _job_path(job_id) + if not path.exists(): + return None + return WorkflowJobStoredRecord.model_validate_json(path.read_text(encoding="utf-8")) + + +def list_jobs(*, process_id: str | None = None, status: WorkflowJobStatus | None = None) -> list[WorkflowJobRecord]: + """List persisted jobs ordered by newest first.""" + jobs: list[WorkflowJobRecord] = [] + for path in _jobs_dir().glob("*.json"): + jobs.append( + _to_public_job_record(WorkflowJobStoredRecord.model_validate_json(path.read_text(encoding="utf-8"))) + ) + jobs.sort(key=lambda item: item.created_at, reverse=True) + if process_id is not None: + jobs = [job for job in jobs if job.process_id == process_id] + if status is not None: + jobs = [job for job in jobs if job.status == status] + return jobs + + +def get_job_result(job_id: str) -> dict[str, Any] | None: + """Return persisted workflow result payload for a completed job.""" + record = get_stored_job(job_id) + if record is None: + return None + return record.result + + +def get_job_trace(job_id: str) -> dict[str, Any] | None: + """Return persisted run-trace payload for a workflow job if available.""" + record = get_stored_job(job_id) + if record is None or record.run_log_file is None: + return None + path = Path(record.run_log_file) + if not path.exists(): + return None + return cast(dict[str, Any], json.loads(path.read_text(encoding="utf-8"))) + + +def delete_job(job_id: str) -> dict[str, Any] | None: + """Delete a job and cascade removal of run-owned derived artifacts.""" + record = get_stored_job(job_id) + if record is None: + return None + + deleted_paths: list[str] = [] + publication = delete_published_resource(f"workflow-output-{job_id}") + if 
publication is not None: + for candidate in (publication.path, publication.metadata.get("native_output_file")): + deleted = _delete_owned_path(candidate) + if deleted is not None: + deleted_paths.append(deleted) + + for candidate in (record.run_log_file, record.output_file): + deleted = _delete_owned_path(candidate) + if deleted is not None: + deleted_paths.append(deleted) + + job_path = _job_path(job_id) + if job_path.exists(): + job_path.unlink() + deleted_paths.append(str(job_path)) + + # Keep generated documents on disk aligned with current publication truth. + config_path, openapi_path = write_generated_pygeoapi_documents() + return { + "job_id": job_id, + "deleted": True, + "deleted_paths": deleted_paths, + "deleted_publication": publication.resource_id if publication is not None else None, + "materialized_config_path": str(config_path), + "materialized_openapi_path": str(openapi_path), + "pygeoapi_runtime_reload_required": True, + } + + +def _require_job(job_id: str) -> WorkflowJobStoredRecord: + record = get_stored_job(job_id) + if record is None: + raise ValueError(f"Unknown job_id '{job_id}'") + return record + + +def _write_job(record: WorkflowJobRecord) -> None: + _jobs_dir().mkdir(parents=True, exist_ok=True) + _job_path(record.job_id).write_text(record.model_dump_json(indent=2), encoding="utf-8") + + +def _job_path(job_id: str) -> Path: + return _jobs_dir() / f"{job_id}.json" + + +def _jobs_dir() -> Path: + return DOWNLOAD_DIR / "workflow_jobs" + + +def _utc_now() -> str: + return dt.datetime.now(dt.timezone.utc).isoformat() + + +def _to_public_job_record(record: WorkflowJobStoredRecord) -> WorkflowJobRecord: + data = record.model_dump(mode="json") + data.pop("run_id", None) + data.pop("result", None) + return WorkflowJobRecord.model_validate(data) + + +def _build_orchestration_summary( + *, + workflow: WorkflowDefinition, + workflow_definition_source: str, +) -> WorkflowJobOrchestration: + return WorkflowJobOrchestration( + 
definition_source=workflow_definition_source, + step_count=len(workflow.steps), + components=[step.component for step in workflow.steps], + steps=[ + WorkflowJobOrchestrationStep( + component=step.component, + version=step.version, + execution_mode=cast(str | None, step.config.get("execution_mode")), + ) + for step in workflow.steps + ], + ) + + +def _delete_owned_path(path_value: Any) -> str | None: + if not isinstance(path_value, str) or path_value == "": + return None + path = Path(path_value) + if not path.exists() or not path.is_file(): + return None + try: + resolved = path.resolve() + downloads_root = DOWNLOAD_DIR.resolve() + except OSError: + return None + if downloads_root not in resolved.parents: + return None + path.unlink() + return str(path) diff --git a/src/eo_api/workflows/services/publication_assets.py b/src/eo_api/workflows/services/publication_assets.py new file mode 100644 index 0000000..a5022e2 --- /dev/null +++ b/src/eo_api/workflows/services/publication_assets.py @@ -0,0 +1,79 @@ +"""Build OGC-ready publication assets from workflow execution context.""" + +from __future__ import annotations + +import datetime as dt +import json +from typing import Any + +import numpy as np + +from ...data_manager.services.downloader import DOWNLOAD_DIR +from ..schemas import PeriodType +from .features import feature_id + + +def build_feature_collection_asset( + *, + dataset_id: str, + features: dict[str, Any], + records: list[dict[str, Any]], + period_type: PeriodType, + feature_id_property: str = "id", +) -> str: + """Write a GeoJSON FeatureCollection derived from workflow records and features.""" + features_by_id = {feature_id(feature, feature_id_property): feature for feature in features.get("features", [])} + output_features: list[dict[str, Any]] = [] + for index, record in enumerate(records): + org_unit = str(record["org_unit"]) + source_feature = features_by_id.get(org_unit) + if source_feature is None: + continue + properties = 
source_feature.get("properties", {}) + output_features.append( + { + "type": "Feature", + "id": f"{org_unit}-{record['time']}-{index}", + "geometry": source_feature.get("geometry"), + "properties": { + "org_unit": org_unit, + "org_unit_name": _org_unit_name(properties), + "period": _format_period(record["time"], period_type), + "value": record["value"], + }, + } + ) + + collection = {"type": "FeatureCollection", "features": output_features} + return _write_feature_collection(collection=collection, dataset_id=dataset_id) + + +def _write_feature_collection(*, collection: dict[str, Any], dataset_id: str) -> str: + DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True) + now = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ") + path = DOWNLOAD_DIR / f"{dataset_id}_feature_collection_{now}.geojson" + path.write_text(json.dumps(collection, indent=2), encoding="utf-8") + return str(path) + + +def _format_period(time_value: Any, period_type: PeriodType) -> str: + ts = np.datetime64(time_value) + s = np.datetime_as_string(ts, unit="D") + year, month, day = s.split("-") + if period_type == PeriodType.DAILY: + return f"{year}-{month}-{day}" + if period_type == PeriodType.MONTHLY: + return f"{year}-{month}" + if period_type == PeriodType.YEARLY: + return year + if period_type == PeriodType.HOURLY: + return np.datetime_as_string(ts, unit="h") + return s + + +def _org_unit_name(properties: dict[str, Any]) -> str | None: + for key in ("name", "displayName", "org_unit_name"): + value = properties.get(key) + if isinstance(value, str) and value.strip(): + return value + return None diff --git a/src/eo_api/workflows/services/runtime.py b/src/eo_api/workflows/services/runtime.py index 905ba54..10a0aac 100644 --- a/src/eo_api/workflows/services/runtime.py +++ b/src/eo_api/workflows/services/runtime.py @@ -14,8 +14,8 @@ class WorkflowRuntime: """Capture execution metadata for component orchestration.""" - def __init__(self) -> None: - self.run_id = str(uuid.uuid4()) + def 
__init__(self, *, run_id: str | None = None) -> None: + self.run_id = run_id or str(uuid.uuid4()) self.component_runs: list[ComponentRun] = [] def run(self, component: str, fn: Callable[..., Any], **kwargs: Any) -> Any: diff --git a/tests/conftest.py b/tests/conftest.py index 9c1b3c2..a92c038 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,28 @@ +from pathlib import Path + import pytest from fastapi.testclient import TestClient from eo_api.main import app +from eo_api.publications import pygeoapi as publication_pygeoapi +from eo_api.publications import services as publication_services +from eo_api.workflows.services import datavalueset, job_store, publication_assets, run_logs @pytest.fixture def client() -> TestClient: return TestClient(app) + + +@pytest.fixture(autouse=True) +def isolate_download_artifacts(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """Keep workflow/publication tests from writing into the repo download dir.""" + isolated_download_dir = tmp_path / "downloads" + isolated_download_dir.mkdir(parents=True, exist_ok=True) + + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(datavalueset, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(publication_assets, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", isolated_download_dir) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", isolated_download_dir) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 27577db..a3ea969 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1,5 +1,7 @@ from __future__ import annotations +import os +from pathlib import Path from typing import Any, cast import numpy as np @@ -10,8 +12,10 @@ from fastapi.testclient import TestClient from eo_api.main import app +from eo_api.publications import pygeoapi as publication_pygeoapi 
+from eo_api.publications import services as publication_services from eo_api.workflows.schemas import WorkflowExecuteRequest, WorkflowExecuteResponse, WorkflowRequest -from eo_api.workflows.services import engine +from eo_api.workflows.services import engine, job_store, run_logs from eo_api.workflows.services.definitions import WorkflowDefinition, load_workflow_definition from eo_api.workflows.services.simple_mapper import normalize_simple_request @@ -34,6 +38,38 @@ def _valid_public_payload() -> dict[str, Any]: } +def _patch_successful_execution(monkeypatch: pytest.MonkeyPatch) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0.0, 0.0, 1.0, 1.0], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), + ) + + def test_workflow_endpoint_exists_once() -> None: workflow_routes = { route.path @@ -43,6 +79,47 @@ def test_workflow_endpoint_exists_once() -> None: assert workflow_routes == {"/workflows/dhis2-datavalue-set", "/workflows/execute", "/workflows/validate"} +def test_ogc_process_routes_exist() -> None: + ogc_routes = { + route.path for route in app.routes if 
isinstance(route, APIRoute) and route.path.startswith("/ogcapi") + } + assert "/ogcapi/processes" in ogc_routes + assert "/ogcapi/processes/{process_id}" in ogc_routes + assert "/ogcapi/processes/{process_id}/execution" in ogc_routes + assert "/ogcapi/jobs" in ogc_routes + assert "/ogcapi/jobs/{job_id}" in ogc_routes + assert "/ogcapi/jobs/{job_id}/results" in ogc_routes + + +def test_publication_generated_pygeoapi_routes_exist() -> None: + publication_routes = { + route.path + for route in app.routes + if isinstance(route, APIRoute) and route.path.startswith("/publications/pygeoapi") + } + assert "/publications/pygeoapi/config" in publication_routes + assert "/publications/pygeoapi/openapi" in publication_routes + assert "/publications/pygeoapi/materialize" in publication_routes + + +def test_pygeoapi_runtime_env_points_to_generated_documents() -> None: + config_path = os.environ.get("PYGEOAPI_CONFIG") + openapi_path = os.environ.get("PYGEOAPI_OPENAPI") + assert config_path is not None + assert openapi_path is not None + assert config_path.endswith("pygeoapi-config.generated.yml") + assert openapi_path.endswith("pygeoapi-openapi.generated.yml") + assert Path(config_path).exists() + assert Path(openapi_path).exists() + + +def test_pygeoapi_mount_serves_landing_page(client: TestClient) -> None: + response = client.get("/ogcapi?f=json") + assert response.status_code == 200 + body = response.json() + assert body["title"] == "DHIS2 EO API" + + def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClient) -> None: response = client.get("/workflows") assert response.status_code == 200 @@ -53,6 +130,9 @@ def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClie default = by_id["dhis2_datavalue_set_v1"] assert default["version"] == 1 + assert default["publication_publishable"] is True + assert default["publication_intent"] == "feature_collection" + assert default["publication_exposure"] == "ogc" assert default["step_count"] == 5 
assert default["components"] == [ "feature_source", @@ -64,6 +144,9 @@ def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClie fast = by_id["dhis2_datavalue_set_without_temporal_aggregation_v1"] assert fast["version"] == 1 + assert fast["publication_publishable"] is False + assert fast["publication_intent"] is None + assert fast["publication_exposure"] is None assert fast["step_count"] == 4 assert fast["components"] == [ "feature_source", @@ -136,8 +219,9 @@ def _execute_stub( workflow_id: str = "dhis2_datavalue_set_v1", request_params: dict[str, Any] | None = None, include_component_run_details: bool = False, + workflow_definition_source: str = "catalog", ) -> WorkflowExecuteResponse: - del payload, workflow_id, request_params, include_component_run_details + del payload, workflow_id, request_params, include_component_run_details, workflow_definition_source return stub monkeypatch.setattr( @@ -198,8 +282,9 @@ def _execute_stub( workflow_id: str = "dhis2_datavalue_set_v1", request_params: dict[str, Any] | None = None, include_component_run_details: bool = False, + workflow_definition_source: str = "catalog", ) -> WorkflowExecuteResponse: - del payload, workflow_id, request_params, include_component_run_details + del payload, workflow_id, request_params, include_component_run_details, workflow_definition_source return stub monkeypatch.setattr("eo_api.workflows.routes.normalize_simple_request", lambda payload: (normalized, [])) @@ -235,10 +320,12 @@ def _execute_stub( workflow_definition: WorkflowDefinition | None = None, request_params: dict[str, Any] | None = None, include_component_run_details: bool = False, + workflow_definition_source: str = "inline", ) -> WorkflowExecuteResponse: del payload, request_params, include_component_run_details assert workflow_id == "adhoc_dhis2_v1" assert workflow_definition is not None + assert workflow_definition_source == "inline" assert workflow_definition.workflow_id == "adhoc_dhis2_v1" assert 
len(workflow_definition.steps) == 4 return stub @@ -376,6 +463,453 @@ def test_workflow_validate_endpoint_unknown_workflow_id(client: TestClient) -> N assert "Unknown workflow_id" in body["errors"][0] +def test_workflow_job_endpoints_return_persisted_result( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + job_response = client.get(f"/workflows/jobs/{run_id}") + assert job_response.status_code == 200 + job_body = job_response.json() + assert job_body["job_id"] == run_id + assert job_body["status"] == "successful" + assert job_body["process_id"] == "generic-dhis2-workflow" + assert job_body["request"]["dataset_id"] == "chirps3_precipitation_daily" + assert job_body["request"]["start_date"] == "2024-01-01" + assert job_body["request"]["end_date"] == "2024-01-31" + assert job_body["orchestration"]["definition_source"] == "catalog" + assert job_body["orchestration"]["step_count"] == 5 + assert job_body["orchestration"]["components"] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + assert job_body["orchestration"]["steps"][0]["component"] == "feature_source" + assert job_body["orchestration"]["steps"][0]["version"] == "v1" + links = {item["rel"]: item["href"] for item in job_body["links"]} + assert links["self"].endswith(f"/workflows/jobs/{run_id}") + assert links["result"].endswith(f"/workflows/jobs/{run_id}/result") + assert links["trace"].endswith(f"/workflows/jobs/{run_id}/trace") + assert links["collection"].endswith(f"/ogcapi/collections/workflow-output-{run_id}") + assert "result" not in job_body + + results_response = 
client.get(f"/workflows/jobs/{run_id}/result") + assert results_response.status_code == 200 + assert results_response.json()["run_id"] == run_id + + trace_response = client.get(f"/workflows/jobs/{run_id}/trace") + assert trace_response.status_code == 200 + trace_body = trace_response.json() + assert trace_body["run_id"] == run_id + assert trace_body["status"] == "completed" + assert [item["component"] for item in trace_body["component_runs"]] == [ + "feature_source", + "download_dataset", + "temporal_aggregation", + "spatial_aggregation", + "build_datavalueset", + ] + + +def test_delete_workflow_job_cascades_derived_artifacts( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + output_path = job_store.DOWNLOAD_DIR / "cascade-test-datavalue-set.json" + output_path.write_text('{"dataValues": [{"value": "10.0"}]}', encoding="utf-8") + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, str(output_path)), + ) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + output_file = Path(response.json()["output_file"]) + run_log_file = Path(response.json()["run_log_file"]) + + publications_response = client.get("/publications", params={"workflow_id": "dhis2_datavalue_set_v1"}) + assert publications_response.status_code == 200 + derived = next( + item for item in publications_response.json()["resources"] if item["resource_id"] == f"workflow-output-{run_id}" + ) + publication_file = publication_services.DOWNLOAD_DIR / "published_resources" / f"workflow-output-{run_id}.json" + publication_asset = Path(derived["path"]) + job_file = job_store.DOWNLOAD_DIR / "workflow_jobs" / f"{run_id}.json" + + assert job_file.exists() + assert run_log_file.exists() + assert output_file.exists() + assert publication_file.exists() + 
assert publication_asset.exists() + + delete_response = client.delete(f"/workflows/jobs/{run_id}") + assert delete_response.status_code == 200 + delete_body = delete_response.json() + assert delete_body["job_id"] == run_id + assert delete_body["deleted"] is True + assert delete_body["deleted_publication"] == f"workflow-output-{run_id}" + assert delete_body["pygeoapi_runtime_reload_required"] is True + + assert not job_file.exists() + assert not run_log_file.exists() + assert not output_file.exists() + assert not publication_file.exists() + assert not publication_asset.exists() + + job_response = client.get(f"/workflows/jobs/{run_id}") + assert job_response.status_code == 404 + + publication_response = client.get(f"/publications/workflow-output-{run_id}") + assert publication_response.status_code == 404 + + +def test_ogc_async_execution_creates_job_and_results( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + + response = client.post( + "/ogcapi/processes/generic-dhis2-workflow/execution", + headers={"Prefer": "respond-async"}, + json=_valid_public_payload(), + ) + assert response.status_code == 202 + body = response.json() + assert body["status"] == "accepted" + job_id = body["jobID"] + + job_response = client.get(f"/ogcapi/jobs/{job_id}") + assert job_response.status_code == 200 + assert job_response.json()["status"] == "successful" + + results_response = client.get(f"/ogcapi/jobs/{job_id}/results") + assert results_response.status_code == 200 + assert results_response.json()["run_id"] == job_id + + +def test_publications_endpoint_seeds_source_datasets( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + + response = client.get("/publications") + assert 
response.status_code == 200 + body = response.json() + resource_ids = {item["resource_id"] for item in body["resources"]} + assert "dataset-chirps3_precipitation_daily" in resource_ids + assert "dataset-worldpop_population_yearly" in resource_ids + + +def test_generated_pygeoapi_config_reflects_collection_registry( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + + response = client.get("/publications/pygeoapi/config") + assert response.status_code == 200 + body = response.json() + resources = body["resources"] + assert len(resources) > 0 + assert "chirps3_precipitation_daily" in resources + first = resources["chirps3_precipitation_daily"] + assert first["type"] == "collection" + assert "title" in first + + +def test_generated_pygeoapi_config_contains_collection_detail( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + + response = client.get("/publications/pygeoapi/config") + assert response.status_code == 200 + collection = response.json()["resources"]["chirps3_precipitation_daily"] + assert collection["type"] == "collection" + assert collection["title"]["en"] + assert collection["providers"][0]["type"] == "coverage" + + +def test_workflow_success_registers_derived_publication( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + + response = 
client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + publications_response = client.get("/publications", params={"workflow_id": "dhis2_datavalue_set_v1"}) + assert publications_response.status_code == 200 + resources = publications_response.json()["resources"] + derived = next(item for item in resources if item["resource_id"] == f"workflow-output-{run_id}") + assert derived["resource_class"] == "derived" + assert derived["job_id"] == run_id + assert derived["ogc_path"] == f"/ogcapi/collections/workflow-output-{run_id}" + assert derived["exposure"] == "ogc" + assert derived["asset_format"] == "geojson" + assert derived["path"].endswith(".geojson") + assert derived["metadata"]["native_output_file"].endswith(".json") + geojson = Path(derived["path"]).read_text(encoding="utf-8") + assert '"org_unit_name"' in geojson + assert '"period": "2024-01"' in geojson + assert '"period_type"' not in geojson + assert '"dataset_id"' not in geojson + + +def test_workflow_with_publication_disabled_does_not_register_derived_publication( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + + payload = _valid_public_payload() + payload["request"]["workflow_id"] = "dhis2_datavalue_set_without_temporal_aggregation_v1" + + response = client.post("/workflows/dhis2-datavalue-set", json=payload) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + publications_response = client.get( + "/publications", + params={"workflow_id": "dhis2_datavalue_set_without_temporal_aggregation_v1"}, + ) + assert publications_response.status_code == 200 + resources = publications_response.json()["resources"] + assert 
all(item["resource_id"] != f"workflow-output-{run_id}" for item in resources) + + +def test_inline_workflow_publication_intent_is_blocked_by_server_guardrail( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.delenv("EO_API_ALLOW_INLINE_WORKFLOW_PUBLICATION", raising=False) + _patch_successful_execution(monkeypatch) + + payload = { + "workflow": { + "workflow_id": "adhoc_chirps_mixed_exec_v1", + "version": 1, + "publication": { + "publishable": True, + "strategy": "on_success", + "intent": "feature_collection", + }, + "steps": [ + {"component": "feature_source", "version": "v1"}, + {"component": "download_dataset", "version": "v1"}, + {"component": "temporal_aggregation", "version": "v1"}, + {"component": "spatial_aggregation", "version": "v1"}, + {"component": "build_datavalueset", "version": "v1"}, + ], + }, + "request": _valid_public_payload()["request"] | {"workflow_id": "adhoc_chirps_mixed_exec_v1"}, + } + + response = client.post("/workflows/execute", json=payload) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + publications_response = client.get("/publications", params={"workflow_id": "adhoc_chirps_mixed_exec_v1"}) + assert publications_response.status_code == 200 + resources = publications_response.json()["resources"] + assert all(item["resource_id"] != f"workflow-output-{run_id}" for item in resources) + + +def test_inline_workflow_publication_intent_can_be_enabled_by_server_policy( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + 
monkeypatch.setenv("EO_API_ALLOW_INLINE_WORKFLOW_PUBLICATION", "true") + _patch_successful_execution(monkeypatch) + + payload = { + "workflow": { + "workflow_id": "adhoc_chirps_mixed_exec_v1", + "version": 1, + "publication": { + "publishable": True, + "strategy": "on_success", + "intent": "feature_collection", + }, + "steps": [ + {"component": "feature_source", "version": "v1"}, + {"component": "download_dataset", "version": "v1"}, + {"component": "temporal_aggregation", "version": "v1"}, + {"component": "spatial_aggregation", "version": "v1"}, + {"component": "build_datavalueset", "version": "v1"}, + ], + }, + "request": _valid_public_payload()["request"] | {"workflow_id": "adhoc_chirps_mixed_exec_v1"}, + } + + response = client.post("/workflows/execute", json=payload) + assert response.status_code == 200 + run_id = response.json()["run_id"] + + publications_response = client.get("/publications", params={"workflow_id": "adhoc_chirps_mixed_exec_v1"}) + assert publications_response.status_code == 200 + resources = publications_response.json()["resources"] + derived = next(item for item in resources if item["resource_id"] == f"workflow-output-{run_id}") + assert derived["workflow_id"] == "adhoc_chirps_mixed_exec_v1" + assert derived["exposure"] == "registry_only" + + +def test_ogc_process_sync_execution_links_to_collection( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + + response = client.post("/ogcapi/processes/generic-dhis2-workflow/execution", json=_valid_public_payload()) + assert response.status_code == 200 + body = response.json() + collection_links = [item for item in body["links"] if item["rel"] == "collection"] + assert len(collection_links) == 1 + assert 
"/ogcapi/collections/workflow-output-" in collection_links[0]["href"] + + +def test_generated_pygeoapi_config_reflects_publication_registry( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + + response = client.get("/publications/pygeoapi/config") + assert response.status_code == 200 + body = response.json() + resources = body["resources"] + assert "chirps3_precipitation_daily" in resources + chirps = resources["chirps3_precipitation_daily"] + assert chirps["type"] == "collection" + assert chirps["providers"][0]["type"] == "coverage" + assert chirps["metadata"]["dataset_id"] == "chirps3_precipitation_daily" + + +def test_generated_pygeoapi_openapi_includes_derived_collection( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + _patch_successful_execution(monkeypatch) + + workflow_response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert workflow_response.status_code == 200 + run_id = workflow_response.json()["run_id"] + + response = client.get("/publications/pygeoapi/openapi") + assert response.status_code == 200 + body = response.json() + assert "/collections/chirps3_precipitation_daily" in body["paths"] + 
assert "chirps3_precipitation_daily" in body["x-generated-resources"] + assert f"/collections/workflow-output-{run_id}" in body["paths"] + assert f"workflow-output-{run_id}" in body["x-generated-resources"] + + +def test_generated_pygeoapi_config_includes_geojson_derived_resource( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + _patch_successful_execution(monkeypatch) + + workflow_response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert workflow_response.status_code == 200 + run_id = workflow_response.json()["run_id"] + + response = client.get("/publications/pygeoapi/config") + assert response.status_code == 200 + resources = response.json()["resources"] + derived = resources[f"workflow-output-{run_id}"] + assert derived["providers"][0]["name"] == "GeoJSON" + assert derived["providers"][0]["type"] == "feature" + assert derived["providers"][0]["data"].endswith(".geojson") + + +def test_materialize_generated_pygeoapi_documents_writes_files( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + + response = client.post("/publications/pygeoapi/materialize") + assert response.status_code == 200 + body = response.json() + config_path = Path(body["config_path"]) + openapi_path = Path(body["openapi_path"]) + assert config_path.exists() + assert openapi_path.exists() + assert "resources:" in 
config_path.read_text(encoding="utf-8") + + def test_component_spatial_aggregation_serializes_numpy_datetime64( client: TestClient, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -486,6 +1020,56 @@ def _download_dataset_component(**kwargs: Any) -> None: assert called["downloaded"] is True +def test_engine_spatial_aggregation_uses_temporally_aggregated_dataset(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + temporal_ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[31.0]]])}, + coords={"time": ["2024-01"], "lat": [0], "lon": [0]}, + ) + + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: temporal_ds) + + def _spatial_aggregation_component(**kwargs: Any) -> list[dict[str, Any]]: + assert kwargs["aggregated_dataset"] is temporal_ds + return [{"org_unit": "OU_1", "time": "2024-01", "value": 31.0}] + + monkeypatch.setattr(engine.component_services, "spatial_aggregation_component", _spatial_aggregation_component) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "31.0", "period": "202401"}]}, 
"/tmp/data/out.json"), + ) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + response = engine.execute_workflow(request, include_component_run_details=True) + assert response.status == "completed" + assert response.data_value_set["dataValues"][0]["period"] == "202401" + + def test_engine_hides_component_details_by_default(monkeypatch: pytest.MonkeyPatch) -> None: request = WorkflowExecuteRequest.model_validate( { @@ -535,6 +1119,68 @@ def test_engine_hides_component_details_by_default(monkeypatch: pytest.MonkeyPat assert response.component_run_details_available is True +def test_engine_rejects_remote_spatial_after_temporal_aggregation(monkeypatch: pytest.MonkeyPatch) -> None: + request = WorkflowExecuteRequest.model_validate( + { + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "feature_source": {"source_type": "dhis2_level", "dhis2_level": 3}, + "temporal_aggregation": {"target_period_type": "monthly", "method": "sum"}, + "spatial_aggregation": {"method": "mean"}, + "dhis2": {"data_element_uid": "abc123def45"}, + } + ) + temporal_ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[31.0]]])}, + coords={"time": ["2024-01"], "lat": [0], "lon": [0]}, + ) + workflow = WorkflowDefinition.model_validate( + { + "workflow_id": "dhis2_datavalue_set_v1", + "version": 1, + "steps": [ + {"component": "feature_source"}, + {"component": "download_dataset"}, + {"component": "temporal_aggregation"}, + { + "component": "spatial_aggregation", + "config": { + "execution_mode": "remote", + "remote_url": "http://localhost:8000/components/spatial-aggregation", + }, + }, + {"component": "build_datavalueset"}, + ], + } + ) + + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": "chirps3_precipitation_daily", "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": 
"FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0, 0, 1, 1], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: temporal_ds) + monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") + + with pytest.raises(HTTPException) as exc_info: + engine.execute_workflow(request, workflow_definition=workflow) + + assert exc_info.value.status_code == 500 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["failed_component"] == "spatial_aggregation" + assert "local spatial_aggregation" in detail["message"] + + def test_engine_returns_503_when_upstream_unreachable(monkeypatch: pytest.MonkeyPatch) -> None: request = WorkflowExecuteRequest.model_validate( { @@ -1043,7 +1689,9 @@ def test_engine_rejects_remote_fields_in_local_mode(monkeypatch: pytest.MonkeyPa assert detail["failed_component"] == "download_dataset" -def test_engine_supports_remote_mode_for_all_components(monkeypatch: pytest.MonkeyPatch) -> None: +def test_engine_supports_remote_mode_for_remote_compatible_component_chain( + monkeypatch: pytest.MonkeyPatch, +) -> None: request = WorkflowExecuteRequest.model_validate( { "dataset_id": "chirps3_precipitation_daily", @@ -1074,13 +1722,6 @@ def test_engine_supports_remote_mode_for_all_components(monkeypatch: pytest.Monk "remote_url": "http://x/components/download-dataset", }, }, - { - "component": "temporal_aggregation", - "config": { - "execution_mode": "remote", - "remote_url": "http://x/components/temporal-aggregation", - }, - }, { "component": "spatial_aggregation", "config": { @@ -1108,7 +1749,6 @@ def test_engine_supports_remote_mode_for_all_components(monkeypatch: pytest.Monk called: dict[str, bool] = { "feature": False, "download": False, - "temporal": False, "spatial": False, "build": False, } @@ -1127,11 
+1767,6 @@ def test_engine_supports_remote_mode_for_all_components(monkeypatch: pytest.Monk "_invoke_remote_download_component", lambda **kwargs: called.__setitem__("download", True), ) - monkeypatch.setattr( - engine, - "_invoke_remote_temporal_aggregation_component", - lambda **kwargs: (called.__setitem__("temporal", True), {"sizes": {"time": 1}, "dims": ["time"]})[1], - ) monkeypatch.setattr( engine, "_invoke_remote_spatial_aggregation_component", From d535395bbca3de27a4ac5d4538d372369cc82434 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Wed, 18 Mar 2026 16:04:42 +0100 Subject: [PATCH 07/15] Add dynamic OGC collection browsing and embedded analytics --- docs/internal/SESSION_HANDOFF_2026-03-18.md | 180 +++ src/eo_api/analytics_viewer/__init__.py | 5 + src/eo_api/analytics_viewer/routes.py | 493 +++++++++ src/eo_api/components/services.py | 15 + src/eo_api/main.py | 3 +- src/eo_api/ogc/routes.py | 1088 ++++++++++++++++++- src/eo_api/publications/pygeoapi.py | 6 +- src/eo_api/publications/services.py | 1 + src/eo_api/workflows/routes.py | 127 ++- src/eo_api/workflows/schemas.py | 34 + src/eo_api/workflows/services/engine.py | 105 +- src/eo_api/workflows/services/job_store.py | 59 + tests/test_workflows.py | 232 +++- 13 files changed, 2296 insertions(+), 52 deletions(-) create mode 100644 docs/internal/SESSION_HANDOFF_2026-03-18.md create mode 100644 src/eo_api/analytics_viewer/__init__.py create mode 100644 src/eo_api/analytics_viewer/routes.py diff --git a/docs/internal/SESSION_HANDOFF_2026-03-18.md b/docs/internal/SESSION_HANDOFF_2026-03-18.md new file mode 100644 index 0000000..96421a4 --- /dev/null +++ b/docs/internal/SESSION_HANDOFF_2026-03-18.md @@ -0,0 +1,180 @@ +# Session Handoff - 2026-03-18 + +## Stop Point + +This is a clean demo checkpoint. + +The system now has: + +1. native workflow execution and job persistence as backend truth +2. publication registration via `PublishedResource` +3. 
dynamic OGC collection/detail/items routes backed directly by live publication state +4. a pluggable analytics viewer mounted outside the OGC core +5. OGC HTML items pages that can switch between `Browse` and embedded `Analytics` modes + +The important operational improvement is that collection publish/delete visibility no longer requires a restart. + +--- + +## What Changed This Session + +### 1. Dynamic OGC collection surface + +Implemented in: + +1. [src/eo_api/ogc/routes.py](../../src/eo_api/ogc/routes.py) + +Behavior: + +1. `/ogcapi/collections` +2. `/ogcapi/collections/{collection_id}` +3. `/ogcapi/collections/{collection_id}/items` + +are now served natively from live publication truth instead of relying on startup-loaded `pygeoapi` collection state. + +Result: + +1. new derived publications appear immediately +2. deleted workflow-output collections disappear immediately +3. no restart is needed for collection visibility changes + +### 2. OGC HTML became first-class + +The OGC HTML pages are now intentionally controlled rather than inherited utility pages. + +Current state: + +1. collections page uses a scalable list/table layout +2. collection pages have clearer representation labeling +3. collection items pages have explicit OGC navigation and back links +4. items pages support period filtering in HTML +5. items pages now have two modes: + - `Browse` + - `Analytics` + +### 3. Analytics viewer remained pluggable but is now embedded + +Implemented in: + +1. [src/eo_api/analytics_viewer/routes.py](../../src/eo_api/analytics_viewer/routes.py) + +Current model: + +1. `/analytics/...` still exists as the pluggable analytics module +2. the OGC items HTML page can embed that module in place +3. this keeps the implementation swappable while avoiding a detached user journey + +### 4. Published workflow output representation improved + +For derived feature collections: + +1. 
published properties were cleaned to focus on: + - `org_unit` + - `org_unit_name` + - `period` + - `value` +2. precipitation views now use a blue value ramp +3. OGC collection tables distinguish: + - source dataset + - native workflow output + - OGC representation type + +### 5. Workflow runtime contracts were tightened + +Implemented in: + +1. [src/eo_api/workflows/services/engine.py](../../src/eo_api/workflows/services/engine.py) +2. [src/eo_api/components/services.py](../../src/eo_api/components/services.py) + +Behavior: + +1. workflow step handoff uses typed artifacts instead of a loose context dict +2. temporal aggregation can no-op/pass through when the source period already matches the requested period +3. orchestration wires artifacts; components own pass-through decisions + +### 6. Retention cleanup exists + +Implemented in: + +1. [src/eo_api/workflows/routes.py](../../src/eo_api/workflows/routes.py) +2. [src/eo_api/workflows/services/job_store.py](../../src/eo_api/workflows/services/job_store.py) + +Endpoint: + +1. `POST /workflows/jobs/cleanup` + +Policy knobs: + +1. `dry_run` +2. `keep_latest` +3. `older_than_hours` + +Cleanup cascades through: + +1. job record +2. run trace +3. native workflow output +4. derived publication record +5. derived publication asset + +--- + +## Current UX Shape + +The intended human-facing entry path is now: + +1. `/ogcapi/collections?f=html` +2. select a collection +3. open `/ogcapi/collections/{collection_id}/items?f=html` +4. switch between: + - `Browse` + - `Analytics` + +This keeps the user inside the OGC page flow while still using the pluggable analytics module underneath. + +--- + +## Standards Boundary + +The current discipline is: + +1. OGC JSON/resource shape stays standards-oriented +2. HTML is allowed to be product-friendly +3. current `period=` handling is an HTML convenience +4. 
long-term machine filtering should move toward CQL2 rather than grow more ad hoc parameters + +--- + +## Verification State + +At stop: + +1. `uv run pytest -q tests/test_workflows.py` passes +2. `make lint` passes + +--- + +## Recommended Next Step + +The next meaningful architectural step is: + +1. strengthen error handling and response envelopes across the workflow/OGC surfaces + +Followed by: + +1. decide whether period filtering should begin moving toward CQL2-style handling for machine clients +2. continue tightening component contracts only where real ambiguity remains + +--- + +## Demo Notes + +Good demo flow: + +1. execute a publishable workflow +2. show `/workflows/jobs/{job_id}` +3. show `/ogcapi/collections` +4. open the derived workflow-output collection +5. open items HTML +6. switch between `Browse` and `Analytics` +7. optionally delete the job and refresh collections to show immediate disappearance without restart diff --git a/src/eo_api/analytics_viewer/__init__.py b/src/eo_api/analytics_viewer/__init__.py new file mode 100644 index 0000000..f3546a5 --- /dev/null +++ b/src/eo_api/analytics_viewer/__init__.py @@ -0,0 +1,5 @@ +"""Time-aware analytics viewer module.""" + +from . 
import routes as routes + +__all__ = ["routes"] diff --git a/src/eo_api/analytics_viewer/routes.py b/src/eo_api/analytics_viewer/routes.py new file mode 100644 index 0000000..12785f6 --- /dev/null +++ b/src/eo_api/analytics_viewer/routes.py @@ -0,0 +1,493 @@ +"""Pluggable time-aware analytics viewer routes.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +from fastapi import APIRouter, HTTPException +from fastapi.responses import HTMLResponse + +from ..publications import services as publication_services +from ..publications.schemas import PublishedResourceKind + +router = APIRouter() + + +@router.get("/publications/{resource_id}") +def get_publication_analytics_config(resource_id: str) -> dict[str, Any]: + """Return viewer configuration for one published resource.""" + resource = publication_services.get_published_resource(resource_id) + if resource is None: + raise HTTPException(status_code=404, detail=f"Unknown resource_id '{resource_id}'") + if resource.kind != PublishedResourceKind.FEATURE_COLLECTION or resource.path is None: + raise HTTPException( + status_code=409, + detail=f"Resource '{resource_id}' is not a feature collection viewer target", + ) + + data_url = _data_url_for_path(resource.path) + return { + "resource_id": resource.resource_id, + "title": resource.title, + "description": resource.description, + "dataset_id": resource.dataset_id, + "workflow_id": resource.workflow_id, + "job_id": resource.job_id, + "data_url": data_url, + "ogc_items_url": f"/ogcapi/collections/{resource.resource_id}/items", + "links": { + "publication": f"/publications/{resource.resource_id}", + "collection": f"/ogcapi/collections/{resource.resource_id}", + "items": f"/ogcapi/collections/{resource.resource_id}/items", + }, + } + + +@router.get("/publications/{resource_id}/viewer", response_class=HTMLResponse) +def get_publication_analytics_viewer(resource_id: str, embed: bool = False) -> HTMLResponse: + """Return an 
interactive time-aware analytics viewer for one published resource.""" + config = get_publication_analytics_config(resource_id) + return HTMLResponse(_render_viewer_html(config, embed=embed)) + + +def _data_url_for_path(path_value: str) -> str: + path = Path(path_value).resolve() + downloads_root = publication_services.DOWNLOAD_DIR.resolve() + if downloads_root not in path.parents: + raise HTTPException(status_code=409, detail="Published resource path is outside mounted download storage") + relative_path = path.relative_to(downloads_root).as_posix() + return f"/data/{relative_path}" + + +def _render_viewer_html(config: dict[str, Any], *, embed: bool = False) -> str: + config_json = json.dumps(config) + shell_padding = "0" if embed else "28px 24px 40px" + shell_max_width = "100%" if embed else "1440px" + shell_margin = "0" if embed else "0 auto" + body_background = ( + "transparent" + if embed + else """ + radial-gradient(circle at top left, rgba(221, 141, 85, 0.18), transparent 32%), + radial-gradient(circle at right, rgba(65, 130, 180, 0.16), transparent 28%), + linear-gradient(180deg, #f8f4ee 0%, var(--bg) 100%) + """ + ) + hero_html = ( + "" + if embed + else f""" +
Analytics Viewer
+

{config["title"]}

+

+ Time-aware choropleth view over the published workflow output. This viewer is intentionally isolated from the + OGC/publication core so it can be swapped or removed without changing the publication contract. +

+ """ + ) + return f""" + + + + + {config["title"]} Analytics Viewer + + + + + +
+ {hero_html} +
+
+
+ + + +
+
+
+
+ +
+
+ + +""" diff --git a/src/eo_api/components/services.py b/src/eo_api/components/services.py index 5539218..58ad6df 100644 --- a/src/eo_api/components/services.py +++ b/src/eo_api/components/services.py @@ -2,6 +2,7 @@ from __future__ import annotations +from collections.abc import Mapping from typing import Any, Final import xarray as xr @@ -255,6 +256,9 @@ def temporal_aggregation_component( ) -> xr.Dataset: """Load dataset and aggregate over time.""" ds = get_data(dataset=dataset, start=start, end=end, bbox=bbox) + source_period_type = _dataset_period_type(dataset) + if source_period_type == target_period_type: + return ds return aggregate_temporal(ds=ds, period_type=target_period_type, method=method) @@ -301,3 +305,14 @@ def require_dataset(dataset_id: str) -> dict[str, Any]: if dataset is None: raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found") return dataset + + +def _dataset_period_type(dataset: Mapping[str, Any]) -> PeriodType | None: + raw_value = dataset.get("period_type") + if not isinstance(raw_value, str): + return None + normalized = raw_value.strip().lower() + try: + return PeriodType(normalized) + except ValueError: + return None diff --git a/src/eo_api/main.py b/src/eo_api/main.py index 86ce4c7..dd40d89 100644 --- a/src/eo_api/main.py +++ b/src/eo_api/main.py @@ -5,7 +5,7 @@ from fastapi.staticfiles import StaticFiles import eo_api.startup # noqa: F401 # pyright: ignore[reportUnusedImport] -from eo_api import components, data_accessor, data_manager, data_registry, system, workflows +from eo_api import analytics_viewer, components, data_accessor, data_manager, data_registry, system, workflows from eo_api.ogc import routes as ogc_routes from eo_api.ogc_api import ogc_api_app from eo_api.publications import generated_routes as publication_generated_routes @@ -28,6 +28,7 @@ app.include_router(workflows.routes.router, prefix="/workflows", tags=["Workflows"]) app.include_router(publication_routes.router, 
prefix="/publications", tags=["Publications"]) app.include_router(publication_generated_routes.router, prefix="/publications", tags=["Publications"]) +app.include_router(analytics_viewer.routes.router, prefix="/analytics", tags=["Analytics"]) app.include_router(components.routes.router, tags=["Components"]) app.include_router(ogc_routes.router, prefix="/ogcapi", tags=["OGC API"]) app.mount("/data", StaticFiles(directory="data/downloads"), name="Data") diff --git a/src/eo_api/ogc/routes.py b/src/eo_api/ogc/routes.py index f67726c..106cbd2 100644 --- a/src/eo_api/ogc/routes.py +++ b/src/eo_api/ogc/routes.py @@ -2,14 +2,24 @@ from __future__ import annotations +import json import uuid +from html import escape +from pathlib import Path from typing import Any from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request, Response +from fastapi.responses import HTMLResponse from ..publications.schemas import PublishedResourceExposure -from ..publications.services import collection_id_for_resource, get_published_resource -from ..workflows.schemas import WorkflowExecuteEnvelopeRequest, WorkflowJobStatus +from ..publications.services import ( + collection_id_for_resource, + ensure_source_dataset_publications, + get_published_resource, + get_published_resource_by_collection_id, + list_published_resources, +) +from ..workflows.schemas import ApiErrorResponse, WorkflowExecuteEnvelopeRequest, WorkflowJobStatus from ..workflows.services.definitions import load_workflow_definition from ..workflows.services.engine import execute_workflow from ..workflows.services.job_store import get_job, get_job_result, initialize_job, list_jobs @@ -21,6 +31,126 @@ _PROCESS_TITLE = "Generic DHIS2 workflow" +def _api_error( + *, + error: str, + error_code: str, + message: str, + resource_id: str | None = None, + process_id: str | None = None, + job_id: str | None = None, + status: str | None = None, +) -> dict[str, str]: + return ApiErrorResponse( + error=error, + 
error_code=error_code, + message=message, + resource_id=resource_id, + process_id=process_id, + job_id=job_id, + status=status, + ).model_dump(exclude_none=True) + + +@router.get("/collections", response_model=None) +def list_collections(request: Request, f: str | None = None) -> dict[str, Any] | HTMLResponse: + """List OGC collections directly from live publication state.""" + collections = [_collection_summary(resource, request) for resource in _ogc_resources()] + body = { + "collections": collections, + "links": [ + {"rel": "self", "type": "application/json", "href": _request_href(request, f="json")}, + {"rel": "alternate", "type": "text/html", "href": _request_href(request, f="html")}, + {"rel": "root", "type": "application/json", "href": str(request.base_url).rstrip("/") + "/ogcapi"}, + ], + } + if _wants_html(request, f): + return HTMLResponse(_render_collections_html(collections)) + return body + + +@router.get("/collections/{collection_id}", response_model=None) +def get_collection(collection_id: str, request: Request, f: str | None = None) -> dict[str, Any] | HTMLResponse: + """Return one dynamic collection document backed by publication truth.""" + resource = get_published_resource_by_collection_id(collection_id) + if resource is None or resource.exposure != PublishedResourceExposure.OGC: + raise HTTPException( + status_code=404, + detail=_api_error( + error="collection_not_found", + error_code="COLLECTION_NOT_FOUND", + message=f"Unknown collection_id '{collection_id}'", + resource_id=collection_id, + ), + ) + body = _collection_detail(resource, request) + if _wants_html(request, f): + return HTMLResponse(_render_collection_html(body)) + return body + + +@router.get("/collections/{collection_id}/items", response_model=None) +def get_collection_items( + collection_id: str, + request: Request, + limit: int = 20, + offset: int = 0, + period: str | None = None, + view: str | None = None, + f: str | None = None, +) -> dict[str, Any] | HTMLResponse: + """Return 
dynamic feature items for a GeoJSON-backed published collection.""" + resource = get_published_resource_by_collection_id(collection_id) + if resource is None or resource.exposure != PublishedResourceExposure.OGC: + raise HTTPException( + status_code=404, + detail=_api_error( + error="collection_not_found", + error_code="COLLECTION_NOT_FOUND", + message=f"Unknown collection_id '{collection_id}'", + resource_id=collection_id, + ), + ) + if resource.path is None or Path(resource.path).suffix.lower() != ".geojson": + raise HTTPException( + status_code=409, + detail=_api_error( + error="collection_items_unavailable", + error_code="COLLECTION_ITEMS_UNAVAILABLE", + message=f"Collection '{collection_id}' does not expose OGC items", + resource_id=collection_id, + ), + ) + items = _load_feature_collection(resource.path) + features = items.get("features", []) + if period is not None: + features = [feature for feature in features if feature.get("properties", {}).get("period") == period] + matched = len(features) + page = features[offset : offset + max(limit, 0)] + body = { + "type": "FeatureCollection", + "id": collection_id, + "title": resource.title, + "numberMatched": matched, + "numberReturned": len(page), + "features": page, + "links": _item_links(request, resource, limit=limit, offset=offset, matched=matched, period=period), + } + if _wants_html(request, f): + return HTMLResponse( + _render_items_html( + resource, + page, + limit=limit, + offset=offset, + matched=matched, + selected_period=period, + view_mode=view or "browse", + ) + ) + return body + + @router.get("/processes") def list_processes(request: Request) -> dict[str, Any]: """List exposed OGC processes.""" @@ -155,7 +285,15 @@ def get_ogc_job(job_id: str, request: Request) -> dict[str, Any]: """Fetch one OGC job view from the native job store.""" job = get_job(job_id) if job is None: - raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + raise HTTPException( + status_code=404, + 
detail=_api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) publication = get_published_resource(f"workflow-output-{job.job_id}") links: list[dict[str, Any]] = [ { @@ -192,16 +330,41 @@ def get_ogc_job_results(job_id: str) -> dict[str, Any]: """Return persisted results for a completed OGC job.""" job = get_job(job_id) if job is None: - raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + raise HTTPException( + status_code=404, + detail=_api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) result = get_job_result(job_id) if result is None: - raise HTTPException(status_code=409, detail={"jobID": job_id, "status": job.status}) + raise HTTPException( + status_code=409, + detail=_api_error( + error="job_result_unavailable", + error_code="JOB_RESULT_UNAVAILABLE", + message=f"Result is not available for job '{job_id}'", + job_id=job_id, + status=str(job.status), + ), + ) return result def _require_process(process_id: str) -> None: if process_id != _PROCESS_ID: - raise HTTPException(status_code=404, detail=f"Unknown process_id '{process_id}'") + raise HTTPException( + status_code=404, + detail=_api_error( + error="process_not_found", + error_code="PROCESS_NOT_FOUND", + message=f"Unknown process_id '{process_id}'", + process_id=process_id, + ), + ) def _run_async_workflow_job( @@ -225,3 +388,916 @@ def _run_async_workflow_job( def _collection_href(request: Request, collection_id: str) -> str: return str(request.base_url).rstrip("/") + f"/ogcapi/collections/{collection_id}" + + +def _ogc_resources() -> list[Any]: + ensure_source_dataset_publications() + return list_published_resources(exposure=PublishedResourceExposure.OGC) + + +def _collection_summary(resource: Any, request: Request) -> dict[str, Any]: + collection_id = collection_id_for_resource(resource) + item_type = "feature" if 
resource.path and Path(resource.path).suffix.lower() == ".geojson" else "coverage" + representation_type = "Feature Collection" if item_type == "feature" else "Coverage" + links = [ + {"rel": "self", "type": "application/json", "href": _collection_href(request, collection_id)}, + {"rel": "alternate", "type": "text/html", "href": _collection_href(request, collection_id) + "?f=html"}, + ] + if item_type == "feature": + links.extend( + [ + { + "rel": "items", + "type": "application/geo+json", + "href": _collection_href(request, collection_id) + "/items", + }, + { + "rel": "items-html", + "type": "text/html", + "href": _collection_href(request, collection_id) + "/items?f=html", + }, + ] + ) + for link in resource.links: + href = str(link.get("href", "")) + if href: + links.append( + { + "rel": str(link.get("rel", "related")), + "type": "text/html" if link.get("rel") == "analytics" else "application/json", + "href": _absolute_href(request, href), + } + ) + return { + "id": collection_id, + "title": resource.title, + "description": resource.description, + "itemType": item_type, + "representationType": representation_type, + "extent": {"spatial": {"bbox": [_bbox_for_resource(resource)]}}, + "links": links, + } + + +def _collection_detail(resource: Any, request: Request) -> dict[str, Any]: + collection = _collection_summary(resource, request) + collection["crs"] = ["http://www.opengis.net/def/crs/OGC/1.3/CRS84"] + collection["storageCrs"] = "http://www.opengis.net/def/crs/OGC/1.3/CRS84" + collection["keywords"] = _keywords_for_resource(resource) + collection["metadata"] = { + "resource_id": resource.resource_id, + "resource_class": str(resource.resource_class), + "dataset_id": resource.dataset_id, + "workflow_id": resource.workflow_id, + "job_id": resource.job_id, + } + return collection + + +def _item_links( + request: Request, resource: Any, *, limit: int, offset: int, matched: int, period: str | None +) -> list[dict[str, str]]: + collection_id = 
collection_id_for_resource(resource) + base_href = _collection_href(request, collection_id) + "/items" + period_query = f"&period={period}" if period is not None else "" + links = [ + { + "rel": "self", + "type": "application/geo+json", + "href": f"{base_href}?limit={limit}&offset={offset}{period_query}", + }, + {"rel": "collection", "type": "application/json", "href": _collection_href(request, collection_id)}, + { + "rel": "alternate", + "type": "text/html", + "href": f"{base_href}?limit={limit}&offset={offset}{period_query}&f=html", + }, + ] + if offset + limit < matched: + links.append( + { + "rel": "next", + "type": "application/geo+json", + "href": f"{base_href}?limit={limit}&offset={offset + limit}{period_query}", + } + ) + if offset > 0: + links.append( + { + "rel": "prev", + "type": "application/geo+json", + "href": f"{base_href}?limit={limit}&offset={max(0, offset - limit)}{period_query}", + } + ) + return links + + +def _load_feature_collection(path_value: str) -> dict[str, Any]: + path = Path(path_value) + if not path.exists(): + raise HTTPException( + status_code=404, + detail=_api_error( + error="published_asset_not_found", + error_code="PUBLISHED_ASSET_NOT_FOUND", + message=f"Published feature asset does not exist: {path_value}", + ), + ) + payload = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(payload, dict): + raise HTTPException( + status_code=409, + detail=_api_error( + error="published_asset_invalid", + error_code="PUBLISHED_ASSET_INVALID", + message="Published feature asset is not a GeoJSON object", + ), + ) + return payload + + +def _wants_html(request: Request, f: str | None) -> bool: + if f == "html": + return True + if f in {"json", "jsonld"}: + return False + accept = request.headers.get("accept", "") + return "text/html" in accept.lower() + + +def _request_href(request: Request, *, f: str | None = None) -> str: + href = str(request.url).split("?")[0] + return f"{href}?f={f}" if f is not None else href + + +def 
_absolute_href(request: Request, href: str) -> str: + if href.startswith("http://") or href.startswith("https://"): + return href + return str(request.base_url).rstrip("/") + href + + +def _bbox_for_resource(resource: Any) -> list[float]: + bbox = resource.metadata.get("bbox") + if isinstance(bbox, list) and len(bbox) == 4: + return [float(value) for value in bbox] + return [-180.0, -90.0, 180.0, 90.0] + + +def _keywords_for_resource(resource: Any) -> list[str]: + keywords = ["EO", "DHIS2", str(resource.resource_class), str(resource.kind)] + if resource.dataset_id is not None: + keywords.append(resource.dataset_id) + if resource.workflow_id is not None: + keywords.append(resource.workflow_id) + return keywords + + +def _render_collections_html(collections: list[dict[str, Any]]) -> str: + rows = [] + for collection in collections: + links = {link["rel"]: link["href"] for link in collection["links"]} + analytics = links.get("analytics") + analytics_html = f'Analytics' if analytics else "" + title = escape(collection["title"]) + description = escape(collection["description"]) + rows.append( + f""" + + + {title} +
{description}
+
{escape(_dataset_note_for_collection(collection))}
+ + {escape(collection["id"])} + {escape(collection["representationType"]).upper()} + + Browse + JSON + {analytics_html} + + + """ + ) + return f""" + + + + + OGC Collections + + + +
+ +
+
+ Live OGC Surface +

Collections

+

+ Live collection discovery from backend publication truth. New publications and deletions appear + here immediately without restarting the OGC surface. +

+
+ OGC Home +
+
+ +
+
+ + + + + + + + + + + {"".join(rows)} + +
CollectionIdentifierTypeActions
+
+
+ +""" + + +def _dataset_note_for_collection(collection: dict[str, Any]) -> str: + collection_id = str(collection.get("id", "")) + title = str(collection.get("title", "")) + if collection_id.startswith("workflow-output-"): + dataset_name = title.split(" output for ", 1)[-1] if " output for " in title else collection_id + return f"Source dataset: {dataset_name}" + return "" + + +def _render_collection_html(collection: dict[str, Any]) -> str: + links = {link["rel"]: link["href"] for link in collection["links"]} + analytics = links.get("analytics") + browse_items = links.get("items-html") + analytics_html = ( + f'Open Analytics Viewer' if analytics is not None else "" + ) + browse_html = f'Browse Items' if browse_items else "" + return f""" + + + + + {escape(collection["title"])} + + + +
+ + Collection Detail +

{escape(collection["title"])}

+

{escape(collection["description"])}

+
+ OGC Home + Back to Collections + Collection JSON + {browse_html} + {analytics_html} +
+
+
+

Collection Info

+
+
Identifier
{escape(collection["id"])}
+
Item type
{escape(collection["itemType"])}
+
Storage CRS
{escape(collection["storageCrs"])}
+
+
+
+

Metadata

+
{escape(json.dumps(collection["metadata"], indent=2))}
+
+
+
+ +""" + + +def _render_items_html( + resource: Any, + features: list[dict[str, Any]], + *, + limit: int, + offset: int, + matched: int, + selected_period: str | None, + view_mode: str, +) -> str: + properties = [feature.get("properties", {}) for feature in features] + columns = [] + for props in properties: + for key in props: + if key not in columns: + columns.append(key) + header_html = "".join(f"{escape(column)}" for column in columns) + collection_id = collection_id_for_resource(resource) + analytics = next((link["href"] for link in resource.links if link.get("rel") == "analytics"), None) + page_geojson = json.dumps({"type": "FeatureCollection", "features": features}) + selected_period_json = json.dumps(selected_period) + period_query = f"&period={selected_period}" if selected_period is not None else "" + next_href = ( + f"/ogcapi/collections/{collection_id}/items?" + f"limit={limit}&offset={offset + limit}{period_query}&f=html&view={escape(view_mode)}" + if offset + limit < matched + else None + ) + prev_href = ( + f"/ogcapi/collections/{collection_id}/items?" + f"limit={limit}&offset={max(0, offset - limit)}{period_query}&f=html&view={escape(view_mode)}" + if offset > 0 + else None + ) + analytics_html = f'Open Analytics Viewer' if analytics else "" + browse_active = view_mode != "analytics" + browse_href = ( + f"/ogcapi/collections/{collection_id}/items?limit={limit}&offset={offset}{period_query}&f=html&view=browse" + ) + analytics_href = ( + f"/ogcapi/collections/{collection_id}/items?limit={limit}&offset={offset}{period_query}&f=html&view=analytics" + ) + embedded_analytics_html = ( + f'' + if analytics + else '
Analytics viewer unavailable for this collection.
' + ) + return f""" + + + + + {escape(resource.title)} Items + + + + + +
+ + Collection Items +

{escape(resource.title)}

+

+ Dynamic items page over live publication state. For time-aware exploration, use the analytics viewer. +

+
+ OGC Home + Back to Collections + Back to Collection + JSON + {analytics_html} +
+ +
+ {f'Previous' if prev_href else ""} + {f'Next' if next_href else ""} +
+

Showing {offset + 1 if matched else 0} to {offset + len(features)} of {matched} items.

+ {'
' if browse_active else '
'} + +
All Periods
+
+ { + ( + f'''
+
+
+
+
Value Scale
+
+
+ 0 + 0 +
+
+
+
+
Current Page
+
+ + {header_html} + +
+ +
+
+
''' + if browse_active + else f'''
+
+ {embedded_analytics_html} +
+
''' + ) + } +
+ + +""" diff --git a/src/eo_api/publications/pygeoapi.py b/src/eo_api/publications/pygeoapi.py index 894b7b9..16c4ffc 100644 --- a/src/eo_api/publications/pygeoapi.py +++ b/src/eo_api/publications/pygeoapi.py @@ -209,11 +209,13 @@ def _pygeoapi_links(resource: PublishedResource) -> list[dict[str, str]]: rel = str(link.get("rel", "related")) if href == "": continue + link_type = "text/html" if rel == "analytics" else "application/json" + title = "Analytics Viewer" if rel == "analytics" else rel.replace("-", " ").title() links.append( { - "type": "application/json", + "type": link_type, "rel": rel, - "title": rel.replace("-", " ").title(), + "title": title, "href": _absolute_ogc_href(href), } ) diff --git a/src/eo_api/publications/services.py b/src/eo_api/publications/services.py index 59db4cf..af7d01f 100644 --- a/src/eo_api/publications/services.py +++ b/src/eo_api/publications/services.py @@ -95,6 +95,7 @@ def register_workflow_output_publication( {"rel": "job", "href": f"/workflows/jobs/{response.run_id}"}, {"rel": "job-result", "href": f"/workflows/jobs/{response.run_id}/result"}, {"rel": "collection", "href": f"/ogcapi/collections/{resource_id}"}, + {"rel": "analytics", "href": f"/analytics/publications/{resource_id}/viewer"}, ], ) _write_resource(record) diff --git a/src/eo_api/workflows/routes.py b/src/eo_api/workflows/routes.py index 43bb65a..c359603 100644 --- a/src/eo_api/workflows/routes.py +++ b/src/eo_api/workflows/routes.py @@ -7,11 +7,13 @@ from ..publications.schemas import PublishedResourceExposure from ..publications.services import collection_id_for_resource, get_published_resource from .schemas import ( + ApiErrorResponse, WorkflowAssemblyExecuteRequest, WorkflowCatalogItem, WorkflowCatalogResponse, WorkflowExecuteEnvelopeRequest, WorkflowExecuteResponse, + WorkflowJobCleanupResponse, WorkflowJobListResponse, WorkflowJobRecord, WorkflowJobStatus, @@ -21,19 +23,47 @@ ) from .services.definitions import list_workflow_definitions, 
load_workflow_definition from .services.engine import execute_workflow, validate_workflow_steps -from .services.job_store import delete_job, get_job, get_job_result, get_job_trace, list_jobs +from .services.job_store import cleanup_jobs, delete_job, get_job, get_job_result, get_job_trace, list_jobs from .services.simple_mapper import normalize_simple_request router = APIRouter() +def _api_error( + *, + error: str, + error_code: str, + message: str, + resource_id: str | None = None, + process_id: str | None = None, + job_id: str | None = None, + status: str | None = None, +) -> dict[str, str]: + return ApiErrorResponse( + error=error, + error_code=error_code, + message=message, + resource_id=resource_id, + process_id=process_id, + job_id=job_id, + status=status, + ).model_dump(exclude_none=True) + + @router.get("", response_model=WorkflowCatalogResponse) def list_workflows() -> WorkflowCatalogResponse: """List all allowlisted workflow definitions.""" try: definitions = list_workflow_definitions() except ValueError as exc: - raise HTTPException(status_code=500, detail=str(exc)) from exc + raise HTTPException( + status_code=500, + detail=_api_error( + error="workflow_catalog_unavailable", + error_code="CATALOG_UNAVAILABLE", + message=str(exc), + ), + ) from exc return WorkflowCatalogResponse( workflows=[ WorkflowCatalogItem( @@ -66,7 +96,15 @@ def get_workflow_job(job_id: str, request: Request) -> WorkflowJobRecord: """Fetch one persisted workflow job.""" job = get_job(job_id) if job is None: - raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + raise HTTPException( + status_code=404, + detail=_api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) links: list[dict[str, str]] = [ {"rel": "self", "href": str(request.url_for("get_workflow_job", job_id=job_id))}, {"rel": "result", "href": str(request.url_for("get_workflow_job_result", job_id=job_id))}, @@ -81,6 +119,12 @@ 
def get_workflow_job(job_id: str, request: Request) -> WorkflowJobRecord: "href": f"{str(request.base_url).rstrip('/')}/ogcapi/collections/{collection_id}", } ) + links.append( + { + "rel": "analytics", + "href": f"{str(request.base_url).rstrip('/')}/analytics/publications/{publication.resource_id}/viewer", + } + ) return job.model_copy(update={"links": links}) @@ -89,10 +133,27 @@ def get_workflow_job_result(job_id: str) -> dict[str, Any]: """Fetch persisted workflow results for a completed job.""" job = get_job(job_id) if job is None: - raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + raise HTTPException( + status_code=404, + detail=_api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) result = get_job_result(job_id) if result is None: - raise HTTPException(status_code=409, detail={"job_id": job_id, "status": job.status}) + raise HTTPException( + status_code=409, + detail=_api_error( + error="job_result_unavailable", + error_code="JOB_RESULT_UNAVAILABLE", + message=f"Result is not available for job '{job_id}'", + job_id=job_id, + status=str(job.status), + ), + ) return result @@ -101,10 +162,27 @@ def get_workflow_job_trace(job_id: str) -> dict[str, Any]: """Fetch persisted workflow trace for a completed or failed job.""" job = get_job(job_id) if job is None: - raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + raise HTTPException( + status_code=404, + detail=_api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) trace = get_job_trace(job_id) if trace is None: - raise HTTPException(status_code=409, detail={"job_id": job_id, "status": job.status}) + raise HTTPException( + status_code=409, + detail=_api_error( + error="job_trace_unavailable", + error_code="JOB_TRACE_UNAVAILABLE", + message=f"Trace is not available for job '{job_id}'", + job_id=job_id, + 
status=str(job.status), + ), + ) return trace @@ -113,10 +191,43 @@ def delete_workflow_job(job_id: str) -> dict[str, Any]: """Delete one workflow job and cascade run-owned derived artifacts.""" deleted = delete_job(job_id) if deleted is None: - raise HTTPException(status_code=404, detail=f"Unknown job_id '{job_id}'") + raise HTTPException( + status_code=404, + detail=_api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) return deleted +@router.post("/jobs/cleanup", response_model=WorkflowJobCleanupResponse) +def cleanup_workflow_jobs( + dry_run: bool = True, + keep_latest: int | None = None, + older_than_hours: int | None = None, +) -> WorkflowJobCleanupResponse: + """Apply retention policy to terminal jobs and derived artifacts.""" + try: + result = cleanup_jobs( + dry_run=dry_run, + keep_latest=keep_latest, + older_than_hours=older_than_hours, + ) + except ValueError as exc: + raise HTTPException( + status_code=422, + detail=_api_error( + error="cleanup_policy_invalid", + error_code="CLEANUP_POLICY_INVALID", + message=str(exc), + ), + ) from exc + return WorkflowJobCleanupResponse.model_validate(result) + + @router.post("/dhis2-datavalue-set", response_model=WorkflowExecuteResponse) def run_dhis2_datavalue_set_workflow(payload: WorkflowExecuteEnvelopeRequest) -> WorkflowExecuteResponse: """Run workflow from a single flat request payload.""" diff --git a/src/eo_api/workflows/schemas.py b/src/eo_api/workflows/schemas.py index 72e574d..6f95077 100644 --- a/src/eo_api/workflows/schemas.py +++ b/src/eo_api/workflows/schemas.py @@ -188,6 +188,40 @@ class WorkflowJobListResponse(BaseModel): jobs: list[WorkflowJobRecord] +class WorkflowJobCleanupCandidate(BaseModel): + """One terminal job selected by retention policy.""" + + job_id: str + status: WorkflowJobStatus + created_at: str + workflow_id: str + dataset_id: str + + +class WorkflowJobCleanupResponse(BaseModel): + """Result of applying 
or previewing a workflow job retention policy.""" + + dry_run: bool + keep_latest: int | None = None + older_than_hours: int | None = None + candidate_count: int + deleted_count: int + candidates: list[WorkflowJobCleanupCandidate] + deleted_job_ids: list[str] + + +class ApiErrorResponse(BaseModel): + """Stable API error envelope.""" + + error: str + error_code: str + message: str + resource_id: str | None = None + process_id: str | None = None + job_id: str | None = None + status: str | None = None + + class WorkflowCatalogItem(BaseModel): """Discoverable workflow definition summary.""" diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py index bf00407..c1110de 100644 --- a/src/eo_api/workflows/services/engine.py +++ b/src/eo_api/workflows/services/engine.py @@ -5,6 +5,7 @@ import os import time from collections.abc import Callable +from dataclasses import dataclass from pathlib import Path from typing import Any, Literal @@ -42,6 +43,43 @@ def __init__( self.status_code = status_code +@dataclass +class WorkflowArtifacts: + """Typed workflow artifact handoff between components.""" + + features: dict[str, Any] | None = None + bbox: list[float] | None = None + temporal_dataset: Any | None = None + records: list[dict[str, Any]] | None = None + data_value_set: dict[str, Any] | None = None + output_file: str | None = None + + def require_features(self) -> dict[str, Any]: + if self.features is None: + raise RuntimeError("Workflow definition missing prerequisite for 'features'") + return self.features + + def require_bbox(self) -> list[float]: + if self.bbox is None: + raise RuntimeError("Workflow definition missing prerequisite for 'bbox'") + return self.bbox + + def require_records(self) -> list[dict[str, Any]]: + if self.records is None: + raise RuntimeError("Workflow definition missing prerequisite for 'records'") + return self.records + + def require_data_value_set(self) -> dict[str, Any]: + if self.data_value_set is None: + 
raise RuntimeError("Workflow definition missing prerequisite for 'data_value_set'") + return self.data_value_set + + def require_output_file(self) -> str: + if self.output_file is None: + raise RuntimeError("Workflow definition missing prerequisite for 'output_file'") + return self.output_file + + def execute_workflow( request: WorkflowExecuteRequest, *, @@ -60,7 +98,7 @@ def execute_workflow( if dataset is None: raise HTTPException(status_code=404, detail=f"Dataset '{request.dataset_id}' not found") - context: dict[str, Any] = {} + artifacts = WorkflowArtifacts() try: if workflow_definition is not None: @@ -88,12 +126,12 @@ def execute_workflow( request=request, request_params=request_params, dataset=dataset, - context=context, + artifacts=artifacts, ) - features = _require_context(context, "features") - bbox = _require_context(context, "bbox") - data_value_set = _require_context(context, "data_value_set") - output_file = _require_context(context, "output_file") + features = artifacts.require_features() + bbox = artifacts.require_bbox() + data_value_set = artifacts.require_data_value_set() + output_file = artifacts.require_output_file() run_log_file = persist_run_log( run_id=runtime.run_id, request=request, @@ -128,7 +166,7 @@ def execute_workflow( response=response, request=request, publication=workflow.publication, - context=context, + artifacts=artifacts, ) register_workflow_output_publication( response=response, @@ -241,12 +279,12 @@ def _build_publication_artifact( response: WorkflowExecuteResponse, request: WorkflowExecuteRequest, publication: WorkflowPublicationPolicy, - context: dict[str, Any], + artifacts: WorkflowArtifacts, ) -> tuple[str, str]: """Build the publication-facing artifact for a publishable workflow output.""" if publication.intent.value == "feature_collection": - features = _require_context(context, "features") - records = _require_context(context, "records") + features = artifacts.require_features() + records = artifacts.require_records() 
path = build_feature_collection_asset( dataset_id=response.dataset_id, features=features, @@ -279,7 +317,7 @@ def _execute_workflow_steps( request: WorkflowExecuteRequest, request_params: dict[str, Any] | None, dataset: dict[str, Any], - context: dict[str, Any], + artifacts: WorkflowArtifacts, ) -> None: """Execute workflow components using declarative YAML step order.""" executors: dict[str, StepExecutor] = { @@ -317,7 +355,7 @@ def _execute_workflow_steps( runtime=runtime, request=request, dataset=dataset, - context=context, + artifacts=artifacts, step_config=step_config, ) except Exception as exc: @@ -337,7 +375,7 @@ def _execute_workflow_steps( status_code=500, ) from exc - context.update(updates) + _apply_artifact_updates(artifacts, updates) def validate_workflow_steps( @@ -373,10 +411,10 @@ def _run_feature_source( runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - context: dict[str, Any], + artifacts: WorkflowArtifacts, step_config: dict[str, Any], ) -> dict[str, Any]: - del dataset, context + del dataset, artifacts execution_mode = str(step_config.get("execution_mode", "local")).lower() if execution_mode == "remote": features, bbox = runtime.run( @@ -402,7 +440,7 @@ def _run_download_dataset( runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - context: dict[str, Any], + artifacts: WorkflowArtifacts, step_config: dict[str, Any], ) -> dict[str, Any]: execution_mode = str(step_config.get("execution_mode", "local")).lower() @@ -411,7 +449,7 @@ def _run_download_dataset( overwrite = request.overwrite country_code = request.country_code - bbox = _require_context(context, "bbox") + bbox = artifacts.require_bbox() if execution_mode == "remote": remote_url = step_config.get("remote_url") if not isinstance(remote_url, str) or not remote_url: @@ -452,7 +490,7 @@ def _run_temporal_aggregation( runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - context: dict[str, 
Any], + artifacts: WorkflowArtifacts, step_config: dict[str, Any], ) -> dict[str, Any]: target_period_type = request.temporal_aggregation.target_period_type @@ -466,7 +504,7 @@ def _run_temporal_aggregation( dataset_id=request.dataset_id, start=request.start, end=request.end, - bbox=_require_context(context, "bbox"), + bbox=artifacts.require_bbox(), target_period_type=target_period_type.value, method=method.value, timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), @@ -480,7 +518,7 @@ def _run_temporal_aggregation( dataset=dataset, start=request.start, end=request.end, - bbox=_require_context(context, "bbox"), + bbox=artifacts.require_bbox(), target_period_type=target_period_type, method=method, ) @@ -492,13 +530,13 @@ def _run_spatial_aggregation( runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - context: dict[str, Any], + artifacts: WorkflowArtifacts, step_config: dict[str, Any], ) -> dict[str, Any]: method = request.spatial_aggregation.method feature_id_property = request.dhis2.org_unit_property execution_mode = str(step_config.get("execution_mode", "local")).lower() - temporal_dataset = context.get("temporal_dataset") + temporal_dataset = artifacts.temporal_dataset if execution_mode == "remote": if temporal_dataset is not None: raise ValueError( @@ -512,7 +550,7 @@ def _run_spatial_aggregation( dataset_id=request.dataset_id, start=request.start, end=request.end, - bbox=_require_context(context, "bbox"), + bbox=artifacts.require_bbox(), feature_source=request.feature_source.model_dump(mode="json"), method=method.value, feature_id_property=feature_id_property, @@ -527,8 +565,8 @@ def _run_spatial_aggregation( dataset=dataset, start=request.start, end=request.end, - bbox=_require_context(context, "bbox"), - features=_require_context(context, "features"), + bbox=artifacts.require_bbox(), + features=artifacts.require_features(), method=method, feature_id_property=feature_id_property, 
aggregated_dataset=temporal_dataset, @@ -541,7 +579,7 @@ def _run_build_datavalueset( runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - context: dict[str, Any], + artifacts: WorkflowArtifacts, step_config: dict[str, Any], ) -> dict[str, Any]: del dataset @@ -554,7 +592,7 @@ def _run_build_datavalueset( remote_url=str(step_config["remote_url"]), dataset_id=request.dataset_id, period_type=period_type.value, - records=_require_context(context, "records"), + records=artifacts.require_records(), dhis2=request.dhis2.model_dump(mode="json"), timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), retries=int(step_config.get("remote_retries", 1)), @@ -564,7 +602,7 @@ def _run_build_datavalueset( data_value_set, output_file = runtime.run( "build_datavalueset", component_services.build_datavalueset_component, - records=_require_context(context, "records"), + records=artifacts.require_records(), dataset_id=request.dataset_id, period_type=period_type, dhis2=request.dhis2, @@ -572,11 +610,12 @@ def _run_build_datavalueset( return {"data_value_set": data_value_set, "output_file": output_file} -def _require_context(context: dict[str, Any], key: str) -> Any: - """Return required context value or raise a clear orchestration error.""" - if key not in context: - raise RuntimeError(f"Workflow definition missing prerequisite for '{key}'") - return context[key] +def _apply_artifact_updates(artifacts: WorkflowArtifacts, updates: dict[str, Any]) -> None: + """Apply validated component outputs to the typed artifact handoff.""" + for key, value in updates.items(): + if not hasattr(artifacts, key): + raise RuntimeError(f"Unsupported workflow artifact '{key}'") + setattr(artifacts, key, value) def _resolve_step_config(config: dict[str, Any], request_params: dict[str, Any]) -> dict[str, Any]: diff --git a/src/eo_api/workflows/services/job_store.py b/src/eo_api/workflows/services/job_store.py index 0295364..34fe1ac 100644 --- 
a/src/eo_api/workflows/services/job_store.py +++ b/src/eo_api/workflows/services/job_store.py @@ -212,6 +212,61 @@ def delete_job(job_id: str) -> dict[str, Any] | None: } +def cleanup_jobs( + *, + dry_run: bool, + keep_latest: int | None = None, + older_than_hours: int | None = None, +) -> dict[str, Any]: + """Apply retention policy to terminal jobs and their run-owned artifacts.""" + if keep_latest is not None and keep_latest < 0: + raise ValueError("keep_latest must be >= 0") + if older_than_hours is not None and older_than_hours < 0: + raise ValueError("older_than_hours must be >= 0") + + terminal_statuses = { + WorkflowJobStatus.SUCCESSFUL, + WorkflowJobStatus.FAILED, + WorkflowJobStatus.DISMISSED, + } + terminal_jobs = [job for job in list_jobs() if job.status in terminal_statuses] + candidates = terminal_jobs + + if older_than_hours is not None: + cutoff = dt.datetime.now(dt.timezone.utc) - dt.timedelta(hours=older_than_hours) + candidates = [job for job in candidates if _parse_iso8601(job.created_at) <= cutoff] + + if keep_latest is not None: + protected_ids = {job.job_id for job in terminal_jobs[:keep_latest]} + candidates = [job for job in candidates if job.job_id not in protected_ids] + + deleted_job_ids: list[str] = [] + if not dry_run: + for job in candidates: + deleted = delete_job(job.job_id) + if deleted is not None: + deleted_job_ids.append(job.job_id) + + return { + "dry_run": dry_run, + "keep_latest": keep_latest, + "older_than_hours": older_than_hours, + "candidate_count": len(candidates), + "deleted_count": len(deleted_job_ids), + "candidates": [ + { + "job_id": job.job_id, + "status": job.status, + "created_at": job.created_at, + "workflow_id": job.workflow_id, + "dataset_id": job.dataset_id, + } + for job in candidates + ], + "deleted_job_ids": deleted_job_ids, + } + + def _require_job(job_id: str) -> WorkflowJobStoredRecord: record = get_stored_job(job_id) if record is None: @@ -236,6 +291,10 @@ def _utc_now() -> str: return 
dt.datetime.now(dt.timezone.utc).isoformat() +def _parse_iso8601(value: str) -> dt.datetime: + return dt.datetime.fromisoformat(value) + + def _to_public_job_record(record: WorkflowJobStoredRecord) -> WorkflowJobRecord: data = record.model_dump(mode="json") data.pop("run_id", None) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index a3ea969..aa600af 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -11,10 +11,17 @@ from fastapi.routing import APIRoute from fastapi.testclient import TestClient +from eo_api.components import services as component_services from eo_api.main import app from eo_api.publications import pygeoapi as publication_pygeoapi from eo_api.publications import services as publication_services -from eo_api.workflows.schemas import WorkflowExecuteRequest, WorkflowExecuteResponse, WorkflowRequest +from eo_api.workflows.schemas import ( + AggregationMethod, + PeriodType, + WorkflowExecuteRequest, + WorkflowExecuteResponse, + WorkflowRequest, +) from eo_api.workflows.services import engine, job_store, run_logs from eo_api.workflows.services.definitions import WorkflowDefinition, load_workflow_definition from eo_api.workflows.services.simple_mapper import normalize_simple_request @@ -76,13 +83,21 @@ def test_workflow_endpoint_exists_once() -> None: for route in app.routes if isinstance(route, APIRoute) and route.path.startswith("/workflows") and "POST" in route.methods } - assert workflow_routes == {"/workflows/dhis2-datavalue-set", "/workflows/execute", "/workflows/validate"} + assert workflow_routes == { + "/workflows/dhis2-datavalue-set", + "/workflows/execute", + "/workflows/jobs/cleanup", + "/workflows/validate", + } def test_ogc_process_routes_exist() -> None: ogc_routes = { route.path for route in app.routes if isinstance(route, APIRoute) and route.path.startswith("/ogcapi") } + assert "/ogcapi/collections" in ogc_routes + assert "/ogcapi/collections/{collection_id}" in ogc_routes + assert 
"/ogcapi/collections/{collection_id}/items" in ogc_routes assert "/ogcapi/processes" in ogc_routes assert "/ogcapi/processes/{process_id}" in ogc_routes assert "/ogcapi/processes/{process_id}/execution" in ogc_routes @@ -102,6 +117,14 @@ def test_publication_generated_pygeoapi_routes_exist() -> None: assert "/publications/pygeoapi/materialize" in publication_routes +def test_analytics_viewer_routes_exist() -> None: + analytics_routes = { + route.path for route in app.routes if isinstance(route, APIRoute) and route.path.startswith("/analytics") + } + assert "/analytics/publications/{resource_id}" in analytics_routes + assert "/analytics/publications/{resource_id}/viewer" in analytics_routes + + def test_pygeoapi_runtime_env_points_to_generated_documents() -> None: config_path = os.environ.get("PYGEOAPI_CONFIG") openapi_path = os.environ.get("PYGEOAPI_OPENAPI") @@ -250,6 +273,45 @@ def test_workflow_endpoint_validates_required_fields(client: TestClient) -> None assert response.status_code == 422 +def test_workflow_job_result_missing_uses_typed_error_envelope(client: TestClient) -> None: + response = client.get("/workflows/jobs/does-not-exist/result") + assert response.status_code == 404 + body = response.json()["detail"] + assert body["error"] == "job_not_found" + assert body["error_code"] == "JOB_NOT_FOUND" + assert body["job_id"] == "does-not-exist" + + +def test_ogc_collection_missing_uses_typed_error_envelope(client: TestClient) -> None: + response = client.get("/ogcapi/collections/does-not-exist", params={"f": "json"}) + assert response.status_code == 404 + body = response.json()["detail"] + assert body["error"] == "collection_not_found" + assert body["error_code"] == "COLLECTION_NOT_FOUND" + assert body["resource_id"] == "does-not-exist" + + +def test_ogc_job_results_unavailable_uses_typed_error_envelope( + client: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setattr("eo_api.ogc.routes.execute_workflow", lambda *args, **kwargs: None) + 
+ response = client.post( + "/ogcapi/processes/generic-dhis2-workflow/execution", + headers={"Prefer": "respond-async"}, + json=_valid_public_payload(), + ) + assert response.status_code == 202 + job_id = response.json()["jobID"] + + result_response = client.get(f"/ogcapi/jobs/{job_id}/results") + assert result_response.status_code == 409 + body = result_response.json()["detail"] + assert body["error"] == "job_result_unavailable" + assert body["error_code"] == "JOB_RESULT_UNAVAILABLE" + assert body["job_id"] == job_id + + def test_workflow_endpoint_accepts_simplified_payload(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: normalized = WorkflowExecuteRequest.model_validate( { @@ -501,6 +563,7 @@ def test_workflow_job_endpoints_return_persisted_result( assert links["result"].endswith(f"/workflows/jobs/{run_id}/result") assert links["trace"].endswith(f"/workflows/jobs/{run_id}/trace") assert links["collection"].endswith(f"/ogcapi/collections/workflow-output-{run_id}") + assert links["analytics"].endswith(f"/analytics/publications/workflow-output-{run_id}/viewer") assert "result" not in job_body results_response = client.get(f"/workflows/jobs/{run_id}/result") @@ -576,6 +639,57 @@ def test_delete_workflow_job_cascades_derived_artifacts( assert publication_response.status_code == 404 +def test_cleanup_workflow_jobs_dry_run_lists_terminal_candidates_without_deleting( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + first = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + second = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert first.status_code == 200 + assert second.status_code == 200 + first_job_id = first.json()["run_id"] + second_job_id = second.json()["run_id"] + + cleanup_response = client.post("/workflows/jobs/cleanup", params={"dry_run": "true", "keep_latest": 1}) + assert cleanup_response.status_code == 200 + body = 
cleanup_response.json() + assert body["dry_run"] is True + assert body["candidate_count"] == 1 + assert body["deleted_count"] == 0 + assert body["candidates"][0]["job_id"] == first_job_id + assert body["deleted_job_ids"] == [] + + assert client.get(f"/workflows/jobs/{first_job_id}").status_code == 200 + assert client.get(f"/workflows/jobs/{second_job_id}").status_code == 200 + + +def test_cleanup_workflow_jobs_applies_retention_and_cascades_deletion( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + first = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + second = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert first.status_code == 200 + assert second.status_code == 200 + first_job_id = first.json()["run_id"] + second_job_id = second.json()["run_id"] + + apply_response = client.post("/workflows/jobs/cleanup", params={"dry_run": "false", "keep_latest": 1}) + assert apply_response.status_code == 200 + body = apply_response.json() + assert body["dry_run"] is False + assert body["deleted_count"] == 1 + assert body["deleted_job_ids"] == [first_job_id] + + assert client.get(f"/workflows/jobs/{first_job_id}").status_code == 404 + assert client.get(f"/publications/workflow-output-{first_job_id}").status_code == 404 + assert client.get(f"/workflows/jobs/{second_job_id}").status_code == 200 + + def test_ogc_async_execution_creates_job_and_results( client: TestClient, monkeypatch: pytest.MonkeyPatch, @@ -681,6 +795,8 @@ def test_workflow_success_registers_derived_publication( assert derived["asset_format"] == "geojson" assert derived["path"].endswith(".geojson") assert derived["metadata"]["native_output_file"].endswith(".json") + analytics_link = next(link for link in derived["links"] if link["rel"] == "analytics") + assert analytics_link["href"] == f"/analytics/publications/workflow-output-{run_id}/viewer" geojson = 
Path(derived["path"]).read_text(encoding="utf-8") assert '"org_unit_name"' in geojson assert '"period": "2024-01"' in geojson @@ -688,6 +804,84 @@ def test_workflow_success_registers_derived_publication( assert '"dataset_id"' not in geojson +def test_dynamic_ogc_collection_routes_reflect_new_publication_without_restart( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + collection_id = f"workflow-output-{run_id}" + + collections_response = client.get("/ogcapi/collections", params={"f": "json"}) + assert collections_response.status_code == 200 + collections = collections_response.json()["collections"] + derived = next(item for item in collections if item["id"] == collection_id) + assert derived["itemType"] == "feature" + + detail_response = client.get(f"/ogcapi/collections/{collection_id}", params={"f": "json"}) + assert detail_response.status_code == 200 + detail = detail_response.json() + detail_links = {link["rel"]: link["href"] for link in detail["links"]} + assert detail["id"] == collection_id + assert detail_links["analytics"].endswith(f"/analytics/publications/{collection_id}/viewer") + + items_response = client.get(f"/ogcapi/collections/{collection_id}/items", params={"f": "json", "limit": 5}) + assert items_response.status_code == 200 + items = items_response.json() + assert items["type"] == "FeatureCollection" + assert items["numberReturned"] == 1 + feature_props = items["features"][0]["properties"] + assert set(feature_props) == {"org_unit", "org_unit_name", "period", "value"} + + +def test_dynamic_ogc_collection_routes_drop_deleted_publication_without_restart( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", 
json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + collection_id = f"workflow-output-{run_id}" + + before_delete = client.get(f"/ogcapi/collections/{collection_id}", params={"f": "json"}) + assert before_delete.status_code == 200 + + delete_response = client.delete(f"/workflows/jobs/{run_id}") + assert delete_response.status_code == 200 + + after_delete = client.get(f"/ogcapi/collections/{collection_id}", params={"f": "json"}) + assert after_delete.status_code == 404 + + +def test_analytics_viewer_config_and_html_for_publication( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + _patch_successful_execution(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + resource_id = f"workflow-output-{run_id}" + + config_response = client.get(f"/analytics/publications/{resource_id}") + assert config_response.status_code == 200 + config = config_response.json() + assert config["resource_id"] == resource_id + assert config["data_url"].startswith("/data/") + assert config["links"]["collection"] == f"/ogcapi/collections/{resource_id}" + + viewer_response = client.get(f"/analytics/publications/{resource_id}/viewer") + assert viewer_response.status_code == 200 + assert "Time-aware choropleth view" in viewer_response.text + assert resource_id in viewer_response.text + + def test_workflow_with_publication_disabled_does_not_register_derived_publication( client: TestClient, monkeypatch: pytest.MonkeyPatch, @@ -890,6 +1084,10 @@ def test_generated_pygeoapi_config_includes_geojson_derived_resource( assert derived["providers"][0]["name"] == "GeoJSON" assert derived["providers"][0]["type"] == "feature" assert derived["providers"][0]["data"].endswith(".geojson") + analytics_link = next(link for link in derived["links"] if link["rel"] == "analytics") + assert analytics_link["type"] == 
"text/html" + assert analytics_link["title"] == "Analytics Viewer" + assert analytics_link["href"].endswith(f"/analytics/publications/workflow-output-{run_id}/viewer") def test_materialize_generated_pygeoapi_documents_writes_files( @@ -946,6 +1144,36 @@ def test_component_spatial_aggregation_serializes_numpy_datetime64( assert body["records"][0]["time"] == "2024-01-01T00:00:00" +def test_temporal_aggregation_component_passes_through_matching_period_type( + monkeypatch: pytest.MonkeyPatch, +) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]]])}, + coords={"time": ["2024-01-01"], "lat": [0], "lon": [0]}, + ) + aggregate_called = {"value": False} + + monkeypatch.setattr(component_services, "get_data", lambda **kwargs: ds) + + def _aggregate_temporal(**kwargs: Any) -> xr.Dataset: + aggregate_called["value"] = True + return ds + + monkeypatch.setattr(component_services, "aggregate_temporal", _aggregate_temporal) + + result = component_services.temporal_aggregation_component( + dataset={"id": "chirps3_precipitation_daily", "variable": "precip", "period_type": "daily"}, + start="2024-01-01", + end="2024-01-31", + bbox=None, + target_period_type=PeriodType.DAILY, + method=AggregationMethod.SUM, + ) + + assert result is ds + assert aggregate_called["value"] is False + + def test_engine_orchestrates_components(monkeypatch: pytest.MonkeyPatch) -> None: request = { "dataset_id": "chirps3_precipitation_daily", From c951c07c2f6f95e7754fe7bd8062f168b0c915c3 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Thu, 19 Mar 2026 14:16:10 +0100 Subject: [PATCH 08/15] Tighten hybrid OGC publication boundary --- src/eo_api/analytics_viewer/routes.py | 52 +- src/eo_api/data_manager/services/constants.py | 56 +- src/eo_api/data_registry/routes.py | 11 +- src/eo_api/ogc/routes.py | 1147 +++-------------- src/eo_api/ogc_api/__init__.py | 49 +- src/eo_api/publications/pygeoapi.py | 52 +- .../collections/collection.html | 291 +++++ 
src/eo_api/publications/routes.py | 11 +- src/eo_api/publications/services.py | 71 +- src/eo_api/shared/api_errors.py | 83 ++ src/eo_api/workflows/routes.py | 53 +- src/eo_api/workflows/schemas.py | 12 - src/eo_api/workflows/services/engine.py | 52 +- src/eo_api/workflows/services/job_store.py | 2 +- .../workflows/services/simple_mapper.py | 20 +- tests/test_workflows.py | 143 +- 16 files changed, 1031 insertions(+), 1074 deletions(-) create mode 100644 src/eo_api/publications/pygeoapi_templates/collections/collection.html create mode 100644 src/eo_api/shared/api_errors.py diff --git a/src/eo_api/analytics_viewer/routes.py b/src/eo_api/analytics_viewer/routes.py index 12785f6..fcf8203 100644 --- a/src/eo_api/analytics_viewer/routes.py +++ b/src/eo_api/analytics_viewer/routes.py @@ -11,6 +11,7 @@ from ..publications import services as publication_services from ..publications.schemas import PublishedResourceKind +from ..shared.api_errors import api_error router = APIRouter() @@ -20,11 +21,24 @@ def get_publication_analytics_config(resource_id: str) -> dict[str, Any]: """Return viewer configuration for one published resource.""" resource = publication_services.get_published_resource(resource_id) if resource is None: - raise HTTPException(status_code=404, detail=f"Unknown resource_id '{resource_id}'") + raise HTTPException( + status_code=404, + detail=api_error( + error="published_resource_not_found", + error_code="PUBLISHED_RESOURCE_NOT_FOUND", + message=f"Unknown resource_id '{resource_id}'", + resource_id=resource_id, + ), + ) if resource.kind != PublishedResourceKind.FEATURE_COLLECTION or resource.path is None: raise HTTPException( status_code=409, - detail=f"Resource '{resource_id}' is not a feature collection viewer target", + detail=api_error( + error="analytics_target_invalid", + error_code="ANALYTICS_TARGET_INVALID", + message=f"Resource '{resource_id}' is not a feature collection viewer target", + resource_id=resource_id, + ), ) data_url = 
_data_url_for_path(resource.path) @@ -38,6 +52,7 @@ def get_publication_analytics_config(resource_id: str) -> dict[str, Any]: "data_url": data_url, "ogc_items_url": f"/ogcapi/collections/{resource.resource_id}/items", "links": { + "ogc_home": "/ogcapi", "publication": f"/publications/{resource.resource_id}", "collection": f"/ogcapi/collections/{resource.resource_id}", "items": f"/ogcapi/collections/{resource.resource_id}/items", @@ -56,7 +71,14 @@ def _data_url_for_path(path_value: str) -> str: path = Path(path_value).resolve() downloads_root = publication_services.DOWNLOAD_DIR.resolve() if downloads_root not in path.parents: - raise HTTPException(status_code=409, detail="Published resource path is outside mounted download storage") + raise HTTPException( + status_code=409, + detail=api_error( + error="published_asset_invalid", + error_code="PUBLISHED_ASSET_INVALID", + message="Published resource path is outside mounted download storage", + ), + ) relative_path = path.relative_to(downloads_root).as_posix() return f"/data/{relative_path}" @@ -79,6 +101,12 @@ def _render_viewer_html(config: dict[str, Any], *, embed: bool = False) -> str: "" if embed else f""" +
Analytics Viewer

{config["title"]}

@@ -139,6 +167,24 @@ def _render_viewer_html(config: dict[str, Any], *, embed: bool = False) -> str: letter-spacing: 0.08em; text-transform: uppercase; }} + .topnav {{ + display: flex; + flex-wrap: wrap; + gap: 10px; + margin-bottom: 14px; + }} + .topnav a {{ + display: inline-flex; + align-items: center; + padding: 8px 12px; + border-radius: 999px; + background: rgba(255, 255, 255, 0.78); + border: 1px solid var(--line); + color: var(--ink); + text-decoration: none; + font-size: 0.92rem; + font-weight: 600; + }} h1 {{ margin: 14px 0 8px; font-size: clamp(2rem, 4vw, 3.6rem); diff --git a/src/eo_api/data_manager/services/constants.py b/src/eo_api/data_manager/services/constants.py index 32bd880..f9b6a86 100644 --- a/src/eo_api/data_manager/services/constants.py +++ b/src/eo_api/data_manager/services/constants.py @@ -1,17 +1,61 @@ -"""Module-level constants loaded at import time (DHIS2 org units, bbox, env config).""" +"""Module-level constants for downloader defaults. + +This module must stay import-safe. DHIS2-backed defaults are best-effort only, +so startup should not fail when DHIS2 is temporarily unavailable. 
+""" import json +import logging import os import geopandas as gpd from ...shared.dhis2_adapter import create_client, get_org_units_geojson -# load geojson from dhis2 at startup and keep in-memory -# TODO: should probably save to file instead -client = create_client() -ORG_UNITS_GEOJSON = get_org_units_geojson(client, level=2) -BBOX = list(map(float, gpd.read_file(json.dumps(ORG_UNITS_GEOJSON)).total_bounds)) +LOGGER = logging.getLogger(__name__) +_DEFAULT_BBOX = [-180.0, -90.0, 180.0, 90.0] + + +def _bbox_from_env() -> list[float] | None: + raw_bbox = os.getenv("EO_API_DEFAULT_BBOX") + if not raw_bbox: + return None + parts = [part.strip() for part in raw_bbox.split(",")] + if len(parts) != 4: + LOGGER.warning("Ignoring EO_API_DEFAULT_BBOX with invalid value: %s", raw_bbox) + return None + try: + return [float(part) for part in parts] + except ValueError: + LOGGER.warning("Ignoring EO_API_DEFAULT_BBOX with non-numeric values: %s", raw_bbox) + return None + + +def _load_org_unit_defaults() -> tuple[dict[str, object], list[float]]: + try: + client = create_client() + org_units_geojson = get_org_units_geojson(client, level=2) + bbox = list(map(float, gpd.read_file(json.dumps(org_units_geojson)).total_bounds)) + return org_units_geojson, bbox + except Exception as exc: + fallback_bbox = _bbox_from_env() or _DEFAULT_BBOX + dhis2_base_url = os.getenv("DHIS2_BASE_URL", "") + LOGGER.warning( + ( + "Failed to load DHIS2 org-unit defaults at startup from DHIS2_BASE_URL=%s. " + "The server will continue using fallback bbox %s and an empty org-unit GeoJSON cache. " + "This usually means the DHIS2 server is down, unreachable, or the credentials are invalid. " + "Original error: %s" + ), + dhis2_base_url, + fallback_bbox, + exc, + ) + return {"type": "FeatureCollection", "features": []}, fallback_bbox + + +# Best-effort startup defaults. Runtime flows can still provide explicit bbox. 
+ORG_UNITS_GEOJSON, BBOX = _load_org_unit_defaults() # env variables we need from .env # TODO: should probably centralize to shared config module diff --git a/src/eo_api/data_registry/routes.py b/src/eo_api/data_registry/routes.py index 36f4fce..029b921 100644 --- a/src/eo_api/data_registry/routes.py +++ b/src/eo_api/data_registry/routes.py @@ -4,6 +4,7 @@ from fastapi import APIRouter, HTTPException +from ..shared.api_errors import api_error from .services import datasets router = APIRouter() @@ -19,7 +20,15 @@ def _get_dataset_or_404(dataset_id: str) -> dict[str, Any]: """Look up a dataset by ID or raise 404.""" dataset = datasets.get_dataset(dataset_id) if not dataset: - raise HTTPException(status_code=404, detail=f"Dataset '{dataset_id}' not found") + raise HTTPException( + status_code=404, + detail=api_error( + error="dataset_not_found", + error_code="DATASET_NOT_FOUND", + message=f"Dataset '{dataset_id}' not found", + resource_id=dataset_id, + ), + ) return dataset diff --git a/src/eo_api/ogc/routes.py b/src/eo_api/ogc/routes.py index 106cbd2..971f80e 100644 --- a/src/eo_api/ogc/routes.py +++ b/src/eo_api/ogc/routes.py @@ -2,24 +2,17 @@ from __future__ import annotations -import json import uuid from html import escape -from pathlib import Path from typing import Any from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request, Response from fastapi.responses import HTMLResponse from ..publications.schemas import PublishedResourceExposure -from ..publications.services import ( - collection_id_for_resource, - ensure_source_dataset_publications, - get_published_resource, - get_published_resource_by_collection_id, - list_published_resources, -) -from ..workflows.schemas import ApiErrorResponse, WorkflowExecuteEnvelopeRequest, WorkflowJobStatus +from ..publications.services import collection_id_for_resource, get_published_resource +from ..shared.api_errors import api_error +from ..workflows.schemas import WorkflowExecuteEnvelopeRequest, 
WorkflowJobStatus from ..workflows.services.definitions import load_workflow_definition from ..workflows.services.engine import execute_workflow from ..workflows.services.job_store import get_job, get_job_result, initialize_job, list_jobs @@ -31,123 +24,50 @@ _PROCESS_TITLE = "Generic DHIS2 workflow" -def _api_error( - *, - error: str, - error_code: str, - message: str, - resource_id: str | None = None, - process_id: str | None = None, - job_id: str | None = None, - status: str | None = None, -) -> dict[str, str]: - return ApiErrorResponse( - error=error, - error_code=error_code, - message=message, - resource_id=resource_id, - process_id=process_id, - job_id=job_id, - status=status, - ).model_dump(exclude_none=True) - - -@router.get("/collections", response_model=None) -def list_collections(request: Request, f: str | None = None) -> dict[str, Any] | HTMLResponse: - """List OGC collections directly from live publication state.""" - collections = [_collection_summary(resource, request) for resource in _ogc_resources()] +@router.get("", response_model=None) +def get_ogc_root(request: Request, f: str | None = None) -> dict[str, Any] | HTMLResponse: + """Return a native OGC landing page for processes and jobs.""" + base_url = str(request.base_url).rstrip("/") body = { - "collections": collections, + "title": "DHIS2 EO API", + "description": ( + "Native OGC API landing page for workflow processes and jobs. " + "Collections and items are served by the mounted geospatial publication layer." 
+ ), "links": [ {"rel": "self", "type": "application/json", "href": _request_href(request, f="json")}, {"rel": "alternate", "type": "text/html", "href": _request_href(request, f="html")}, - {"rel": "root", "type": "application/json", "href": str(request.base_url).rstrip("/") + "/ogcapi"}, + {"rel": "service-desc", "type": "application/vnd.oai.openapi+json;version=3.0", "href": "/ogcapi/openapi"}, + {"rel": "conformance", "type": "application/json", "href": f"{base_url}/ogcapi/conformance"}, + {"rel": "data", "type": "application/json", "href": f"{base_url}/ogcapi/collections"}, + {"rel": "processes", "type": "application/json", "href": f"{base_url}/ogcapi/processes"}, + {"rel": "jobs", "type": "application/json", "href": f"{base_url}/ogcapi/jobs"}, + ], + "navigation": [ + { + "title": "Browse Collections", + "description": "Open the OGC publication surface for collections and items.", + "href": f"{base_url}/ogcapi/collections?f=html", + }, + { + "title": "List Processes", + "description": "View the exposed OGC process catalog backed by the native workflow engine.", + "href": f"{base_url}/ogcapi/processes", + }, + { + "title": "List Jobs", + "description": "Inspect OGC job records backed by the native job store.", + "href": f"{base_url}/ogcapi/jobs", + }, + { + "title": "Conformance", + "description": "See the standards conformance declarations for the mounted OGC publication layer.", + "href": f"{base_url}/ogcapi/conformance", + }, ], } if _wants_html(request, f): - return HTMLResponse(_render_collections_html(collections)) - return body - - -@router.get("/collections/{collection_id}", response_model=None) -def get_collection(collection_id: str, request: Request, f: str | None = None) -> dict[str, Any] | HTMLResponse: - """Return one dynamic collection document backed by publication truth.""" - resource = get_published_resource_by_collection_id(collection_id) - if resource is None or resource.exposure != PublishedResourceExposure.OGC: - raise HTTPException( - 
status_code=404, - detail=_api_error( - error="collection_not_found", - error_code="COLLECTION_NOT_FOUND", - message=f"Unknown collection_id '{collection_id}'", - resource_id=collection_id, - ), - ) - body = _collection_detail(resource, request) - if _wants_html(request, f): - return HTMLResponse(_render_collection_html(body)) - return body - - -@router.get("/collections/{collection_id}/items", response_model=None) -def get_collection_items( - collection_id: str, - request: Request, - limit: int = 20, - offset: int = 0, - period: str | None = None, - view: str | None = None, - f: str | None = None, -) -> dict[str, Any] | HTMLResponse: - """Return dynamic feature items for a GeoJSON-backed published collection.""" - resource = get_published_resource_by_collection_id(collection_id) - if resource is None or resource.exposure != PublishedResourceExposure.OGC: - raise HTTPException( - status_code=404, - detail=_api_error( - error="collection_not_found", - error_code="COLLECTION_NOT_FOUND", - message=f"Unknown collection_id '{collection_id}'", - resource_id=collection_id, - ), - ) - if resource.path is None or Path(resource.path).suffix.lower() != ".geojson": - raise HTTPException( - status_code=409, - detail=_api_error( - error="collection_items_unavailable", - error_code="COLLECTION_ITEMS_UNAVAILABLE", - message=f"Collection '{collection_id}' does not expose OGC items", - resource_id=collection_id, - ), - ) - items = _load_feature_collection(resource.path) - features = items.get("features", []) - if period is not None: - features = [feature for feature in features if feature.get("properties", {}).get("period") == period] - matched = len(features) - page = features[offset : offset + max(limit, 0)] - body = { - "type": "FeatureCollection", - "id": collection_id, - "title": resource.title, - "numberMatched": matched, - "numberReturned": len(page), - "features": page, - "links": _item_links(request, resource, limit=limit, offset=offset, matched=matched, period=period), - } 
- if _wants_html(request, f): - return HTMLResponse( - _render_items_html( - resource, - page, - limit=limit, - offset=offset, - matched=matched, - selected_period=period, - view_mode=view or "browse", - ) - ) + return HTMLResponse(_render_ogc_root_html(body)) return body @@ -256,12 +176,11 @@ def execute_process( {"rel": "results", "type": "application/json", "href": results_url}, ] if publication is not None and publication.exposure == PublishedResourceExposure.OGC: - collection_id = collection_id_for_resource(publication) links.append( { "rel": "collection", "type": "application/json", - "href": _collection_href(request, collection_id), + "href": _collection_href(request, collection_id_for_resource(publication)), } ) return { @@ -287,7 +206,7 @@ def get_ogc_job(job_id: str, request: Request) -> dict[str, Any]: if job is None: raise HTTPException( status_code=404, - detail=_api_error( + detail=api_error( error="job_not_found", error_code="JOB_NOT_FOUND", message=f"Unknown job_id '{job_id}'", @@ -332,7 +251,7 @@ def get_ogc_job_results(job_id: str) -> dict[str, Any]: if job is None: raise HTTPException( status_code=404, - detail=_api_error( + detail=api_error( error="job_not_found", error_code="JOB_NOT_FOUND", message=f"Unknown job_id '{job_id}'", @@ -343,7 +262,7 @@ def get_ogc_job_results(job_id: str) -> dict[str, Any]: if result is None: raise HTTPException( status_code=409, - detail=_api_error( + detail=api_error( error="job_result_unavailable", error_code="JOB_RESULT_UNAVAILABLE", message=f"Result is not available for job '{job_id}'", @@ -358,7 +277,7 @@ def _require_process(process_id: str) -> None: if process_id != _PROCESS_ID: raise HTTPException( status_code=404, - detail=_api_error( + detail=api_error( error="process_not_found", error_code="PROCESS_NOT_FOUND", message=f"Unknown process_id '{process_id}'", @@ -390,208 +309,68 @@ def _collection_href(request: Request, collection_id: str) -> str: return str(request.base_url).rstrip("/") + 
f"/ogcapi/collections/{collection_id}" -def _ogc_resources() -> list[Any]: - ensure_source_dataset_publications() - return list_published_resources(exposure=PublishedResourceExposure.OGC) - - -def _collection_summary(resource: Any, request: Request) -> dict[str, Any]: - collection_id = collection_id_for_resource(resource) - item_type = "feature" if resource.path and Path(resource.path).suffix.lower() == ".geojson" else "coverage" - representation_type = "Feature Collection" if item_type == "feature" else "Coverage" - links = [ - {"rel": "self", "type": "application/json", "href": _collection_href(request, collection_id)}, - {"rel": "alternate", "type": "text/html", "href": _collection_href(request, collection_id) + "?f=html"}, - ] - if item_type == "feature": - links.extend( - [ - { - "rel": "items", - "type": "application/geo+json", - "href": _collection_href(request, collection_id) + "/items", - }, - { - "rel": "items-html", - "type": "text/html", - "href": _collection_href(request, collection_id) + "/items?f=html", - }, - ] - ) - for link in resource.links: - href = str(link.get("href", "")) - if href: - links.append( - { - "rel": str(link.get("rel", "related")), - "type": "text/html" if link.get("rel") == "analytics" else "application/json", - "href": _absolute_href(request, href), - } - ) - return { - "id": collection_id, - "title": resource.title, - "description": resource.description, - "itemType": item_type, - "representationType": representation_type, - "extent": {"spatial": {"bbox": [_bbox_for_resource(resource)]}}, - "links": links, - } - - -def _collection_detail(resource: Any, request: Request) -> dict[str, Any]: - collection = _collection_summary(resource, request) - collection["crs"] = ["http://www.opengis.net/def/crs/OGC/1.3/CRS84"] - collection["storageCrs"] = "http://www.opengis.net/def/crs/OGC/1.3/CRS84" - collection["keywords"] = _keywords_for_resource(resource) - collection["metadata"] = { - "resource_id": resource.resource_id, - 
"resource_class": str(resource.resource_class), - "dataset_id": resource.dataset_id, - "workflow_id": resource.workflow_id, - "job_id": resource.job_id, - } - return collection - - -def _item_links( - request: Request, resource: Any, *, limit: int, offset: int, matched: int, period: str | None -) -> list[dict[str, str]]: - collection_id = collection_id_for_resource(resource) - base_href = _collection_href(request, collection_id) + "/items" - period_query = f"&period={period}" if period is not None else "" - links = [ - { - "rel": "self", - "type": "application/geo+json", - "href": f"{base_href}?limit={limit}&offset={offset}{period_query}", - }, - {"rel": "collection", "type": "application/json", "href": _collection_href(request, collection_id)}, - { - "rel": "alternate", - "type": "text/html", - "href": f"{base_href}?limit={limit}&offset={offset}{period_query}&f=html", - }, - ] - if offset + limit < matched: - links.append( - { - "rel": "next", - "type": "application/geo+json", - "href": f"{base_href}?limit={limit}&offset={offset + limit}{period_query}", - } - ) - if offset > 0: - links.append( - { - "rel": "prev", - "type": "application/geo+json", - "href": f"{base_href}?limit={limit}&offset={max(0, offset - limit)}{period_query}", - } - ) - return links - - -def _load_feature_collection(path_value: str) -> dict[str, Any]: - path = Path(path_value) - if not path.exists(): - raise HTTPException( - status_code=404, - detail=_api_error( - error="published_asset_not_found", - error_code="PUBLISHED_ASSET_NOT_FOUND", - message=f"Published feature asset does not exist: {path_value}", - ), - ) - payload = json.loads(path.read_text(encoding="utf-8")) - if not isinstance(payload, dict): - raise HTTPException( - status_code=409, - detail=_api_error( - error="published_asset_invalid", - error_code="PUBLISHED_ASSET_INVALID", - message="Published feature asset is not a GeoJSON object", - ), - ) - return payload +def _request_href(request: Request, **updates: Any) -> str: + 
params = dict(request.query_params) + for key, value in updates.items(): + if value is None: + params.pop(key, None) + else: + params[key] = str(value) + query = "&".join(f"{key}={value}" for key, value in params.items()) + suffix = f"?{query}" if query else "" + return f"{request.url.path}{suffix}" def _wants_html(request: Request, f: str | None) -> bool: - if f == "html": - return True - if f in {"json", "jsonld"}: - return False + if f is not None: + return f.lower() == "html" accept = request.headers.get("accept", "") - return "text/html" in accept.lower() - - -def _request_href(request: Request, *, f: str | None = None) -> str: - href = str(request.url).split("?")[0] - return f"{href}?f={f}" if f is not None else href - - -def _absolute_href(request: Request, href: str) -> str: - if href.startswith("http://") or href.startswith("https://"): - return href - return str(request.base_url).rstrip("/") + href - - -def _bbox_for_resource(resource: Any) -> list[float]: - bbox = resource.metadata.get("bbox") - if isinstance(bbox, list) and len(bbox) == 4: - return [float(value) for value in bbox] - return [-180.0, -90.0, 180.0, 90.0] - + return "text/html" in accept and "application/json" not in accept -def _keywords_for_resource(resource: Any) -> list[str]: - keywords = ["EO", "DHIS2", str(resource.resource_class), str(resource.kind)] - if resource.dataset_id is not None: - keywords.append(resource.dataset_id) - if resource.workflow_id is not None: - keywords.append(resource.workflow_id) - return keywords - -def _render_collections_html(collections: list[dict[str, Any]]) -> str: - rows = [] - for collection in collections: - links = {link["rel"]: link["href"] for link in collection["links"]} - analytics = links.get("analytics") - analytics_html = f'Analytics' if analytics else "" - title = escape(collection["title"]) - description = escape(collection["description"]) - rows.append( - f""" - - - {title} -

{description}
-
{escape(_dataset_note_for_collection(collection))}
- - {escape(collection["id"])} - {escape(collection["representationType"]).upper()} - - Browse - JSON - {analytics_html} - - - """ +def _render_ogc_root_html(body: dict[str, Any]) -> str: + nav_cards = "".join( + ( + '' + '' + '{title}' + '{description}' + '' + "" + ).format( + href=escape(item["href"]), + title=escape(item["title"]), + description=escape(item["description"]), + ) + for item in body.get("navigation", []) + ) + link_items = "".join( + ( + '' + '{rel}' + '{type}' + "" + ).format( + href=escape(link["href"]), + rel=escape(link["rel"]), + type=escape(link["type"]), ) + for link in body["links"] + ) return f""" - OGC Collections + {escape(body["title"])} - - -
- -
-
- Live OGC Surface -

Collections

-

- Live collection discovery from backend publication truth. New publications and deletions appear - here immediately without restarting the OGC surface. -

-
- OGC Home -
-
- -
-
- - - - - - - - - - - {"".join(rows)} - -
CollectionIdentifierTypeActions
-
-
- -""" - - -def _dataset_note_for_collection(collection: dict[str, Any]) -> str: - collection_id = str(collection.get("id", "")) - title = str(collection.get("title", "")) - if collection_id.startswith("workflow-output-"): - dataset_name = title.split(" output for ", 1)[-1] if " output for " in title else collection_id - return f"Source dataset: {dataset_name}" - return "" - - -def _render_collection_html(collection: dict[str, Any]) -> str: - links = {link["rel"]: link["href"] for link in collection["links"]} - analytics = links.get("analytics") - browse_items = links.get("items-html") - analytics_html = ( - f'Open Analytics Viewer' if analytics is not None else "" - ) - browse_html = f'Browse Items' if browse_items else "" - return f""" - - - - - {escape(collection["title"])} - - - -
- - Collection Detail -

{escape(collection["title"])}

-

{escape(collection["description"])}

-
- OGC Home - Back to Collections - Collection JSON - {browse_html} - {analytics_html} -
-
-
-

Collection Info

-
-
Identifier
{escape(collection["id"])}
-
Item type
{escape(collection["itemType"])}
-
Storage CRS
{escape(collection["storageCrs"])}
-
-
-
-

Metadata

-
{escape(json.dumps(collection["metadata"], indent=2))}
-
-
-
- -""" - - -def _render_items_html( - resource: Any, - features: list[dict[str, Any]], - *, - limit: int, - offset: int, - matched: int, - selected_period: str | None, - view_mode: str, -) -> str: - properties = [feature.get("properties", {}) for feature in features] - columns = [] - for props in properties: - for key in props: - if key not in columns: - columns.append(key) - header_html = "".join(f"{escape(column)}" for column in columns) - collection_id = collection_id_for_resource(resource) - analytics = next((link["href"] for link in resource.links if link.get("rel") == "analytics"), None) - page_geojson = json.dumps({"type": "FeatureCollection", "features": features}) - selected_period_json = json.dumps(selected_period) - period_query = f"&period={selected_period}" if selected_period is not None else "" - next_href = ( - f"/ogcapi/collections/{collection_id}/items?" - f"limit={limit}&offset={offset + limit}{period_query}&f=html&view={escape(view_mode)}" - if offset + limit < matched - else None - ) - prev_href = ( - f"/ogcapi/collections/{collection_id}/items?" - f"limit={limit}&offset={max(0, offset - limit)}{period_query}&f=html&view={escape(view_mode)}" - if offset > 0 - else None - ) - analytics_html = f'Open Analytics Viewer' if analytics else "" - browse_active = view_mode != "analytics" - browse_href = ( - f"/ogcapi/collections/{collection_id}/items?limit={limit}&offset={offset}{period_query}&f=html&view=browse" - ) - analytics_href = ( - f"/ogcapi/collections/{collection_id}/items?limit={limit}&offset={offset}{period_query}&f=html&view=analytics" - ) - embedded_analytics_html = ( - f'' - if analytics - else '
Analytics viewer unavailable for this collection.
' - ) - return f""" - - - - - {escape(resource.title)} Items - - -
- - Collection Items -

{escape(resource.title)}

-

- Dynamic items page over live publication state. For time-aware exploration, use the analytics viewer. -

-
- OGC Home - Back to Collections - Back to Collection - JSON - {analytics_html} -
- -
- {f'Previous' if prev_href else ""} - {f'Next' if next_href else ""} -
-

Showing {offset + 1 if matched else 0} to {offset + len(features)} of {matched} items.

- {'
' if browse_active else '
'} - -
All Periods
-
- { - ( - f'''
-
-
-
-
Value Scale
-
-
- 0 - 0 -
-
-
-
-
Current Page
-
- - {header_html} - -
- -
-
-
''' - if browse_active - else f'''
-
- {embedded_analytics_html} -
-
''' - ) - } +
OGC API
+

{escape(body["title"])}

+

{escape(body["description"])}

+ +
+

API Links

+ +
- """ diff --git a/src/eo_api/ogc_api/__init__.py b/src/eo_api/ogc_api/__init__.py index 0e7d977..1433c7d 100644 --- a/src/eo_api/ogc_api/__init__.py +++ b/src/eo_api/ogc_api/__init__.py @@ -1,5 +1,50 @@ -"""Mounted pygeoapi application.""" +"""Mounted pygeoapi application with publication-aware runtime refresh.""" -from pygeoapi.starlette_app import APP as ogc_api_app +from __future__ import annotations + +import asyncio +import importlib +import os +from types import ModuleType +from typing import Any + +from starlette.types import Receive, Scope, Send + +from ..publications.pygeoapi import write_generated_pygeoapi_documents + +_STARLETTE_APP_MODULE = "pygeoapi.starlette_app" + + +class DynamicPygeoapiApp: + """Refresh pygeoapi runtime documents before serving mounted requests. + + This keeps the mounted publication surface aligned with live publication + truth without requiring an application restart after each publication + change. + """ + + def __init__(self) -> None: + self._module: ModuleType | None = None + # pygeoapi keeps request handlers and config as module globals. + # Serialize mounted requests so reloads cannot race with in-flight + # requests and produce mixed old/new publication state. 
+ self._lock = asyncio.Lock() + + async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: + async with self._lock: + config_path, openapi_path = write_generated_pygeoapi_documents() + os.environ["PYGEOAPI_CONFIG"] = str(config_path) + os.environ["PYGEOAPI_OPENAPI"] = str(openapi_path) + + if self._module is None: + self._module = importlib.import_module(_STARLETTE_APP_MODULE) + else: + self._module = importlib.reload(self._module) + + app = getattr(self._module, "APP") + await app(scope, receive, send) + + +ogc_api_app: Any = DynamicPygeoapiApp() __all__ = ["ogc_api_app"] diff --git a/src/eo_api/publications/pygeoapi.py b/src/eo_api/publications/pygeoapi.py index 16c4ffc..8521c6e 100644 --- a/src/eo_api/publications/pygeoapi.py +++ b/src/eo_api/publications/pygeoapi.py @@ -9,10 +9,16 @@ from ..data_manager.services.downloader import DOWNLOAD_DIR, get_zarr_path from ..data_registry.services.datasets import get_dataset -from .schemas import PublishedResource, PublishedResourceExposure, PublishedResourceKind +from .schemas import ( + PublishedResource, + PublishedResourceClass, + PublishedResourceExposure, + PublishedResourceKind, +) from .services import collection_id_for_resource, ensure_source_dataset_publications, list_published_resources _DEFAULT_SERVER_URL = "http://127.0.0.1:8000/ogcapi" +_TEMPLATES_DIR = Path(__file__).resolve().parent / "pygeoapi_templates" def build_pygeoapi_config(*, server_url: str = _DEFAULT_SERVER_URL) -> dict[str, Any]: @@ -25,6 +31,7 @@ def build_pygeoapi_config(*, server_url: str = _DEFAULT_SERVER_URL) -> dict[str, "url": server_url, "mimetype": "application/json; charset=UTF-8", "encoding": "utf-8", + "templates": {"path": str(_TEMPLATES_DIR)}, "languages": ["en-US"], "limits": {"default_items": 20, "max_items": 50}, "map": { @@ -110,7 +117,7 @@ def _build_pygeoapi_resource(resource: PublishedResource) -> dict[str, Any]: return { "type": "collection", "title": {"en": resource.title}, - "description": {"en": 
resource.description}, + "description": {"en": _description_for_resource(resource)}, "keywords": _keywords_for_resource(resource), "links": _pygeoapi_links(resource), "extents": { @@ -162,6 +169,47 @@ def _keywords_for_resource(resource: PublishedResource) -> list[str]: return keywords +def _description_for_resource(resource: PublishedResource) -> str: + metadata = resource.metadata + + if resource.resource_class == PublishedResourceClass.SOURCE: + source = metadata.get("source") + variable = metadata.get("variable") + period_type = metadata.get("period_type") + parts = ["Source dataset"] + if source: + parts.append(f"from {source}") + if variable: + parts.append(f"for {variable}") + if period_type: + parts.append(f"with {period_type} cadence") + return " ".join(parts) + "." + + if resource.resource_class == PublishedResourceClass.DERIVED: + dataset_id = resource.dataset_id or metadata.get("dataset_id") + workflow_id = resource.workflow_id or metadata.get("workflow_id") + feature_count = metadata.get("feature_count") + value_count = metadata.get("value_count") + + parts = ["Derived workflow output"] + if workflow_id: + parts.append(f"from {workflow_id}") + if dataset_id: + parts.append(f"for {dataset_id}") + + details: list[str] = [] + if feature_count is not None: + details.append(f"{feature_count} features") + if value_count is not None: + details.append(f"{value_count} values") + if details: + parts.append(f"({', '.join(details)})") + + return " ".join(parts) + "." 
+ + return resource.description + + def _build_provider(resource: PublishedResource) -> dict[str, Any]: if resource.kind == PublishedResourceKind.COVERAGE: dataset = get_dataset(str(resource.dataset_id)) diff --git a/src/eo_api/publications/pygeoapi_templates/collections/collection.html b/src/eo_api/publications/pygeoapi_templates/collections/collection.html new file mode 100644 index 0000000..e52eb94 --- /dev/null +++ b/src/eo_api/publications/pygeoapi_templates/collections/collection.html @@ -0,0 +1,291 @@ +{% extends "_base.html" %} +{% block title %}{{ super() }} {{ data['title'] }} {% endblock %} +{% block desc %}{{ data.get('description','') | truncate(250) }}{% endblock %} +{% block tags %}{{ data.get('keywords',[]) | join(',') }}{% endblock %} +{% block crumbs %}{{ super() }} +/ {% trans %}Collections{% endtrans %} +/ {{ data['title'] | truncate( 25 ) }} +{% endblock %} + +{% block extrahead %} + + + + +{% endblock %} + +{% block body %} +
+
+
+

{{ data['title'] }}

+

{{ data['description'] }}

+

+ {% for kw in data['keywords'] %} + {{ kw }} + {% endfor %} +

+
+
+
+
+
+
+
+ + {% set ns = namespace(header_printed=false) %} + {% for link in data['links'] %} + {% if link['rel'] == 'license' %} + {% if not ns.header_printed %} +

{% trans %}License{% endtrans %}

+ {% set ns.header_printed = true %} + {% endif %} + + {% endif %} + {% endfor %} + + {% if data['itemType'] == 'feature' or data['itemType'] == 'record' %} + {% set analytics_links = data['links'] | selectattr('rel', 'equalto', 'analytics') | list %} +
+
+

{% trans %}Browse{% endtrans %}

+

{% trans %}Open the collection items view with map and attribute table.{% endtrans %}

+
+ {% trans %}Open items{% endtrans %} + +
+ {% if analytics_links %} +
+

{% trans %}Analytics{% endtrans %}

+

{% trans %}Open the separate analytics viewer for this published resource.{% endtrans %}

+ {% for link in analytics_links %} + + {{ link['title'] or 'Analytics Viewer' }} + + {% endfor %} +
+ {% endif %} +
+

{% trans %}Queryables{% endtrans %}

+

{% trans %}Inspect the fields that can be used for filtering, such as period.{% endtrans %}

+ + {% trans %}Open queryables{% endtrans %} + +
+
+

{% trans %}Schema{% endtrans %}

+

{% trans %}View the collection schema exposed by the publication layer.{% endtrans %}

+ + {% trans %}Open schema{% endtrans %} + +
+ {% set providers = config.get('resources', {}).get(data['id'], {}).get('providers', []) %} + {% for provider in providers %} + {% if 'tile' in provider['type'] %} +
+

{% trans %}Tiles{% endtrans %}

+

{% trans %}Open tile endpoints for map-oriented publication access.{% endtrans %}

+ + {% trans %}Open tiles{% endtrans %} + +
+ {% endif %} + {% endfor %} +
+ {% endif %} + + {% if 'parameter_names' in data %} +

Parameters

+ + + + + + + {% for parameter in data['parameter_names'].values() %} + + + + + + {% endfor %} +
idnameunits
{{ parameter['id'] }}{{ parameter['name'] }}{{ parameter['unit']['symbol']['value'] }}
+ {% endif %} + +

{% trans %}Links{% endtrans %}

+

{% trans %}Raw protocol and related links for this collection.{% endtrans %}

+ + {% if data['itemType'] == 'feature' %} +

{% trans %}Reference Systems{% endtrans %}

+
    + {% for crs in data['crs'] %} +
  • + {{ crs }} +
  • + {% endfor %} +
+

{% trans %}Storage CRS{% endtrans %}

+ + {% endif %} + +
+{% endblock %} + +{% block extrafoot %} + +{% endblock %} diff --git a/src/eo_api/publications/routes.py b/src/eo_api/publications/routes.py index de228d8..a3f328a 100644 --- a/src/eo_api/publications/routes.py +++ b/src/eo_api/publications/routes.py @@ -2,6 +2,7 @@ from fastapi import APIRouter, HTTPException +from ..shared.api_errors import api_error from .schemas import PublishedResource, PublishedResourceClass, PublishedResourceExposure, PublishedResourceListResponse from .services import ensure_source_dataset_publications, get_published_resource, list_published_resources @@ -33,5 +34,13 @@ def get_publication(resource_id: str) -> PublishedResource: ensure_source_dataset_publications() resource = get_published_resource(resource_id) if resource is None: - raise HTTPException(status_code=404, detail=f"Unknown resource_id '{resource_id}'") + raise HTTPException( + status_code=404, + detail=api_error( + error="published_resource_not_found", + error_code="PUBLISHED_RESOURCE_NOT_FOUND", + message=f"Unknown resource_id '{resource_id}'", + resource_id=resource_id, + ), + ) return resource diff --git a/src/eo_api/publications/services.py b/src/eo_api/publications/services.py index af7d01f..45cb1d8 100644 --- a/src/eo_api/publications/services.py +++ b/src/eo_api/publications/services.py @@ -3,6 +3,7 @@ from __future__ import annotations import datetime as dt +import json from pathlib import Path from typing import TYPE_CHECKING @@ -26,7 +27,11 @@ def ensure_source_dataset_publications() -> list[PublishedResource]: resource_class=PublishedResourceClass.SOURCE, kind=PublishedResourceKind.COVERAGE, title=str(dataset.get("name") or dataset["id"]), - description=f"Source dataset: {dataset.get('source') or dataset['id']}", + description=( + f"Source dataset from {dataset.get('source') or dataset['id']}" + f" for {dataset.get('variable') or dataset['id']}" + f" with {dataset.get('period_type') or 'native'} cadence." 
+ ), dataset_id=str(dataset["id"]), path=None, ogc_path=f"/ogcapi/collections/{dataset['id']}", @@ -66,17 +71,29 @@ def register_workflow_output_publication( resource_id = f"workflow-output-{response.run_id}" existing = get_published_resource(resource_id) timestamp = _utc_now() + publication_path = published_path or response.output_file + analytics_metadata = _analytics_metadata_for_published_asset(publication_path) + links = [ + {"rel": "job", "href": f"/workflows/jobs/{response.run_id}"}, + {"rel": "job-result", "href": f"/workflows/jobs/{response.run_id}/result"}, + {"rel": "collection", "href": f"/ogcapi/collections/{resource_id}"}, + ] + if analytics_metadata["eligible"]: + links.append({"rel": "analytics", "href": f"/analytics/publications/{resource_id}/viewer"}) record = PublishedResource( resource_id=resource_id, resource_class=PublishedResourceClass.DERIVED, kind=PublishedResourceKind.FEATURE_COLLECTION, title=f"{response.workflow_id} output for {response.dataset_id}", - description="Derived workflow output registered for OGC publication.", + description=( + f"Derived workflow output from {response.workflow_id} for {response.dataset_id} " + f"({response.feature_count} features, {response.value_count} values)." 
+ ), dataset_id=response.dataset_id, workflow_id=response.workflow_id, job_id=response.run_id, run_id=response.run_id, - path=published_path or response.output_file, + path=publication_path, ogc_path=f"/ogcapi/collections/{resource_id}", asset_format=asset_format or "datavalueset-json", exposure=exposure, @@ -90,13 +107,11 @@ def register_workflow_output_publication( "value_count": response.value_count, "bbox": response.bbox, "native_output_file": response.output_file, + "period_count": analytics_metadata["period_count"], + "has_period_field": analytics_metadata["has_period_field"], + "analytics_eligible": analytics_metadata["eligible"], }, - links=[ - {"rel": "job", "href": f"/workflows/jobs/{response.run_id}"}, - {"rel": "job-result", "href": f"/workflows/jobs/{response.run_id}/result"}, - {"rel": "collection", "href": f"/ogcapi/collections/{resource_id}"}, - {"rel": "analytics", "href": f"/analytics/publications/{resource_id}/viewer"}, - ], + links=links, ) _write_resource(record) return record @@ -179,3 +194,41 @@ def _collection_id_for_resource(resource: PublishedResource) -> str: if resource.resource_class == PublishedResourceClass.SOURCE and resource.dataset_id is not None: return resource.dataset_id return resource.resource_id + + +def _analytics_metadata_for_published_asset(path_value: str | None) -> dict[str, bool | int]: + if path_value is None: + return {"eligible": False, "period_count": 0, "has_period_field": False} + + path = Path(path_value) + if path.suffix.lower() != ".geojson" or not path.exists(): + return {"eligible": False, "period_count": 0, "has_period_field": False} + + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return {"eligible": False, "period_count": 0, "has_period_field": False} + + features = payload.get("features") + if not isinstance(features, list): + return {"eligible": False, "period_count": 0, "has_period_field": False} + + periods: set[str] = set() + 
has_period_field = False + for feature in features: + if not isinstance(feature, dict): + continue + properties = feature.get("properties", {}) + if not isinstance(properties, dict): + continue + if "period" in properties: + has_period_field = True + value = properties.get("period") + if value is not None: + periods.add(str(value)) + + return { + "eligible": has_period_field and len(periods) > 1, + "period_count": len(periods), + "has_period_field": has_period_field, + } diff --git a/src/eo_api/shared/api_errors.py b/src/eo_api/shared/api_errors.py new file mode 100644 index 0000000..544f5e4 --- /dev/null +++ b/src/eo_api/shared/api_errors.py @@ -0,0 +1,83 @@ +"""Shared typed API error helpers.""" + +from __future__ import annotations + +from typing import NoReturn + +from fastapi import HTTPException +from pydantic import BaseModel + + +class ApiErrorResponse(BaseModel): + """Stable API error envelope.""" + + error: str + error_code: str + message: str + resource_id: str | None = None + process_id: str | None = None + job_id: str | None = None + run_id: str | None = None + status: str | None = None + failed_component: str | None = None + failed_component_version: str | None = None + + +def api_error( + *, + error: str, + error_code: str, + message: str, + resource_id: str | None = None, + process_id: str | None = None, + job_id: str | None = None, + run_id: str | None = None, + status: str | None = None, + failed_component: str | None = None, + failed_component_version: str | None = None, +) -> dict[str, str]: + """Build a stable API error envelope.""" + return ApiErrorResponse( + error=error, + error_code=error_code, + message=message, + resource_id=resource_id, + process_id=process_id, + job_id=job_id, + run_id=run_id, + status=status, + failed_component=failed_component, + failed_component_version=failed_component_version, + ).model_dump(exclude_none=True) + + +def raise_api_error( + status_code: int, + *, + error: str, + error_code: str, + message: str, + 
resource_id: str | None = None, + process_id: str | None = None, + job_id: str | None = None, + run_id: str | None = None, + status: str | None = None, + failed_component: str | None = None, + failed_component_version: str | None = None, +) -> NoReturn: + """Raise an HTTPException using the shared typed error envelope.""" + raise HTTPException( + status_code=status_code, + detail=api_error( + error=error, + error_code=error_code, + message=message, + resource_id=resource_id, + process_id=process_id, + job_id=job_id, + run_id=run_id, + status=status, + failed_component=failed_component, + failed_component_version=failed_component_version, + ), + ) diff --git a/src/eo_api/workflows/routes.py b/src/eo_api/workflows/routes.py index c359603..55a0cd4 100644 --- a/src/eo_api/workflows/routes.py +++ b/src/eo_api/workflows/routes.py @@ -6,8 +6,8 @@ from ..publications.schemas import PublishedResourceExposure from ..publications.services import collection_id_for_resource, get_published_resource +from ..shared.api_errors import api_error from .schemas import ( - ApiErrorResponse, WorkflowAssemblyExecuteRequest, WorkflowCatalogItem, WorkflowCatalogResponse, @@ -29,27 +29,6 @@ router = APIRouter() -def _api_error( - *, - error: str, - error_code: str, - message: str, - resource_id: str | None = None, - process_id: str | None = None, - job_id: str | None = None, - status: str | None = None, -) -> dict[str, str]: - return ApiErrorResponse( - error=error, - error_code=error_code, - message=message, - resource_id=resource_id, - process_id=process_id, - job_id=job_id, - status=status, - ).model_dump(exclude_none=True) - - @router.get("", response_model=WorkflowCatalogResponse) def list_workflows() -> WorkflowCatalogResponse: """List all allowlisted workflow definitions.""" @@ -58,7 +37,7 @@ def list_workflows() -> WorkflowCatalogResponse: except ValueError as exc: raise HTTPException( status_code=500, - detail=_api_error( + detail=api_error( error="workflow_catalog_unavailable", 
error_code="CATALOG_UNAVAILABLE", message=str(exc), @@ -98,7 +77,7 @@ def get_workflow_job(job_id: str, request: Request) -> WorkflowJobRecord: if job is None: raise HTTPException( status_code=404, - detail=_api_error( + detail=api_error( error="job_not_found", error_code="JOB_NOT_FOUND", message=f"Unknown job_id '{job_id}'", @@ -119,12 +98,14 @@ def get_workflow_job(job_id: str, request: Request) -> WorkflowJobRecord: "href": f"{str(request.base_url).rstrip('/')}/ogcapi/collections/{collection_id}", } ) - links.append( - { - "rel": "analytics", - "href": f"{str(request.base_url).rstrip('/')}/analytics/publications/{publication.resource_id}/viewer", - } - ) + analytics_link = next((link for link in publication.links if link.get("rel") == "analytics"), None) + if analytics_link is not None: + links.append( + { + "rel": "analytics", + "href": f"{str(request.base_url).rstrip('/')}{analytics_link['href']}", + } + ) return job.model_copy(update={"links": links}) @@ -135,7 +116,7 @@ def get_workflow_job_result(job_id: str) -> dict[str, Any]: if job is None: raise HTTPException( status_code=404, - detail=_api_error( + detail=api_error( error="job_not_found", error_code="JOB_NOT_FOUND", message=f"Unknown job_id '{job_id}'", @@ -146,7 +127,7 @@ def get_workflow_job_result(job_id: str) -> dict[str, Any]: if result is None: raise HTTPException( status_code=409, - detail=_api_error( + detail=api_error( error="job_result_unavailable", error_code="JOB_RESULT_UNAVAILABLE", message=f"Result is not available for job '{job_id}'", @@ -164,7 +145,7 @@ def get_workflow_job_trace(job_id: str) -> dict[str, Any]: if job is None: raise HTTPException( status_code=404, - detail=_api_error( + detail=api_error( error="job_not_found", error_code="JOB_NOT_FOUND", message=f"Unknown job_id '{job_id}'", @@ -175,7 +156,7 @@ def get_workflow_job_trace(job_id: str) -> dict[str, Any]: if trace is None: raise HTTPException( status_code=409, - detail=_api_error( + detail=api_error( 
error="job_trace_unavailable", error_code="JOB_TRACE_UNAVAILABLE", message=f"Trace is not available for job '{job_id}'", @@ -193,7 +174,7 @@ def delete_workflow_job(job_id: str) -> dict[str, Any]: if deleted is None: raise HTTPException( status_code=404, - detail=_api_error( + detail=api_error( error="job_not_found", error_code="JOB_NOT_FOUND", message=f"Unknown job_id '{job_id}'", @@ -219,7 +200,7 @@ def cleanup_workflow_jobs( except ValueError as exc: raise HTTPException( status_code=422, - detail=_api_error( + detail=api_error( error="cleanup_policy_invalid", error_code="CLEANUP_POLICY_INVALID", message=str(exc), diff --git a/src/eo_api/workflows/schemas.py b/src/eo_api/workflows/schemas.py index 6f95077..6d4f27a 100644 --- a/src/eo_api/workflows/schemas.py +++ b/src/eo_api/workflows/schemas.py @@ -210,18 +210,6 @@ class WorkflowJobCleanupResponse(BaseModel): deleted_job_ids: list[str] -class ApiErrorResponse(BaseModel): - """Stable API error envelope.""" - - error: str - error_code: str - message: str - resource_id: str | None = None - process_id: str | None = None - job_id: str | None = None - status: str | None = None - - class WorkflowCatalogItem(BaseModel): """Discoverable workflow definition summary.""" diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py index c1110de..1453f45 100644 --- a/src/eo_api/workflows/services/engine.py +++ b/src/eo_api/workflows/services/engine.py @@ -16,6 +16,7 @@ from ...components import services as component_services from ...data_registry.services.datasets import get_dataset from ...publications.services import register_workflow_output_publication +from ...shared.api_errors import api_error from ..schemas import WorkflowExecuteRequest, WorkflowExecuteResponse, WorkflowJobStatus from .definitions import WorkflowDefinition, WorkflowPublicationPolicy, load_workflow_definition from .job_store import initialize_job, mark_job_failed, mark_job_running, mark_job_success @@ -96,7 +97,15 @@ 
def execute_workflow( dataset = get_dataset(request.dataset_id) if dataset is None: - raise HTTPException(status_code=404, detail=f"Dataset '{request.dataset_id}' not found") + raise HTTPException( + status_code=404, + detail=api_error( + error="dataset_not_found", + error_code="DATASET_NOT_FOUND", + message=f"Dataset '{request.dataset_id}' not found", + resource_id=request.dataset_id, + ), + ) artifacts = WorkflowArtifacts() @@ -107,7 +116,14 @@ def execute_workflow( try: workflow = load_workflow_definition(workflow_id) except ValueError as exc: - raise HTTPException(status_code=422, detail=str(exc)) from exc + raise HTTPException( + status_code=422, + detail=api_error( + error="workflow_definition_invalid", + error_code="WORKFLOW_DEFINITION_INVALID", + message=str(exc), + ), + ) from exc initialize_job( job_id=runtime.run_id, @@ -198,14 +214,14 @@ def execute_workflow( error = "upstream_unreachable" if exc.error_code == "UPSTREAM_UNREACHABLE" else "workflow_execution_failed" raise HTTPException( status_code=exc.status_code, - detail={ - "error": error, - "error_code": exc.error_code, - "message": str(exc), - "failed_component": exc.component, - "failed_component_version": exc.component_version, - "run_id": runtime.run_id, - }, + detail=api_error( + error=error, + error_code=exc.error_code, + message=str(exc), + run_id=runtime.run_id, + failed_component=exc.component, + failed_component_version=exc.component_version, + ), ) from exc except HTTPException: run_log_file = persist_run_log( @@ -237,14 +253,14 @@ def execute_workflow( last_component = runtime.component_runs[-1].component if runtime.component_runs else "unknown" raise HTTPException( status_code=500, - detail={ - "error": "workflow_execution_failed", - "error_code": "EXECUTION_FAILED", - "message": str(exc), - "failed_component": last_component, - "failed_component_version": "unknown", - "run_id": runtime.run_id, - }, + detail=api_error( + error="workflow_execution_failed", + 
error_code="EXECUTION_FAILED", + message=str(exc), + run_id=runtime.run_id, + failed_component=last_component, + failed_component_version="unknown", + ), ) from exc diff --git a/src/eo_api/workflows/services/job_store.py b/src/eo_api/workflows/services/job_store.py index 34fe1ac..9ebc6f4 100644 --- a/src/eo_api/workflows/services/job_store.py +++ b/src/eo_api/workflows/services/job_store.py @@ -208,7 +208,7 @@ def delete_job(job_id: str) -> dict[str, Any] | None: "deleted_publication": publication.resource_id if publication is not None else None, "materialized_config_path": str(config_path), "materialized_openapi_path": str(openapi_path), - "pygeoapi_runtime_reload_required": True, + "pygeoapi_runtime_reload_required": False, } diff --git a/src/eo_api/workflows/services/simple_mapper.py b/src/eo_api/workflows/services/simple_mapper.py index e8c016f..07fb368 100644 --- a/src/eo_api/workflows/services/simple_mapper.py +++ b/src/eo_api/workflows/services/simple_mapper.py @@ -2,9 +2,8 @@ from __future__ import annotations -from fastapi import HTTPException - from ...data_registry.services.datasets import get_dataset +from ...shared.api_errors import raise_api_error from ..schemas import ( Dhis2DataValueSetConfig, FeatureSourceConfig, @@ -23,6 +22,9 @@ def normalize_simple_request(payload: WorkflowRequest) -> tuple[WorkflowExecuteR inputs = payload dataset_id = inputs.dataset_id dataset = get_dataset(dataset_id) + start: str + end: str + feature_source: FeatureSourceConfig period_type = str(dataset.get("period_type", "")).lower() if dataset else "" @@ -45,7 +47,12 @@ def normalize_simple_request(payload: WorkflowRequest) -> tuple[WorkflowExecuteR start = f"{inputs.start_year}-01-01" end = f"{inputs.end_year}-12-31" else: - raise HTTPException(status_code=422, detail="Provide either start_date/end_date or start_year/end_year") + raise_api_error( + 422, + error="workflow_request_invalid", + error_code="REQUEST_VALIDATION_FAILED", + message="Provide either 
start_date/end_date or start_year/end_year", + ) if inputs.org_unit_level is not None: feature_source = FeatureSourceConfig( @@ -60,7 +67,12 @@ def normalize_simple_request(payload: WorkflowRequest) -> tuple[WorkflowExecuteR feature_id_property=inputs.feature_id_property, ) else: - raise HTTPException(status_code=422, detail="Provide org_unit_level or org_unit_ids") + raise_api_error( + 422, + error="workflow_request_invalid", + error_code="REQUEST_VALIDATION_FAILED", + message="Provide org_unit_level or org_unit_ids", + ) normalized = WorkflowExecuteRequest( dataset_id=dataset_id, diff --git a/tests/test_workflows.py b/tests/test_workflows.py index aa600af..2e5ca7b 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -77,6 +77,41 @@ def _patch_successful_execution(monkeypatch: pytest.MonkeyPatch) -> None: ) +def _patch_successful_execution_multi_period(monkeypatch: pytest.MonkeyPatch) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), [[[1.0]], [[2.0]]])}, + coords={"time": ["2024-01-01", "2024-02-01"], "lat": [0], "lon": [0]}, + ) + monkeypatch.setattr( + engine, + "get_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip"}, + ) + monkeypatch.setattr( + engine.component_services, + "feature_source_component", + lambda config: ( + {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + [0.0, 0.0, 1.0, 1.0], + ), + ) + monkeypatch.setattr(engine.component_services, "download_dataset_component", lambda **kwargs: None) + monkeypatch.setattr(engine.component_services, "temporal_aggregation_component", lambda **kwargs: ds) + monkeypatch.setattr( + engine.component_services, + "spatial_aggregation_component", + lambda **kwargs: [ + {"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}, + {"org_unit": "OU_1", "time": "2024-02-01", "value": 12.0}, + ], + ) + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": 
[{"value": "10.0"}, {"value": "12.0"}]}, "/tmp/data/out.json"), + ) + + def test_workflow_endpoint_exists_once() -> None: workflow_routes = { route.path @@ -95,9 +130,7 @@ def test_ogc_process_routes_exist() -> None: ogc_routes = { route.path for route in app.routes if isinstance(route, APIRoute) and route.path.startswith("/ogcapi") } - assert "/ogcapi/collections" in ogc_routes - assert "/ogcapi/collections/{collection_id}" in ogc_routes - assert "/ogcapi/collections/{collection_id}/items" in ogc_routes + assert "/ogcapi" in ogc_routes assert "/ogcapi/processes" in ogc_routes assert "/ogcapi/processes/{process_id}" in ogc_routes assert "/ogcapi/processes/{process_id}/execution" in ogc_routes @@ -141,6 +174,50 @@ def test_pygeoapi_mount_serves_landing_page(client: TestClient) -> None: assert response.status_code == 200 body = response.json() assert body["title"] == "DHIS2 EO API" + rels = {link["rel"] for link in body["links"]} + assert {"self", "alternate", "data", "processes", "jobs"} <= rels + + +def test_publication_endpoint_missing_uses_typed_error_envelope(client: TestClient) -> None: + response = client.get("/publications/does-not-exist") + assert response.status_code == 404 + body = response.json()["detail"] + assert body["error"] == "published_resource_not_found" + assert body["error_code"] == "PUBLISHED_RESOURCE_NOT_FOUND" + assert body["resource_id"] == "does-not-exist" + + +def test_analytics_endpoint_missing_uses_typed_error_envelope(client: TestClient) -> None: + response = client.get("/analytics/publications/does-not-exist") + assert response.status_code == 404 + body = response.json()["detail"] + assert body["error"] == "published_resource_not_found" + assert body["error_code"] == "PUBLISHED_RESOURCE_NOT_FOUND" + assert body["resource_id"] == "does-not-exist" + + +def test_mapper_validation_uses_typed_error_envelope() -> None: + payload = WorkflowRequest.model_construct( # type: ignore[call-arg] + workflow_id="dhis2_datavalue_set_v1", + 
dataset_id="chirps3_precipitation_daily", + org_unit_level=3, + data_element="DE_UID", + temporal_resolution=PeriodType.MONTHLY, + temporal_reducer=AggregationMethod.SUM, + spatial_reducer=AggregationMethod.MEAN, + overwrite=False, + dry_run=True, + feature_id_property="id", + include_component_run_details=False, + ) + + with pytest.raises(HTTPException) as exc_info: + normalize_simple_request(payload) + + assert exc_info.value.status_code == 422 + detail = cast(dict[str, Any], exc_info.value.detail) + assert detail["error"] == "workflow_request_invalid" + assert detail["error_code"] == "REQUEST_VALIDATION_FAILED" def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClient) -> None: @@ -282,13 +359,9 @@ def test_workflow_job_result_missing_uses_typed_error_envelope(client: TestClien assert body["job_id"] == "does-not-exist" -def test_ogc_collection_missing_uses_typed_error_envelope(client: TestClient) -> None: +def test_pygeoapi_collection_missing_returns_not_found(client: TestClient) -> None: response = client.get("/ogcapi/collections/does-not-exist", params={"f": "json"}) assert response.status_code == 404 - body = response.json()["detail"] - assert body["error"] == "collection_not_found" - assert body["error_code"] == "COLLECTION_NOT_FOUND" - assert body["resource_id"] == "does-not-exist" def test_ogc_job_results_unavailable_uses_typed_error_envelope( @@ -563,7 +636,7 @@ def test_workflow_job_endpoints_return_persisted_result( assert links["result"].endswith(f"/workflows/jobs/{run_id}/result") assert links["trace"].endswith(f"/workflows/jobs/{run_id}/trace") assert links["collection"].endswith(f"/ogcapi/collections/workflow-output-{run_id}") - assert links["analytics"].endswith(f"/analytics/publications/workflow-output-{run_id}/viewer") + assert "analytics" not in links assert "result" not in job_body results_response = client.get(f"/workflows/jobs/{run_id}/result") @@ -624,7 +697,7 @@ def 
test_delete_workflow_job_cascades_derived_artifacts( assert delete_body["job_id"] == run_id assert delete_body["deleted"] is True assert delete_body["deleted_publication"] == f"workflow-output-{run_id}" - assert delete_body["pygeoapi_runtime_reload_required"] is True + assert delete_body["pygeoapi_runtime_reload_required"] is False assert not job_file.exists() assert not run_log_file.exists() @@ -795,8 +868,9 @@ def test_workflow_success_registers_derived_publication( assert derived["asset_format"] == "geojson" assert derived["path"].endswith(".geojson") assert derived["metadata"]["native_output_file"].endswith(".json") - analytics_link = next(link for link in derived["links"] if link["rel"] == "analytics") - assert analytics_link["href"] == f"/analytics/publications/workflow-output-{run_id}/viewer" + assert derived["metadata"]["period_count"] == 1 + assert derived["metadata"]["analytics_eligible"] is False + assert not any(link["rel"] == "analytics" for link in derived["links"]) geojson = Path(derived["path"]).read_text(encoding="utf-8") assert '"org_unit_name"' in geojson assert '"period": "2024-01"' in geojson @@ -826,7 +900,7 @@ def test_dynamic_ogc_collection_routes_reflect_new_publication_without_restart( detail = detail_response.json() detail_links = {link["rel"]: link["href"] for link in detail["links"]} assert detail["id"] == collection_id - assert detail_links["analytics"].endswith(f"/analytics/publications/{collection_id}/viewer") + assert "analytics" not in detail_links items_response = client.get(f"/ogcapi/collections/{collection_id}/items", params={"f": "json", "limit": 5}) assert items_response.status_code == 200 @@ -882,6 +956,44 @@ def test_analytics_viewer_config_and_html_for_publication( assert resource_id in viewer_response.text +def test_multi_period_publication_adds_analytics_link( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + 
monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution_multi_period(monkeypatch) + + response = client.post("/workflows/dhis2-datavalue-set", json=_valid_public_payload()) + assert response.status_code == 200 + run_id = response.json()["run_id"] + resource_id = f"workflow-output-{run_id}" + + publication_response = client.get(f"/publications/{resource_id}") + assert publication_response.status_code == 200 + publication = publication_response.json() + assert publication["metadata"]["period_count"] == 2 + assert publication["metadata"]["analytics_eligible"] is True + analytics_link = next(link for link in publication["links"] if link["rel"] == "analytics") + assert analytics_link["href"] == f"/analytics/publications/{resource_id}/viewer" + + job_response = client.get(f"/workflows/jobs/{run_id}") + assert job_response.status_code == 200 + job_links = {item["rel"]: item["href"] for item in job_response.json()["links"]} + assert job_links["analytics"].endswith(f"/analytics/publications/{resource_id}/viewer") + + config_response = client.get("/publications/pygeoapi/config") + assert config_response.status_code == 200 + derived = config_response.json()["resources"][resource_id] + analytics_link = next(link for link in derived["links"] if link["rel"] == "analytics") + assert analytics_link["type"] == "text/html" + assert analytics_link["title"] == "Analytics Viewer" + assert analytics_link["href"].endswith(f"/analytics/publications/{resource_id}/viewer") + + def test_workflow_with_publication_disabled_does_not_register_derived_publication( client: TestClient, monkeypatch: pytest.MonkeyPatch, @@ -1084,10 +1196,7 @@ def test_generated_pygeoapi_config_includes_geojson_derived_resource( assert derived["providers"][0]["name"] == "GeoJSON" assert derived["providers"][0]["type"] == "feature" assert 
derived["providers"][0]["data"].endswith(".geojson") - analytics_link = next(link for link in derived["links"] if link["rel"] == "analytics") - assert analytics_link["type"] == "text/html" - assert analytics_link["title"] == "Analytics Viewer" - assert analytics_link["href"].endswith(f"/analytics/publications/workflow-output-{run_id}/viewer") + assert not any(link["rel"] == "analytics" for link in derived["links"]) def test_materialize_generated_pygeoapi_documents_writes_files( From 7eca31c462e7f7ceaeceba26d2cdc86b15e73c1f Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Thu, 19 Mar 2026 19:34:13 +0100 Subject: [PATCH 09/15] Simplify ogcapi landing page --- src/eo_api/ogc/routes.py | 307 ++++++++++++++++++++++----------------- 1 file changed, 177 insertions(+), 130 deletions(-) diff --git a/src/eo_api/ogc/routes.py b/src/eo_api/ogc/routes.py index 971f80e..2cf1d81 100644 --- a/src/eo_api/ogc/routes.py +++ b/src/eo_api/ogc/routes.py @@ -329,34 +329,30 @@ def _wants_html(request: Request, f: str | None) -> bool: def _render_ogc_root_html(body: dict[str, Any]) -> str: + # Map icon SVGs to navigation items by title + icons_map = { + "Browse Collections": '', + "List Processes": '', + "List Jobs": '', + "Conformance": '', + } + nav_cards = "".join( ( '' - '' - '{title}' - '{description}' - '' + '
{icon}
' + '{title}' + '{description}' + '
→
' "
" ).format( href=escape(item["href"]), title=escape(item["title"]), description=escape(item["description"]), + icon=icons_map.get(item["title"], ''), ) for item in body.get("navigation", []) ) - link_items = "".join( - ( - '' - '{rel}' - '{type}' - "" - ).format( - href=escape(link["href"]), - rel=escape(link["rel"]), - type=escape(link["type"]), - ) - for link in body["links"] - ) return f""" @@ -365,148 +361,196 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: {escape(body["title"])} @@ -515,12 +559,15 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str:
OGC API

{escape(body["title"])}

-

{escape(body["description"])}

- -
-

API Links

- +

{escape(body["description"])}

+ + + +
+

🌍 DHIS2 Earth Observation API • GitHub

+
""" From 49d5c1c1f7c04792ba2dca30160c84a068b66b24 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Thu, 19 Mar 2026 20:49:19 +0100 Subject: [PATCH 10/15] feat: harden workflow wiring and schedule contracts --- data/workflows/dhis2_datavalue_set.yaml | 37 +++- ...alue_set_without_temporal_aggregation.yaml | 27 ++- src/eo_api/components/services.py | 4 +- src/eo_api/ogc/routes.py | 99 ++++++---- src/eo_api/shared/api_errors.py | 3 + src/eo_api/workflows/routes.py | 87 ++++++++ src/eo_api/workflows/schemas.py | 58 ++++++ src/eo_api/workflows/services/definitions.py | 100 +++++++++- src/eo_api/workflows/services/engine.py | 173 +++++++++------- src/eo_api/workflows/services/job_store.py | 19 ++ src/eo_api/workflows/services/schedules.py | 186 ++++++++++++++++++ tests/test_workflows.py | 87 +++++++- 12 files changed, 751 insertions(+), 129 deletions(-) create mode 100644 src/eo_api/workflows/services/schedules.py diff --git a/data/workflows/dhis2_datavalue_set.yaml b/data/workflows/dhis2_datavalue_set.yaml index da50d5d..7ca4e88 100644 --- a/data/workflows/dhis2_datavalue_set.yaml +++ b/data/workflows/dhis2_datavalue_set.yaml @@ -6,13 +6,40 @@ publication: intent: feature_collection exposure: ogc steps: - - component: feature_source + - id: get_features + component: feature_source version: v1 - - component: download_dataset + - id: download + component: download_dataset version: v1 - - component: temporal_aggregation + inputs: + bbox: + from_step: get_features + output: bbox + - id: temporal_agg + component: temporal_aggregation version: v1 - - component: spatial_aggregation + inputs: + bbox: + from_step: get_features + output: bbox + - id: spatial_agg + component: spatial_aggregation version: v1 - - component: build_datavalueset + inputs: + bbox: + from_step: get_features + output: bbox + features: + from_step: get_features + output: features + temporal_dataset: + from_step: temporal_agg + output: temporal_dataset + - id: build_dhis2_payload + component: 
build_datavalueset version: v1 + inputs: + records: + from_step: spatial_agg + output: records diff --git a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml index 9beb8e0..ccff28d 100644 --- a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml +++ b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml @@ -4,11 +4,30 @@ publication: publishable: false exposure: registry_only steps: - - component: feature_source + - id: get_features + component: feature_source version: v1 - - component: download_dataset + - id: download + component: download_dataset version: v1 - - component: spatial_aggregation + inputs: + bbox: + from_step: get_features + output: bbox + - id: spatial_agg + component: spatial_aggregation version: v1 - - component: build_datavalueset + inputs: + bbox: + from_step: get_features + output: bbox + features: + from_step: get_features + output: features + - id: build_dhis2_payload + component: build_datavalueset version: v1 + inputs: + records: + from_step: spatial_agg + output: records diff --git a/src/eo_api/components/services.py b/src/eo_api/components/services.py index 58ad6df..910f632 100644 --- a/src/eo_api/components/services.py +++ b/src/eo_api/components/services.py @@ -104,7 +104,7 @@ version="v1", description="Aggregate dataset over time dimension.", inputs=["dataset_id", "start", "end", "target_period_type", "method", "bbox"], - outputs=["dataset"], + outputs=["temporal_dataset"], input_schema={ "type": "object", "properties": { @@ -128,7 +128,7 @@ }, "additionalProperties": False, }, - output_schema={"type": "object", "properties": {"dataset": {"type": "object"}}}, + output_schema={"type": "object", "properties": {"temporal_dataset": {"type": "object"}}}, error_codes=_ERROR_CODES_V1, endpoint=ComponentEndpoint(path="/components/temporal-aggregation", method="POST"), ), diff --git a/src/eo_api/ogc/routes.py 
b/src/eo_api/ogc/routes.py index 2cf1d81..67eb9ec 100644 --- a/src/eo_api/ogc/routes.py +++ b/src/eo_api/ogc/routes.py @@ -329,14 +329,35 @@ def _wants_html(request: Request, f: str | None) -> bool: def _render_ogc_root_html(body: dict[str, Any]) -> str: - # Map icon SVGs to navigation items by title - icons_map = { - "Browse Collections": '', - "List Processes": '', - "List Jobs": '', - "Conformance": '', + # Map icon SVGs to navigation items by title # noqa: E501 + icons_map = { # noqa: E501 + "Browse Collections": ( # noqa: E501 + '' + ), + "List Processes": ( # noqa: E501 + '' + "" + ), + "List Jobs": ( # noqa: E501 + '' + '' + ), + "Conformance": ( # noqa: E501 + '' + ), } - + nav_cards = "".join( ( '' @@ -349,7 +370,15 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: href=escape(item["href"]), title=escape(item["title"]), description=escape(item["description"]), - icon=icons_map.get(item["title"], ''), + icon=icons_map.get( + item["title"], + ( # noqa: E501 + '' + '' + "" + ), + ), ) for item in body.get("navigation", []) ) @@ -377,26 +406,28 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: --shadow-md: 0 10px 15px rgba(0, 0, 0, 0.1); --shadow-lg: 0 20px 25px rgba(0, 0, 0, 0.1); }} - + * {{ box-sizing: border-box; }} - + body {{ margin: 0; padding: 0; - font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Roboto", "Oxygen", "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", "Helvetica Neue", sans-serif; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Roboto", + "Oxygen", "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", + "Helvetica Neue", sans-serif; -webkit-font-smoothing: antialiased; -moz-osx-font-smoothing: grayscale; color: var(--text); background: linear-gradient(135deg, var(--bg-light) 0%, var(--bg) 100%); min-height: 100vh; }} - + main {{ max-width: 1200px; margin: 0 auto; padding: 60px 24px; }} - + .eyebrow {{ display: inline-block; padding: 8px 14px; @@ -409,7 +440,7 @@ def _render_ogc_root_html(body: 
dict[str, Any]) -> str: text-transform: uppercase; margin-bottom: 16px; }} - + h1 {{ margin: 0 0 12px; font-size: clamp(2.4rem, 6vw, 3.6rem); @@ -421,7 +452,7 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: -webkit-text-fill-color: transparent; background-clip: text; }} - + .subtitle {{ max-width: 720px; margin: 0 0 48px; @@ -430,7 +461,7 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: line-height: 1.6; font-weight: 400; }} - + /* Navigation Grid */ .nav-grid {{ display: grid; @@ -438,7 +469,7 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: gap: 20px; margin-bottom: 56px; }} - + .nav-card {{ position: relative; display: flex; @@ -453,7 +484,7 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: color: inherit; overflow: hidden; }} - + .nav-card::before {{ content: ''; position: absolute; @@ -466,26 +497,26 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: transform-origin: left; transition: transform 280ms ease; }} - + .nav-card:hover {{ border-color: var(--primary); box-shadow: var(--shadow-md); transform: translateY(-6px); }} - + .nav-card:hover::before {{ transform: scaleX(1); }} - + .nav-card:hover .card-icon {{ color: var(--primary); transform: scale(1.1) rotate(5deg); }} - + .nav-card:hover .card-arrow {{ transform: translateX(4px); }} - + .card-icon-wrapper {{ margin-bottom: 16px; display: flex; @@ -496,7 +527,7 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: border-radius: 12px; background: linear-gradient(135deg, rgba(25, 118, 210, 0.1) 0%, rgba(79, 195, 247, 0.05) 100%); }} - + .card-icon {{ width: 28px; height: 28px; @@ -504,14 +535,14 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: flex-shrink: 0; transition: all 280ms ease; }} - + .card-title {{ margin: 0 0 8px; font-size: 1.125rem; font-weight: 700; line-height: 1.3; }} - + .card-desc {{ flex-grow: 1; margin: 0 0 16px; @@ -519,7 +550,7 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: font-size: 0.875rem; line-height: 1.5; 
}} - + .card-arrow {{ color: var(--primary); font-weight: 700; @@ -527,28 +558,28 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str: transition: transform 280ms ease; margin-top: auto; }} - + /* Responsive */ @media (max-width: 768px) {{ main {{ padding: 40px 16px; }} - + h1 {{ font-size: clamp(2rem, 5vw, 2.8rem); }} - + .nav-grid {{ grid-template-columns: 1fr; }} }} - + /* Print styles */ @media print {{ body {{ background: white; }} - + .nav-card, .link-row {{ box-shadow: none; }} @@ -560,11 +591,11 @@ def _render_ogc_root_html(body: dict[str, Any]) -> str:
OGC API

{escape(body["title"])}

{escape(body["description"])}

- + - +
diff --git a/src/eo_api/shared/api_errors.py b/src/eo_api/shared/api_errors.py index 544f5e4..cde2d7d 100644 --- a/src/eo_api/shared/api_errors.py +++ b/src/eo_api/shared/api_errors.py @@ -18,6 +18,7 @@ class ApiErrorResponse(BaseModel): process_id: str | None = None job_id: str | None = None run_id: str | None = None + schedule_id: str | None = None status: str | None = None failed_component: str | None = None failed_component_version: str | None = None @@ -32,6 +33,7 @@ def api_error( process_id: str | None = None, job_id: str | None = None, run_id: str | None = None, + schedule_id: str | None = None, status: str | None = None, failed_component: str | None = None, failed_component_version: str | None = None, @@ -45,6 +47,7 @@ def api_error( process_id=process_id, job_id=job_id, run_id=run_id, + schedule_id=schedule_id, status=status, failed_component=failed_component, failed_component_version=failed_component_version, diff --git a/src/eo_api/workflows/routes.py b/src/eo_api/workflows/routes.py index 55a0cd4..f03c260 100644 --- a/src/eo_api/workflows/routes.py +++ b/src/eo_api/workflows/routes.py @@ -17,6 +17,10 @@ WorkflowJobListResponse, WorkflowJobRecord, WorkflowJobStatus, + WorkflowSchedule, + WorkflowScheduleCreateRequest, + WorkflowScheduleTriggerRequest, + WorkflowScheduleTriggerResponse, WorkflowValidateRequest, WorkflowValidateResponse, WorkflowValidateStep, @@ -24,6 +28,7 @@ from .services.definitions import list_workflow_definitions, load_workflow_definition from .services.engine import execute_workflow, validate_workflow_steps from .services.job_store import cleanup_jobs, delete_job, get_job, get_job_result, get_job_trace, list_jobs +from .services.schedules import create_schedule, delete_schedule, get_schedule, list_schedules, trigger_schedule from .services.simple_mapper import normalize_simple_request router = APIRouter() @@ -209,6 +214,88 @@ def cleanup_workflow_jobs( return WorkflowJobCleanupResponse.model_validate(result) 
+@router.post("/schedules", response_model=WorkflowSchedule) +def create_workflow_schedule(payload: WorkflowScheduleCreateRequest) -> WorkflowSchedule: + """Create a recurring workflow schedule contract.""" + try: + return create_schedule(payload) + except ValueError as exc: + raise HTTPException( + status_code=422, + detail=api_error( + error="schedule_invalid", + error_code="SCHEDULE_INVALID", + message=str(exc), + ), + ) from exc + + +@router.get("/schedules", response_model=list[WorkflowSchedule]) +def list_workflow_schedules(workflow_id: str | None = None) -> list[WorkflowSchedule]: + """List persisted workflow schedules.""" + return list_schedules(workflow_id=workflow_id) + + +@router.get("/schedules/{schedule_id}", response_model=WorkflowSchedule) +def get_workflow_schedule(schedule_id: str) -> WorkflowSchedule: + """Fetch one persisted workflow schedule.""" + schedule = get_schedule(schedule_id) + if schedule is None: + raise HTTPException( + status_code=404, + detail=api_error( + error="schedule_not_found", + error_code="SCHEDULE_NOT_FOUND", + message=f"Unknown schedule_id '{schedule_id}'", + schedule_id=schedule_id, + ), + ) + return schedule + + +@router.delete("/schedules/{schedule_id}", status_code=204) +def delete_workflow_schedule(schedule_id: str) -> None: + """Delete one persisted workflow schedule.""" + deleted = delete_schedule(schedule_id) + if deleted is None: + raise HTTPException( + status_code=404, + detail=api_error( + error="schedule_not_found", + error_code="SCHEDULE_NOT_FOUND", + message=f"Unknown schedule_id '{schedule_id}'", + schedule_id=schedule_id, + ), + ) + + +@router.post("/schedules/{schedule_id}/trigger", response_model=WorkflowScheduleTriggerResponse) +def trigger_workflow_schedule( + schedule_id: str, + payload: WorkflowScheduleTriggerRequest | None = None, +) -> WorkflowScheduleTriggerResponse: + """Trigger one persisted schedule immediately.""" + try: + trigger_response, _result = trigger_schedule( + 
schedule_id=schedule_id, + execution_time=(payload.execution_time if payload is not None else None), + ) + except ValueError as exc: + message = str(exc) + error_code = "SCHEDULE_NOT_FOUND" if "Unknown schedule_id" in message else "SCHEDULE_TRIGGER_INVALID" + status_code = 404 if error_code == "SCHEDULE_NOT_FOUND" else 422 + raise HTTPException( + status_code=status_code, + detail=api_error( + error="schedule_trigger_failed" if status_code == 422 else "schedule_not_found", + error_code=error_code, + message=message, + schedule_id=schedule_id, + ), + ) from exc + return trigger_response + + @router.post("/dhis2-datavalue-set", response_model=WorkflowExecuteResponse) def run_dhis2_datavalue_set_workflow(payload: WorkflowExecuteEnvelopeRequest) -> WorkflowExecuteResponse: """Run workflow from a single flat request payload.""" diff --git a/src/eo_api/workflows/schemas.py b/src/eo_api/workflows/schemas.py index 6d4f27a..cfd63fe 100644 --- a/src/eo_api/workflows/schemas.py +++ b/src/eo_api/workflows/schemas.py @@ -139,9 +139,11 @@ class WorkflowJobStatus(StrEnum): class WorkflowJobOrchestrationStep(BaseModel): """Compact summary of one workflow step.""" + id: str component: str version: str execution_mode: str | None = None + inputs: dict[str, dict[str, str]] = Field(default_factory=dict) class WorkflowJobOrchestration(BaseModel): @@ -172,6 +174,9 @@ class WorkflowJobRecord(BaseModel): error_code: str | None = None failed_component: str | None = None failed_component_version: str | None = None + trigger_type: str = "on_demand" + schedule_id: str | None = None + idempotency_key: str | None = None links: list[dict[str, Any]] = Field(default_factory=list) @@ -298,9 +303,62 @@ class WorkflowValidateStep(BaseModel): """Resolved workflow step metadata from validation.""" index: int + id: str | None = None component: str version: str resolved_config: dict[str, Any] + resolved_inputs: dict[str, dict[str, str]] = Field(default_factory=dict) + + +class 
JobRetentionPolicy(BaseModel): + """Retention policy metadata for scheduled runs.""" + + keep_latest: int | None = Field(default=None, ge=0) + older_than_hours: int | None = Field(default=None, ge=0) + automatic_cleanup: bool = True + + +class WorkflowSchedule(BaseModel): + """Recurring workflow execution contract.""" + + schedule_id: str + workflow_id: str + cron_expression: str + request: WorkflowRequest + enabled: bool = True + idempotency_key_template: str = "{workflow_id}:{schedule_id}:{date}" + retention_policy: JobRetentionPolicy = Field(default_factory=JobRetentionPolicy) + created_at: str + updated_at: str + last_triggered_at: str | None = None + + +class WorkflowScheduleCreateRequest(BaseModel): + """Create a recurring workflow execution schedule.""" + + workflow_id: str | None = None + cron_expression: str + request: WorkflowRequest + enabled: bool = True + idempotency_key_template: str = "{workflow_id}:{schedule_id}:{date}" + retention_policy: JobRetentionPolicy = Field(default_factory=JobRetentionPolicy) + + +class WorkflowScheduleTriggerRequest(BaseModel): + """Trigger one schedule execution.""" + + execution_time: str | None = None + + +class WorkflowScheduleTriggerResponse(BaseModel): + """Result of triggering a schedule execution.""" + + schedule_id: str + workflow_id: str + job_id: str + status: WorkflowJobStatus + idempotency_key: str + reused_existing_job: bool = False class WorkflowValidateResponse(BaseModel): diff --git a/src/eo_api/workflows/services/definitions.py b/src/eo_api/workflows/services/definitions.py index 56ff715..e51de0b 100644 --- a/src/eo_api/workflows/services/definitions.py +++ b/src/eo_api/workflows/services/definitions.py @@ -29,9 +29,17 @@ "build_datavalueset": {"records"}, } +COMPONENT_OPTIONAL_INPUTS: Final[dict[str, set[str]]] = { + "feature_source": set(), + "download_dataset": set(), + "temporal_aggregation": set(), + "spatial_aggregation": {"temporal_dataset"}, + "build_datavalueset": set(), +} + COMPONENT_OUTPUTS: 
Final[dict[str, set[str]]] = { "feature_source": {"features", "bbox"}, - "download_dataset": set(), + "download_dataset": {"status"}, "temporal_aggregation": {"temporal_dataset"}, "spatial_aggregation": {"records"}, "build_datavalueset": {"data_value_set", "output_file"}, @@ -45,9 +53,11 @@ class WorkflowStep(BaseModel): """One component step in a declarative workflow definition.""" + id: str | None = None component: ComponentName version: str = "v1" config: dict[str, Any] = Field(default_factory=dict) + inputs: dict[str, "WorkflowStepInput"] = Field(default_factory=dict) @model_validator(mode="after") def validate_component_version(self) -> "WorkflowStep": @@ -61,6 +71,13 @@ def validate_component_version(self) -> "WorkflowStep": return self +class WorkflowStepInput(BaseModel): + """Reference one named output from a prior workflow step.""" + + from_step: str + output: str = Field(validation_alias=AliasChoices("output", "output_key")) + + class WorkflowPublicationPolicy(BaseModel): """Publication policy for workflow outputs.""" @@ -101,16 +118,26 @@ def validate_steps(self) -> "WorkflowDefinition": """Require terminal DataValueSet step and validate component compatibility.""" if not self.steps: raise ValueError("Workflow steps cannot be empty") + _assign_step_ids(self.steps) if self.steps[-1].component != "build_datavalueset": raise ValueError("The last workflow step must be 'build_datavalueset'") - available_context: set[str] = set() + available_outputs: dict[str, set[str]] = {} + latest_producer_for_output: dict[str, str] = {} for step in self.steps: - required_inputs = COMPONENT_INPUTS[step.component] - missing_inputs = required_inputs - available_context - if missing_inputs: - missing = ", ".join(sorted(missing_inputs)) - raise ValueError(f"Component '{step.component}' is missing required upstream outputs: {missing}") - available_context.update(COMPONENT_OUTPUTS[step.component]) + if step.id is None: + raise ValueError(f"Workflow step '{step.component}' is 
missing an id") + + resolved_inputs = _normalize_step_inputs( + step=step, + available_outputs=available_outputs, + latest_producer_for_output=latest_producer_for_output, + ) + step.inputs = resolved_inputs + + outputs = COMPONENT_OUTPUTS[step.component] + available_outputs[step.id] = outputs + for output_name in outputs: + latest_producer_for_output[output_name] = step.id return self @@ -174,3 +201,60 @@ def _discover_workflow_files() -> dict[str, Path]: discovered[workflow_id] = workflow_file return discovered + + +def _assign_step_ids(steps: list[WorkflowStep]) -> None: + seen_ids: set[str] = set() + component_counts: dict[str, int] = {} + for step in steps: + if step.id is None: + count = component_counts.get(step.component, 0) + 1 + component_counts[step.component] = count + step.id = step.component if count == 1 else f"{step.component}_{count}" + if step.id in seen_ids: + raise ValueError(f"Duplicate workflow step id '{step.id}'") + seen_ids.add(step.id) + + +def _normalize_step_inputs( + *, + step: WorkflowStep, + available_outputs: dict[str, set[str]], + latest_producer_for_output: dict[str, str], +) -> dict[str, WorkflowStepInput]: + declared_inputs = dict(step.inputs) + required_inputs = COMPONENT_INPUTS[step.component] + optional_inputs = COMPONENT_OPTIONAL_INPUTS.get(step.component, set()) + + if not declared_inputs: + for input_name in sorted(required_inputs | optional_inputs): + producer = latest_producer_for_output.get(input_name) + if producer is None: + continue + declared_inputs[input_name] = WorkflowStepInput(from_step=producer, output=input_name) + + missing_required = required_inputs - set(declared_inputs) + if missing_required: + missing = ", ".join(sorted(missing_required)) + raise ValueError(f"Component '{step.component}' is missing required upstream outputs: {missing}") + + allowed_inputs = required_inputs | optional_inputs + unexpected_inputs = set(declared_inputs) - allowed_inputs + if unexpected_inputs: + unexpected = ", 
".join(sorted(unexpected_inputs)) + raise ValueError(f"Component '{step.component}' declares unsupported inputs: {unexpected}") + + for input_name, ref in declared_inputs.items(): + available_for_step = available_outputs.get(ref.from_step) + if available_for_step is None: + raise ValueError( + f"Component '{step.component}' references unknown upstream " + f"step '{ref.from_step}' for input '{input_name}'" + ) + if ref.output not in available_for_step: + raise ValueError( + f"Component '{step.component}' input '{input_name}' references " + f"missing output '{ref.output}' from step '{ref.from_step}'" + ) + + return declared_inputs diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py index 1453f45..d57bc48 100644 --- a/src/eo_api/workflows/services/engine.py +++ b/src/eo_api/workflows/services/engine.py @@ -5,7 +5,7 @@ import os import time from collections.abc import Callable -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from typing import Any, Literal @@ -18,7 +18,7 @@ from ...publications.services import register_workflow_output_publication from ...shared.api_errors import api_error from ..schemas import WorkflowExecuteRequest, WorkflowExecuteResponse, WorkflowJobStatus -from .definitions import WorkflowDefinition, WorkflowPublicationPolicy, load_workflow_definition +from .definitions import WorkflowDefinition, WorkflowPublicationPolicy, WorkflowStep, load_workflow_definition from .job_store import initialize_job, mark_job_failed, mark_job_running, mark_job_success from .publication_assets import build_feature_collection_asset from .run_logs import persist_run_log @@ -45,40 +45,26 @@ def __init__( @dataclass -class WorkflowArtifacts: - """Typed workflow artifact handoff between components.""" - - features: dict[str, Any] | None = None - bbox: list[float] | None = None - temporal_dataset: Any | None = None - records: list[dict[str, Any]] | None = None - data_value_set: 
dict[str, Any] | None = None - output_file: str | None = None - - def require_features(self) -> dict[str, Any]: - if self.features is None: - raise RuntimeError("Workflow definition missing prerequisite for 'features'") - return self.features - - def require_bbox(self) -> list[float]: - if self.bbox is None: - raise RuntimeError("Workflow definition missing prerequisite for 'bbox'") - return self.bbox - - def require_records(self) -> list[dict[str, Any]]: - if self.records is None: - raise RuntimeError("Workflow definition missing prerequisite for 'records'") - return self.records - - def require_data_value_set(self) -> dict[str, Any]: - if self.data_value_set is None: - raise RuntimeError("Workflow definition missing prerequisite for 'data_value_set'") - return self.data_value_set - - def require_output_file(self) -> str: - if self.output_file is None: - raise RuntimeError("Workflow definition missing prerequisite for 'output_file'") - return self.output_file +class WorkflowExecutionContext: + """Step-scoped workflow outputs and compatibility lookup helpers.""" + + step_outputs: dict[str, dict[str, Any]] = field(default_factory=dict) + latest_outputs: dict[str, Any] = field(default_factory=dict) + + def set_step_outputs(self, step_id: str, outputs: dict[str, Any]) -> None: + self.step_outputs[step_id] = outputs + self.latest_outputs.update(outputs) + + def get_step_output(self, *, step_id: str, output_name: str) -> Any: + outputs = self.step_outputs.get(step_id) + if outputs is None or output_name not in outputs: + raise RuntimeError(f"Workflow definition missing prerequisite for '{step_id}.{output_name}'") + return outputs[output_name] + + def require_output(self, output_name: str) -> Any: + if output_name not in self.latest_outputs: + raise RuntimeError(f"Workflow definition missing prerequisite for '{output_name}'") + return self.latest_outputs[output_name] def execute_workflow( @@ -90,6 +76,9 @@ def execute_workflow( include_component_run_details: bool = 
False, run_id: str | None = None, workflow_definition_source: Literal["catalog", "inline"] = "catalog", + trigger_type: str = "on_demand", + schedule_id: str | None = None, + idempotency_key: str | None = None, ) -> WorkflowExecuteResponse: """Execute the feature->download->aggregate->DataValueSet workflow.""" runtime = WorkflowRuntime(run_id=run_id) @@ -107,7 +96,7 @@ def execute_workflow( ), ) - artifacts = WorkflowArtifacts() + context = WorkflowExecutionContext() try: if workflow_definition is not None: @@ -134,6 +123,9 @@ def execute_workflow( workflow_id=workflow.workflow_id, workflow_version=workflow.version, status=WorkflowJobStatus.RUNNING, + trigger_type=trigger_type, + schedule_id=schedule_id, + idempotency_key=idempotency_key, ) mark_job_running(runtime.run_id) _execute_workflow_steps( @@ -142,12 +134,12 @@ def execute_workflow( request=request, request_params=request_params, dataset=dataset, - artifacts=artifacts, + context=context, ) - features = artifacts.require_features() - bbox = artifacts.require_bbox() - data_value_set = artifacts.require_data_value_set() - output_file = artifacts.require_output_file() + features = context.require_output("features") + bbox = context.require_output("bbox") + data_value_set = context.require_output("data_value_set") + output_file = context.require_output("output_file") run_log_file = persist_run_log( run_id=runtime.run_id, request=request, @@ -182,7 +174,7 @@ def execute_workflow( response=response, request=request, publication=workflow.publication, - artifacts=artifacts, + context=context, ) register_workflow_output_publication( response=response, @@ -295,12 +287,12 @@ def _build_publication_artifact( response: WorkflowExecuteResponse, request: WorkflowExecuteRequest, publication: WorkflowPublicationPolicy, - artifacts: WorkflowArtifacts, + context: WorkflowExecutionContext, ) -> tuple[str, str]: """Build the publication-facing artifact for a publishable workflow output.""" if publication.intent.value == 
"feature_collection": - features = artifacts.require_features() - records = artifacts.require_records() + features = context.require_output("features") + records = context.require_output("records") path = build_feature_collection_asset( dataset_id=response.dataset_id, features=features, @@ -333,7 +325,7 @@ def _execute_workflow_steps( request: WorkflowExecuteRequest, request_params: dict[str, Any] | None, dataset: dict[str, Any], - artifacts: WorkflowArtifacts, + context: WorkflowExecutionContext, ) -> None: """Execute workflow components using declarative YAML step order.""" executors: dict[str, StepExecutor] = { @@ -345,6 +337,14 @@ def _execute_workflow_steps( } for step in workflow.steps: + if step.id is None: + raise WorkflowComponentError( + error_code="INPUT_VALIDATION_FAILED", + message=f"Workflow step '{step.component}' is missing an id", + component=step.component, + component_version=step.version, + status_code=422, + ) executor = executors.get(step.component) if executor is None: raise WorkflowComponentError( @@ -367,11 +367,13 @@ def _execute_workflow_steps( ) from exc try: + resolved_inputs = _resolve_step_inputs(step=step, context=context) updates = executor( + step=step, runtime=runtime, request=request, dataset=dataset, - artifacts=artifacts, + resolved_inputs=resolved_inputs, step_config=step_config, ) except Exception as exc: @@ -391,7 +393,8 @@ def _execute_workflow_steps( status_code=500, ) from exc - _apply_artifact_updates(artifacts, updates) + _validate_step_outputs(step=step, outputs=updates) + context.set_step_outputs(step.id, updates) def validate_workflow_steps( @@ -411,9 +414,14 @@ def validate_workflow_steps( resolved_steps.append( { "index": index + 1, + "id": step.id, "component": step.component, "version": step.version, "resolved_config": resolved_config, + "resolved_inputs": { + input_name: {"from_step": ref.from_step, "output": ref.output} + for input_name, ref in step.inputs.items() + }, } ) return resolved_steps @@ -424,13 
+432,14 @@ def validate_workflow_steps( def _run_feature_source( *, + step: WorkflowStep, runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - artifacts: WorkflowArtifacts, + resolved_inputs: dict[str, Any], step_config: dict[str, Any], ) -> dict[str, Any]: - del dataset, artifacts + del dataset, resolved_inputs, step execution_mode = str(step_config.get("execution_mode", "local")).lower() if execution_mode == "remote": features, bbox = runtime.run( @@ -453,10 +462,11 @@ def _run_feature_source( def _run_download_dataset( *, + step: WorkflowStep, runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - artifacts: WorkflowArtifacts, + resolved_inputs: dict[str, Any], step_config: dict[str, Any], ) -> dict[str, Any]: execution_mode = str(step_config.get("execution_mode", "local")).lower() @@ -465,7 +475,7 @@ def _run_download_dataset( overwrite = request.overwrite country_code = request.country_code - bbox = artifacts.require_bbox() + bbox = resolved_inputs["bbox"] if execution_mode == "remote": remote_url = step_config.get("remote_url") if not isinstance(remote_url, str) or not remote_url: @@ -498,17 +508,19 @@ def _run_download_dataset( country_code=country_code, bbox=bbox, ) - return {} + return {"status": "downloaded"} def _run_temporal_aggregation( *, + step: WorkflowStep, runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - artifacts: WorkflowArtifacts, + resolved_inputs: dict[str, Any], step_config: dict[str, Any], ) -> dict[str, Any]: + del step target_period_type = request.temporal_aggregation.target_period_type method = request.temporal_aggregation.method execution_mode = str(step_config.get("execution_mode", "local")).lower() @@ -520,7 +532,7 @@ def _run_temporal_aggregation( dataset_id=request.dataset_id, start=request.start, end=request.end, - bbox=artifacts.require_bbox(), + bbox=resolved_inputs["bbox"], target_period_type=target_period_type.value, 
method=method.value, timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), @@ -534,7 +546,7 @@ def _run_temporal_aggregation( dataset=dataset, start=request.start, end=request.end, - bbox=artifacts.require_bbox(), + bbox=resolved_inputs["bbox"], target_period_type=target_period_type, method=method, ) @@ -543,16 +555,18 @@ def _run_temporal_aggregation( def _run_spatial_aggregation( *, + step: WorkflowStep, runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - artifacts: WorkflowArtifacts, + resolved_inputs: dict[str, Any], step_config: dict[str, Any], ) -> dict[str, Any]: + del step method = request.spatial_aggregation.method feature_id_property = request.dhis2.org_unit_property execution_mode = str(step_config.get("execution_mode", "local")).lower() - temporal_dataset = artifacts.temporal_dataset + temporal_dataset = resolved_inputs.get("temporal_dataset") if execution_mode == "remote": if temporal_dataset is not None: raise ValueError( @@ -566,7 +580,7 @@ def _run_spatial_aggregation( dataset_id=request.dataset_id, start=request.start, end=request.end, - bbox=artifacts.require_bbox(), + bbox=resolved_inputs["bbox"], feature_source=request.feature_source.model_dump(mode="json"), method=method.value, feature_id_property=feature_id_property, @@ -581,8 +595,8 @@ def _run_spatial_aggregation( dataset=dataset, start=request.start, end=request.end, - bbox=artifacts.require_bbox(), - features=artifacts.require_features(), + bbox=resolved_inputs["bbox"], + features=resolved_inputs["features"], method=method, feature_id_property=feature_id_property, aggregated_dataset=temporal_dataset, @@ -592,13 +606,14 @@ def _run_spatial_aggregation( def _run_build_datavalueset( *, + step: WorkflowStep, runtime: WorkflowRuntime, request: WorkflowExecuteRequest, dataset: dict[str, Any], - artifacts: WorkflowArtifacts, + resolved_inputs: dict[str, Any], step_config: dict[str, Any], ) -> dict[str, Any]: - del dataset + del dataset, step period_type 
= request.temporal_aggregation.target_period_type execution_mode = str(step_config.get("execution_mode", "local")).lower() if execution_mode == "remote": @@ -608,7 +623,7 @@ def _run_build_datavalueset( remote_url=str(step_config["remote_url"]), dataset_id=request.dataset_id, period_type=period_type.value, - records=artifacts.require_records(), + records=resolved_inputs["records"], dhis2=request.dhis2.model_dump(mode="json"), timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), retries=int(step_config.get("remote_retries", 1)), @@ -618,7 +633,7 @@ def _run_build_datavalueset( data_value_set, output_file = runtime.run( "build_datavalueset", component_services.build_datavalueset_component, - records=artifacts.require_records(), + records=resolved_inputs["records"], dataset_id=request.dataset_id, period_type=period_type, dhis2=request.dhis2, @@ -626,12 +641,26 @@ def _run_build_datavalueset( return {"data_value_set": data_value_set, "output_file": output_file} -def _apply_artifact_updates(artifacts: WorkflowArtifacts, updates: dict[str, Any]) -> None: - """Apply validated component outputs to the typed artifact handoff.""" - for key, value in updates.items(): - if not hasattr(artifacts, key): - raise RuntimeError(f"Unsupported workflow artifact '{key}'") - setattr(artifacts, key, value) +def _resolve_step_inputs(step: WorkflowStep, context: WorkflowExecutionContext) -> dict[str, Any]: + """Resolve one step's declared upstream references into concrete values.""" + resolved: dict[str, Any] = {} + for input_name, ref in step.inputs.items(): + resolved[input_name] = context.get_step_output(step_id=ref.from_step, output_name=ref.output) + return resolved + + +def _validate_step_outputs(*, step: WorkflowStep, outputs: dict[str, Any]) -> None: + """Ensure a step only emits its declared outputs and required outputs are present.""" + declared_outputs = set(component_services.component_registry()[f"{step.component}@{step.version}"].outputs) + internal_outputs = 
set(outputs) + unexpected_outputs = internal_outputs - declared_outputs + if unexpected_outputs: + unexpected = ", ".join(sorted(unexpected_outputs)) + raise RuntimeError(f"Component '{step.component}' emitted undeclared outputs: {unexpected}") + missing_outputs = declared_outputs - internal_outputs + if missing_outputs: + missing = ", ".join(sorted(missing_outputs)) + raise RuntimeError(f"Component '{step.component}' did not emit declared outputs: {missing}") def _resolve_step_config(config: dict[str, Any], request_params: dict[str, Any]) -> dict[str, Any]: diff --git a/src/eo_api/workflows/services/job_store.py b/src/eo_api/workflows/services/job_store.py index 9ebc6f4..8408297 100644 --- a/src/eo_api/workflows/services/job_store.py +++ b/src/eo_api/workflows/services/job_store.py @@ -35,6 +35,9 @@ def initialize_job( workflow_version: int, status: WorkflowJobStatus = WorkflowJobStatus.RUNNING, process_id: str = _DEFAULT_PROCESS_ID, + trigger_type: str = "on_demand", + schedule_id: str | None = None, + idempotency_key: str | None = None, ) -> WorkflowJobRecord: """Create or replace a persisted job record.""" existing = get_stored_job(job_id) @@ -61,6 +64,9 @@ def initialize_job( error_code=existing.error_code if existing is not None else None, failed_component=existing.failed_component if existing is not None else None, failed_component_version=existing.failed_component_version if existing is not None else None, + trigger_type=trigger_type if existing is None else existing.trigger_type, + schedule_id=schedule_id if existing is None else existing.schedule_id, + idempotency_key=idempotency_key if existing is None else existing.idempotency_key, ) _write_job(record) return record @@ -175,6 +181,14 @@ def get_job_trace(job_id: str) -> dict[str, Any] | None: return cast(dict[str, Any], json.loads(path.read_text(encoding="utf-8"))) +def find_job_by_schedule_key(*, schedule_id: str, idempotency_key: str) -> WorkflowJobRecord | None: + """Return the newest job matching 
one schedule/idempotency pair.""" + for job in list_jobs(): + if job.schedule_id == schedule_id and job.idempotency_key == idempotency_key: + return job + return None + + def delete_job(job_id: str) -> dict[str, Any] | None: """Delete a job and cascade removal of run-owned derived artifacts.""" record = get_stored_job(job_id) @@ -313,9 +327,14 @@ def _build_orchestration_summary( components=[step.component for step in workflow.steps], steps=[ WorkflowJobOrchestrationStep( + id=step.id or step.component, component=step.component, version=step.version, execution_mode=cast(str | None, step.config.get("execution_mode")), + inputs={ + input_name: {"from_step": ref.from_step, "output": ref.output} + for input_name, ref in step.inputs.items() + }, ) for step in workflow.steps ], diff --git a/src/eo_api/workflows/services/schedules.py b/src/eo_api/workflows/services/schedules.py new file mode 100644 index 0000000..c5582ca --- /dev/null +++ b/src/eo_api/workflows/services/schedules.py @@ -0,0 +1,186 @@ +"""Disk-backed workflow schedule persistence and execution helpers.""" + +from __future__ import annotations + +import datetime as dt +import uuid +from pathlib import Path + +from ...data_manager.services.downloader import DOWNLOAD_DIR +from ..schemas import ( + WorkflowExecuteResponse, + WorkflowJobStatus, + WorkflowSchedule, + WorkflowScheduleCreateRequest, + WorkflowScheduleTriggerResponse, +) +from .definitions import load_workflow_definition +from .engine import execute_workflow +from .job_store import find_job_by_schedule_key +from .simple_mapper import normalize_simple_request + + +def create_schedule(payload: WorkflowScheduleCreateRequest) -> WorkflowSchedule: + """Persist one workflow schedule.""" + timestamp = _utc_now() + workflow_id = payload.workflow_id or payload.request.workflow_id + if payload.workflow_id is not None and payload.request.workflow_id != payload.workflow_id: + raise ValueError("workflow_id must match request.workflow_id when both are provided") 
+ schedule = WorkflowSchedule( + schedule_id=str(uuid.uuid4()), + workflow_id=workflow_id, + cron_expression=payload.cron_expression, + request=payload.request.model_copy(update={"workflow_id": workflow_id}), + enabled=payload.enabled, + idempotency_key_template=payload.idempotency_key_template, + retention_policy=payload.retention_policy, + created_at=timestamp, + updated_at=timestamp, + last_triggered_at=None, + ) + _validate_cron(schedule.cron_expression) + load_workflow_definition(schedule.workflow_id) + _write_schedule(schedule) + return schedule + + +def list_schedules(*, workflow_id: str | None = None) -> list[WorkflowSchedule]: + """List persisted schedules ordered by newest first.""" + schedules: list[WorkflowSchedule] = [] + for path in _schedules_dir().glob("*.json"): + schedules.append(WorkflowSchedule.model_validate_json(path.read_text(encoding="utf-8"))) + schedules.sort(key=lambda item: item.created_at, reverse=True) + if workflow_id is not None: + schedules = [item for item in schedules if item.workflow_id == workflow_id] + return schedules + + +def get_schedule(schedule_id: str) -> WorkflowSchedule | None: + """Fetch one persisted schedule.""" + path = _schedule_path(schedule_id) + if not path.exists(): + return None + return WorkflowSchedule.model_validate_json(path.read_text(encoding="utf-8")) + + +def delete_schedule(schedule_id: str) -> WorkflowSchedule | None: + """Delete one persisted schedule.""" + schedule = get_schedule(schedule_id) + if schedule is None: + return None + path = _schedule_path(schedule_id) + if path.exists(): + path.unlink() + return schedule + + +def trigger_schedule( + *, + schedule_id: str, + execution_time: str | None = None, +) -> tuple[WorkflowScheduleTriggerResponse, WorkflowExecuteResponse | None]: + """Execute one schedule immediately with idempotency protection.""" + schedule = get_schedule(schedule_id) + if schedule is None: + raise ValueError(f"Unknown schedule_id '{schedule_id}'") + if not schedule.enabled: + 
raise ValueError(f"Schedule '{schedule_id}' is disabled") + + trigger_time = _parse_execution_time(execution_time) + idempotency_key = _render_idempotency_key( + template=schedule.idempotency_key_template, + workflow_id=schedule.workflow_id, + schedule_id=schedule.schedule_id, + execution_time=trigger_time, + ) + existing_job = find_job_by_schedule_key(schedule_id=schedule.schedule_id, idempotency_key=idempotency_key) + if existing_job is not None: + return ( + WorkflowScheduleTriggerResponse( + schedule_id=schedule.schedule_id, + workflow_id=schedule.workflow_id, + job_id=existing_job.job_id, + status=existing_job.status, + idempotency_key=idempotency_key, + reused_existing_job=True, + ), + None, + ) + + request, _warnings = normalize_simple_request(schedule.request) + response = execute_workflow( + request, + workflow_id=schedule.workflow_id, + request_params=schedule.request.model_dump(exclude_none=True), + include_component_run_details=schedule.request.include_component_run_details, + run_id=str(uuid.uuid4()), + workflow_definition_source="catalog", + trigger_type="scheduled", + schedule_id=schedule.schedule_id, + idempotency_key=idempotency_key, + ) + updated_schedule = schedule.model_copy(update={"updated_at": _utc_now(), "last_triggered_at": _utc_now()}) + _write_schedule(updated_schedule) + return ( + WorkflowScheduleTriggerResponse( + schedule_id=schedule.schedule_id, + workflow_id=schedule.workflow_id, + job_id=response.run_id, + status=WorkflowJobStatus.SUCCESSFUL, + idempotency_key=idempotency_key, + reused_existing_job=False, + ), + response, + ) + + +def _write_schedule(schedule: WorkflowSchedule) -> None: + _schedules_dir().mkdir(parents=True, exist_ok=True) + _schedule_path(schedule.schedule_id).write_text(schedule.model_dump_json(indent=2), encoding="utf-8") + + +def _schedules_dir() -> Path: + return DOWNLOAD_DIR / "workflow_schedules" + + +def _schedule_path(schedule_id: str) -> Path: + return _schedules_dir() / f"{schedule_id}.json" + + +def 
_utc_now() -> str: + return dt.datetime.now(dt.timezone.utc).isoformat() + + +def _validate_cron(value: str) -> None: + parts = value.split() + if len(parts) != 5: + raise ValueError("cron_expression must have 5 space-separated fields") + + +def _parse_execution_time(value: str | None) -> dt.datetime: + if value is None: + return dt.datetime.now(dt.timezone.utc) + parsed = dt.datetime.fromisoformat(value.replace("Z", "+00:00")) + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=dt.timezone.utc) + return parsed.astimezone(dt.timezone.utc) + + +def _render_idempotency_key( + *, + template: str, + workflow_id: str, + schedule_id: str, + execution_time: dt.datetime, +) -> str: + values = { + "workflow_id": workflow_id, + "schedule_id": schedule_id, + "date": execution_time.strftime("%Y-%m-%d"), + "datetime": execution_time.strftime("%Y-%m-%dT%H:%M:%SZ"), + "hour": execution_time.strftime("%Y-%m-%dT%H"), + } + rendered = template + for key, value in values.items(): + rendered = rendered.replace(f"{{{key}}}", value) + return rendered diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 2e5ca7b..5df9d2e 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -122,6 +122,8 @@ def test_workflow_endpoint_exists_once() -> None: "/workflows/dhis2-datavalue-set", "/workflows/execute", "/workflows/jobs/cleanup", + "/workflows/schedules", + "/workflows/schedules/{schedule_id}/trigger", "/workflows/validate", } @@ -630,7 +632,12 @@ def test_workflow_job_endpoints_return_persisted_result( "build_datavalueset", ] assert job_body["orchestration"]["steps"][0]["component"] == "feature_source" + assert job_body["orchestration"]["steps"][0]["id"] == "get_features" assert job_body["orchestration"]["steps"][0]["version"] == "v1" + assert job_body["orchestration"]["steps"][1]["inputs"]["bbox"] == { + "from_step": "get_features", + "output": "bbox", + } links = {item["rel"]: item["href"] for item in job_body["links"]} assert 
links["self"].endswith(f"/workflows/jobs/{run_id}") assert links["result"].endswith(f"/workflows/jobs/{run_id}/result") @@ -1595,6 +1602,13 @@ def test_default_workflow_definition_has_expected_steps() -> None: workflow = load_workflow_definition() assert workflow.workflow_id == "dhis2_datavalue_set_v1" assert workflow.version == 1 + assert [step.id for step in workflow.steps] == [ + "get_features", + "download", + "temporal_agg", + "spatial_agg", + "build_dhis2_payload", + ] assert [step.component for step in workflow.steps] == [ "feature_source", "download_dataset", @@ -1628,10 +1642,25 @@ def test_engine_follows_declarative_workflow_order(monkeypatch: pytest.MonkeyPat "workflow_id": workflow_id, "version": 1, "steps": [ - {"component": "feature_source"}, - {"component": "download_dataset"}, - {"component": "spatial_aggregation"}, - {"component": "build_datavalueset"}, + {"id": "features", "component": "feature_source"}, + { + "id": "download", + "component": "download_dataset", + "inputs": {"bbox": {"from_step": "features", "output": "bbox"}}, + }, + { + "id": "aggregate", + "component": "spatial_aggregation", + "inputs": { + "bbox": {"from_step": "features", "output": "bbox"}, + "features": {"from_step": "features", "output": "features"}, + }, + }, + { + "id": "build", + "component": "build_datavalueset", + "inputs": {"records": {"from_step": "aggregate", "output": "records"}}, + }, ], } ), @@ -1674,6 +1703,56 @@ def test_engine_follows_declarative_workflow_order(monkeypatch: pytest.MonkeyPat ] +def test_validate_workflow_reports_explicit_input_wiring(client: TestClient) -> None: + response = client.post("/workflows/validate", json={"workflow_id": "dhis2_datavalue_set_v1"}) + assert response.status_code == 200 + body = response.json() + assert body["valid"] is True + assert body["resolved_steps"][0]["id"] == "get_features" + assert body["resolved_steps"][1]["resolved_inputs"]["bbox"] == { + "from_step": "get_features", + "output": "bbox", + } + assert 
body["resolved_steps"][3]["resolved_inputs"]["temporal_dataset"] == { + "from_step": "temporal_agg", + "output": "temporal_dataset", + } + + +def test_schedule_trigger_reuses_existing_job(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + _patch_successful_execution(monkeypatch) + + create_response = client.post( + "/workflows/schedules", + json={ + "cron_expression": "0 2 * * *", + "request": _valid_public_payload()["request"], + }, + ) + assert create_response.status_code == 200 + schedule_id = create_response.json()["schedule_id"] + + trigger_payload = {"execution_time": "2026-03-19T02:00:00Z"} + first_trigger = client.post(f"/workflows/schedules/{schedule_id}/trigger", json=trigger_payload) + assert first_trigger.status_code == 200 + first_body = first_trigger.json() + assert first_body["reused_existing_job"] is False + assert first_body["status"] == "successful" + + second_trigger = client.post(f"/workflows/schedules/{schedule_id}/trigger", json=trigger_payload) + assert second_trigger.status_code == 200 + second_body = second_trigger.json() + assert second_body["reused_existing_job"] is True + assert second_body["job_id"] == first_body["job_id"] + + job_response = client.get(f"/workflows/jobs/{first_body['job_id']}") + assert job_response.status_code == 200 + job_body = job_response.json() + assert job_body["trigger_type"] == "scheduled" + assert job_body["schedule_id"] == schedule_id + assert job_body["idempotency_key"] == first_body["idempotency_key"] + + def test_engine_rejects_unknown_workflow_id(monkeypatch: pytest.MonkeyPatch) -> None: request = WorkflowExecuteRequest.model_validate( { From bde7f9fa635e2679c3d0ef608897d6b123da6aeb Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Fri, 20 Mar 2026 12:14:02 +0100 Subject: [PATCH 11/15] feat: generalize workflow contracts and component manifests --- .gitignore | 3 +- data/workflows/dhis2_datavalue_set.yaml | 26 + ...alue_set_without_temporal_aggregation.yaml | 16 + pygeoapi-config.yml 
| 2 +- pyproject.toml | 1 + src/eo_api/analytics_viewer/routes.py | 6 +- src/eo_api/components/schemas.py | 52 +- src/eo_api/components/services.py | 671 ++++++++++++++++- src/eo_api/data_accessor/routes.py | 126 +++- src/eo_api/data_accessor/services/accessor.py | 221 +++++- src/eo_api/main.py | 25 +- src/eo_api/ogc/routes.py | 8 +- src/eo_api/publications/capabilities.py | 82 +++ src/eo_api/publications/pygeoapi.py | 26 +- .../collections/collection.html | 372 +++++++++- src/eo_api/publications/services.py | 143 +++- src/eo_api/raster/__init__.py | 3 + src/eo_api/raster/routes.py | 516 +++++++++++++ src/eo_api/startup.py | 33 +- src/eo_api/workflows/routes.py | 38 +- src/eo_api/workflows/schemas.py | 29 +- src/eo_api/workflows/services/definitions.py | 157 ++-- src/eo_api/workflows/services/engine.py | 675 ++++-------------- .../workflows/services/publication_assets.py | 19 +- tests/test_data_accessor.py | 166 +++++ tests/test_raster_routes.py | 180 +++++ tests/test_workflows.py | 505 +++++++++++-- uv.lock | 65 ++ 28 files changed, 3445 insertions(+), 721 deletions(-) create mode 100644 src/eo_api/publications/capabilities.py create mode 100644 src/eo_api/raster/__init__.py create mode 100644 src/eo_api/raster/routes.py create mode 100644 tests/test_data_accessor.py create mode 100644 tests/test_raster_routes.py diff --git a/.gitignore b/.gitignore index cae9b3e..225ce03 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ __pycache__/ .venv/ .env eo_api.egg-info/ -data/downloads \ No newline at end of file +data/downloads +docs/ diff --git a/data/workflows/dhis2_datavalue_set.yaml b/data/workflows/dhis2_datavalue_set.yaml index 7ca4e88..51eb8a3 100644 --- a/data/workflows/dhis2_datavalue_set.yaml +++ b/data/workflows/dhis2_datavalue_set.yaml @@ -5,6 +5,32 @@ publication: strategy: on_success intent: feature_collection exposure: ogc + inputs: + features: + from_step: get_features + output: features + records: + from_step: spatial_agg + output: records + 
output_file: + from_step: build_dhis2_payload + output: output_file +outputs: + bbox: + from_step: get_features + output: bbox + features: + from_step: get_features + output: features + records: + from_step: spatial_agg + output: records + data_value_set: + from_step: build_dhis2_payload + output: data_value_set + output_file: + from_step: build_dhis2_payload + output: output_file steps: - id: get_features component: feature_source diff --git a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml index ccff28d..88c932d 100644 --- a/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml +++ b/data/workflows/dhis2_datavalue_set_without_temporal_aggregation.yaml @@ -3,6 +3,22 @@ version: 1 publication: publishable: false exposure: registry_only +outputs: + bbox: + from_step: get_features + output: bbox + features: + from_step: get_features + output: features + records: + from_step: spatial_agg + output: records + data_value_set: + from_step: build_dhis2_payload + output: data_value_set + output_file: + from_step: build_dhis2_payload + output: output_file steps: - id: get_features component: feature_source diff --git a/pygeoapi-config.yml b/pygeoapi-config.yml index ca32423..893a817 100644 --- a/pygeoapi-config.yml +++ b/pygeoapi-config.yml @@ -2,7 +2,7 @@ server: bind: host: 0.0.0.0 port: 5000 - url: http://127.0.0.1:8000/ogcapi + url: http://127.0.0.1:8000/pygeoapi mimetype: application/json; charset=UTF-8 encoding: utf-8 languages: diff --git a/pyproject.toml b/pyproject.toml index d319dfd..7486d0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "earthkit-transforms==0.5.*", "metpy>=1.7,<2", "zarr==3.1.5", + "titiler-xarray>=1.2.0", ] [tool.ruff] diff --git a/src/eo_api/analytics_viewer/routes.py b/src/eo_api/analytics_viewer/routes.py index fcf8203..ec74515 100644 --- a/src/eo_api/analytics_viewer/routes.py +++ 
b/src/eo_api/analytics_viewer/routes.py @@ -50,12 +50,12 @@ def get_publication_analytics_config(resource_id: str) -> dict[str, Any]: "workflow_id": resource.workflow_id, "job_id": resource.job_id, "data_url": data_url, - "ogc_items_url": f"/ogcapi/collections/{resource.resource_id}/items", + "ogc_items_url": f"/pygeoapi/collections/{resource.resource_id}/items", "links": { "ogc_home": "/ogcapi", "publication": f"/publications/{resource.resource_id}", - "collection": f"/ogcapi/collections/{resource.resource_id}", - "items": f"/ogcapi/collections/{resource.resource_id}/items", + "collection": f"/pygeoapi/collections/{resource.resource_id}", + "items": f"/pygeoapi/collections/{resource.resource_id}/items", }, } diff --git a/src/eo_api/components/schemas.py b/src/eo_api/components/schemas.py index 1c2d684..5064b9c 100644 --- a/src/eo_api/components/schemas.py +++ b/src/eo_api/components/schemas.py @@ -1,8 +1,8 @@ -"""Schemas for component discovery and execution endpoints.""" +"""Schemas for component discovery, manifests, and execution endpoints.""" from __future__ import annotations -from typing import Any +from typing import Any, Literal from pydantic import BaseModel, Field @@ -29,6 +29,8 @@ class ComponentDefinition(BaseModel): description: str inputs: list[str] outputs: list[str] + workflow_inputs_required: list[str] = Field(default_factory=list) + workflow_inputs_optional: list[str] = Field(default_factory=list) input_schema: dict[str, Any] = Field(default_factory=dict) config_schema: dict[str, Any] | None = None output_schema: dict[str, Any] = Field(default_factory=dict) @@ -36,6 +38,52 @@ class ComponentDefinition(BaseModel): endpoint: ComponentEndpoint +class ComponentRuntimeManifest(BaseModel): + """Runtime metadata for one registered component.""" + + type: Literal["python"] = "python" + supported_execution_modes: list[str] = Field(default_factory=lambda: ["local"]) + local_handler: str | None = None + remote_handler: str | None = None + 
remote_request_bindings: dict[str, Any] = Field(default_factory=dict) + remote_response_bindings: dict[str, str] = Field(default_factory=dict) + + +class ComponentManifest(BaseModel): + """Internal manifest used to register a component.""" + + name: str + version: str = "v1" + description: str + inputs: list[str] + outputs: list[str] + workflow_inputs_required: list[str] = Field(default_factory=list) + workflow_inputs_optional: list[str] = Field(default_factory=list) + input_schema: dict[str, Any] = Field(default_factory=dict) + config_schema: dict[str, Any] | None = None + output_schema: dict[str, Any] = Field(default_factory=dict) + error_codes: list[str] = Field(default_factory=list) + endpoint: ComponentEndpoint + runtime: ComponentRuntimeManifest + + def to_definition(self) -> ComponentDefinition: + """Project internal manifest to public discovery metadata.""" + return ComponentDefinition( + name=self.name, + version=self.version, + description=self.description, + inputs=self.inputs, + outputs=self.outputs, + workflow_inputs_required=self.workflow_inputs_required, + workflow_inputs_optional=self.workflow_inputs_optional, + input_schema=self.input_schema, + config_schema=self.config_schema, + output_schema=self.output_schema, + error_codes=self.error_codes, + endpoint=self.endpoint, + ) + + class ComponentCatalogResponse(BaseModel): """List of discoverable components.""" diff --git a/src/eo_api/components/services.py b/src/eo_api/components/services.py index 910f632..d502eb2 100644 --- a/src/eo_api/components/services.py +++ b/src/eo_api/components/services.py @@ -2,11 +2,15 @@ from __future__ import annotations +import time from collections.abc import Mapping +from dataclasses import dataclass from typing import Any, Final +import httpx import xarray as xr from fastapi import HTTPException +from pydantic import BaseModel, ConfigDict, ValidationError from ..data_accessor.services.accessor import get_data from ..data_manager.services import downloader @@ -22,7 
+26,19 @@ from ..workflows.services.preflight import check_upstream_connectivity from ..workflows.services.spatial import aggregate_to_features from ..workflows.services.temporal import aggregate_temporal -from .schemas import ComponentDefinition, ComponentEndpoint +from .schemas import ComponentDefinition, ComponentEndpoint, ComponentManifest, ComponentRuntimeManifest + +type WorkflowStepExecutor = Any + + +@dataclass(frozen=True) +class ComponentRuntimeDefinition: + """Runtime binding for one workflow-executable component version.""" + + component: str + version: str + executor: WorkflowStepExecutor + config_model: type[BaseModel] _ERROR_CODES_V1: Final[list[str]] = [ "INPUT_VALIDATION_FAILED", @@ -32,13 +48,15 @@ "EXECUTION_FAILED", ] -_COMPONENT_REGISTRY: Final[dict[str, ComponentDefinition]] = { - "feature_source@v1": ComponentDefinition( +_COMPONENT_REGISTRY: Final[dict[str, ComponentManifest]] = { + "feature_source@v1": ComponentManifest( name="feature_source", version="v1", description="Resolve feature source and compute bbox.", inputs=["feature_source"], outputs=["features", "bbox"], + workflow_inputs_required=[], + workflow_inputs_optional=[], input_schema={ "type": "object", "properties": {"feature_source": {"type": "object"}}, @@ -65,13 +83,28 @@ }, error_codes=_ERROR_CODES_V1, endpoint=ComponentEndpoint(path="/components/feature-source", method="POST"), + runtime=ComponentRuntimeManifest( + supported_execution_modes=["local", "remote"], + local_handler="workflow.feature_source", + remote_handler="workflow.feature_source", + remote_request_bindings={ + "feature_source": "$request.feature_source", + "include_features": True, + }, + remote_response_bindings={ + "features": "features", + "bbox": "bbox", + }, + ), ), - "download_dataset@v1": ComponentDefinition( + "download_dataset@v1": ComponentManifest( name="download_dataset", version="v1", description="Download dataset files for period and bbox.", inputs=["dataset_id", "start", "end", "overwrite", 
"country_code", "bbox"], outputs=["status"], + workflow_inputs_required=["bbox"], + workflow_inputs_optional=[], input_schema={ "type": "object", "properties": { @@ -98,13 +131,29 @@ output_schema={"type": "object", "properties": {"status": {"type": "string"}}}, error_codes=_ERROR_CODES_V1, endpoint=ComponentEndpoint(path="/components/download-dataset", method="POST"), + runtime=ComponentRuntimeManifest( + supported_execution_modes=["local", "remote"], + local_handler="workflow.download_dataset", + remote_handler="workflow.download_dataset", + remote_request_bindings={ + "dataset_id": "$request.dataset_id", + "start": "$request.start", + "end": "$request.end", + "overwrite": "$request.overwrite", + "country_code": "$request.country_code", + "bbox": "$resolved.bbox", + }, + remote_response_bindings={"status": "status"}, + ), ), - "temporal_aggregation@v1": ComponentDefinition( + "temporal_aggregation@v1": ComponentManifest( name="temporal_aggregation", version="v1", description="Aggregate dataset over time dimension.", inputs=["dataset_id", "start", "end", "target_period_type", "method", "bbox"], outputs=["temporal_dataset"], + workflow_inputs_required=["bbox"], + workflow_inputs_optional=[], input_schema={ "type": "object", "properties": { @@ -131,13 +180,20 @@ output_schema={"type": "object", "properties": {"temporal_dataset": {"type": "object"}}}, error_codes=_ERROR_CODES_V1, endpoint=ComponentEndpoint(path="/components/temporal-aggregation", method="POST"), + runtime=ComponentRuntimeManifest( + supported_execution_modes=["local"], + local_handler="workflow.temporal_aggregation", + remote_handler=None, + ), ), - "spatial_aggregation@v1": ComponentDefinition( + "spatial_aggregation@v1": ComponentManifest( name="spatial_aggregation", version="v1", description="Aggregate gridded dataset to features.", inputs=["dataset_id", "start", "end", "feature_source", "method"], outputs=["records"], + workflow_inputs_required=["bbox", "features"], + 
workflow_inputs_optional=["temporal_dataset"], input_schema={ "type": "object", "properties": { @@ -163,13 +219,31 @@ output_schema={"type": "object", "properties": {"records": {"type": "array"}}}, error_codes=_ERROR_CODES_V1, endpoint=ComponentEndpoint(path="/components/spatial-aggregation", method="POST"), + runtime=ComponentRuntimeManifest( + supported_execution_modes=["local", "remote"], + local_handler="workflow.spatial_aggregation", + remote_handler="workflow.spatial_aggregation", + remote_request_bindings={ + "dataset_id": "$request.dataset_id", + "start": "$request.start", + "end": "$request.end", + "feature_source": "$request.feature_source", + "method": "$request.spatial_aggregation.method", + "bbox": "$resolved.bbox", + "feature_id_property": "$request.dhis2.org_unit_property", + "include_records": True, + }, + remote_response_bindings={"records": "records"}, + ), ), - "build_datavalueset@v1": ComponentDefinition( + "build_datavalueset@v1": ComponentManifest( name="build_datavalueset", version="v1", description="Build and serialize DHIS2 DataValueSet JSON.", inputs=["dataset_id", "period_type", "records", "dhis2"], outputs=["data_value_set", "output_file"], + workflow_inputs_required=["records"], + workflow_inputs_optional=[], input_schema={ "type": "object", "properties": { @@ -198,6 +272,21 @@ }, error_codes=_ERROR_CODES_V1, endpoint=ComponentEndpoint(path="/components/build-datavalue-set", method="POST"), + runtime=ComponentRuntimeManifest( + supported_execution_modes=["local", "remote"], + local_handler="workflow.build_datavalueset", + remote_handler="workflow.build_datavalueset", + remote_request_bindings={ + "dataset_id": "$request.dataset_id", + "period_type": "$request.temporal_aggregation.target_period_type", + "records": "$resolved.records", + "dhis2": "$request.dhis2", + }, + remote_response_bindings={ + "data_value_set": "data_value_set", + "output_file": "output_file", + }, + ), ), } @@ -207,17 +296,27 @@ def component_catalog(*, 
include_internal: bool = False) -> list[ComponentDefini By default, internal orchestration-only metadata (config_schema) is hidden. """ - components = list(_COMPONENT_REGISTRY.values()) + components = [manifest.to_definition() for manifest in _COMPONENT_REGISTRY.values()] if include_internal: return components return [component.model_copy(update={"config_schema": None}) for component in components] -def component_registry() -> dict[str, ComponentDefinition]: - """Return registry entries keyed by component@version.""" +def component_registry() -> dict[str, ComponentManifest]: + """Return manifest registry entries keyed by component@version.""" return dict(_COMPONENT_REGISTRY) +class _RemoteCapableStepConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + execution_mode: str = "local" + remote_url: str | None = None + remote_timeout_sec: float = 30.0 + remote_retries: int = 1 + remote_retry_delay_sec: float = 1.0 + + def feature_source_component(config: FeatureSourceConfig) -> tuple[dict[str, Any], list[float]]: """Run feature source component.""" return resolve_features(config) @@ -299,6 +398,204 @@ def build_datavalueset_component( return build_data_value_set(records=records, dataset_id=dataset_id, period_type=period_type, config=dhis2) +def run_feature_source_step( + *, + step: Any, + runtime: Any, + request: Any, + dataset: dict[str, Any], + resolved_inputs: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + """Workflow runtime adapter for feature_source.""" + del dataset, step + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode == "remote": + outputs = runtime.run( + "feature_source", + _invoke_registered_remote_component, + component_key="feature_source@v1", + remote_url=str(step_config["remote_url"]), + request=request, + resolved_inputs=resolved_inputs, + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + 
retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + features = outputs["features"] + bbox = outputs["bbox"] + else: + features, bbox = runtime.run( + "feature_source", + feature_source_component, + config=request.feature_source, + ) + return {"features": features, "bbox": bbox} + + +def run_download_dataset_step( + *, + step: Any, + runtime: Any, + request: Any, + dataset: dict[str, Any], + resolved_inputs: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + """Workflow runtime adapter for download_dataset.""" + del step + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode not in {"local", "remote"}: + raise ValueError("download_dataset.execution_mode must be 'local' or 'remote'") + bbox = resolved_inputs["bbox"] + if execution_mode == "remote": + remote_url = step_config.get("remote_url") + if not isinstance(remote_url, str) or not remote_url: + raise ValueError("download_dataset remote mode requires non-empty 'remote_url'") + outputs = runtime.run( + "download_dataset", + _invoke_registered_remote_component, + component_key="download_dataset@v1", + remote_url=remote_url, + request=request, + resolved_inputs=resolved_inputs, + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + return outputs + else: + runtime.run( + "download_dataset", + download_dataset_component, + dataset=dataset, + start=request.start, + end=request.end, + overwrite=request.overwrite, + country_code=request.country_code, + bbox=bbox, + ) + return {"status": "downloaded"} + + +def run_temporal_aggregation_step( + *, + step: Any, + runtime: Any, + request: Any, + dataset: dict[str, Any], + resolved_inputs: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + """Workflow runtime adapter for temporal_aggregation.""" + del step + target_period_type = 
request.temporal_aggregation.target_period_type + method = request.temporal_aggregation.method + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode == "remote": + raise ValueError("temporal_aggregation does not declare a remote HTTP contract") + else: + temporal_ds = runtime.run( + "temporal_aggregation", + temporal_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + bbox=resolved_inputs["bbox"], + target_period_type=target_period_type, + method=method, + ) + return {"temporal_dataset": temporal_ds} + + +def run_spatial_aggregation_step( + *, + step: Any, + runtime: Any, + request: Any, + dataset: dict[str, Any], + resolved_inputs: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + """Workflow runtime adapter for spatial_aggregation.""" + del step + method = request.spatial_aggregation.method + feature_id_property = request.dhis2.org_unit_property + execution_mode = str(step_config.get("execution_mode", "local")).lower() + temporal_dataset = resolved_inputs.get("temporal_dataset") + if execution_mode == "remote": + if temporal_dataset is not None: + raise ValueError( + "remote spatial_aggregation does not yet support workflow temporal_aggregation output; " + "use local spatial_aggregation for temporally aggregated workflows" + ) + outputs = runtime.run( + "spatial_aggregation", + _invoke_registered_remote_component, + component_key="spatial_aggregation@v1", + remote_url=str(step_config["remote_url"]), + request=request, + resolved_inputs=resolved_inputs, + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + records = outputs["records"] + else: + records = runtime.run( + "spatial_aggregation", + spatial_aggregation_component, + dataset=dataset, + start=request.start, + end=request.end, + bbox=resolved_inputs["bbox"], + 
features=resolved_inputs["features"], + method=method, + feature_id_property=feature_id_property, + aggregated_dataset=temporal_dataset, + ) + return {"records": records} + + +def run_build_datavalueset_step( + *, + step: Any, + runtime: Any, + request: Any, + dataset: dict[str, Any], + resolved_inputs: dict[str, Any], + step_config: dict[str, Any], +) -> dict[str, Any]: + """Workflow runtime adapter for build_datavalueset.""" + del dataset, step + period_type = request.temporal_aggregation.target_period_type + execution_mode = str(step_config.get("execution_mode", "local")).lower() + if execution_mode == "remote": + outputs = runtime.run( + "build_datavalueset", + _invoke_registered_remote_component, + component_key="build_datavalueset@v1", + remote_url=str(step_config["remote_url"]), + request=request, + resolved_inputs=resolved_inputs, + timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), + retries=int(step_config.get("remote_retries", 1)), + retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), + ) + data_value_set = outputs["data_value_set"] + output_file = outputs["output_file"] + else: + data_value_set, output_file = runtime.run( + "build_datavalueset", + build_datavalueset_component, + records=resolved_inputs["records"], + dataset_id=request.dataset_id, + period_type=period_type, + dhis2=request.dhis2, + ) + return {"data_value_set": data_value_set, "output_file": output_file} + + def require_dataset(dataset_id: str) -> dict[str, Any]: """Resolve dataset or raise 404.""" dataset = get_dataset(dataset_id) @@ -307,6 +604,83 @@ def require_dataset(dataset_id: str) -> dict[str, Any]: return dataset +def workflow_runtime_registry() -> dict[str, ComponentRuntimeDefinition]: + """Workflow runtime bindings keyed by component@version.""" + handler_registry = _workflow_runtime_handler_registry() + runtime_bindings: dict[str, ComponentRuntimeDefinition] = {} + for key, manifest in _COMPONENT_REGISTRY.items(): + local_handler = 
def validate_component_runtime_config(component: str, version: str, config: dict[str, Any]) -> None:
    """Validate the runtime config of one workflow-executable component.

    The config is parsed with the component's registered config model, then
    the execution mode is checked against both the globally known modes and
    the modes the component manifest declares as supported. Finally the
    remote-only options are checked for consistency with the chosen mode.

    Args:
        component: Component name as registered in the manifest registry.
        version: Component version; the registry key is ``component@version``.
        config: Raw step configuration to validate.

    Raises:
        ValueError: If the component is unknown, has no runtime binding, the
            config fails model validation, the execution mode is unsupported,
            remote options are given in local mode, or ``remote_url`` is
            missing in remote mode.
    """
    component_key = f"{component}@{version}"
    invalid_prefix = f"Invalid config for component '{component_key}'"

    manifest = _COMPONENT_REGISTRY.get(component_key)
    if manifest is None:
        raise ValueError(f"No component manifest registered for '{component_key}'")

    runtime_definition = workflow_runtime_registry().get(component_key)
    if runtime_definition is None:
        raise ValueError(f"No runtime config schema registered for component '{component_key}'")

    try:
        validated = runtime_definition.config_model.model_validate(config)
    except ValidationError as exc:
        raise ValueError(f"{invalid_prefix}: {exc}") from exc

    mode = str(getattr(validated, "execution_mode", "local")).lower()
    if mode not in {"local", "remote"}:
        raise ValueError(f"{invalid_prefix}: execution_mode must be local or remote")

    supported_modes = manifest.runtime.supported_execution_modes
    if mode not in set(supported_modes):
        allowed = ", ".join(supported_modes)
        raise ValueError(f"{invalid_prefix}: execution_mode '{mode}' not supported; allowed values: {allowed}")

    remote_url = getattr(validated, "remote_url", None)
    timeout_sec = getattr(validated, "remote_timeout_sec", 30.0)
    retry_count = getattr(validated, "remote_retries", 1)
    retry_delay = getattr(validated, "remote_retry_delay_sec", 1.0)

    has_remote_url = isinstance(remote_url, str) and bool(remote_url.strip())
    # "Remote config present" means a URL was given or any remote tuning knob
    # differs from its default value; a knob explicitly set to its default is
    # indistinguishable from an omitted one here.
    has_remote_config = (
        has_remote_url
        or float(timeout_sec) != 30.0
        or int(retry_count) != 1
        or float(retry_delay) != 1.0
    )

    if mode == "local":
        if has_remote_config:
            raise ValueError(
                f"{invalid_prefix}: "
                "remote_url/remote_timeout_sec/remote_retries/remote_retry_delay_sec are only allowed in remote mode"
            )
        return

    # Only "remote" can reach this point.
    if not has_remote_url:
        raise ValueError(f"{invalid_prefix}: remote_url is required for remote mode")
response = client.post(remote_url, json=payload) + response.raise_for_status() + return + except Exception as exc: + last_exc = exc + if attempt < attempts: + time.sleep(max(0.0, retry_delay_sec)) + if last_exc is None: + raise RuntimeError("Remote download invocation failed without exception context") + raise last_exc + + +def _invoke_registered_remote_component( + *, + component_key: str, + remote_url: str, + request: Any, + resolved_inputs: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> dict[str, Any]: + """Invoke a manifest-registered HTTP component as a black box.""" + manifest = _COMPONENT_REGISTRY.get(component_key) + if manifest is None: + raise RuntimeError(f"Unknown component manifest '{component_key}'") + payload = _resolve_runtime_bindings(manifest.runtime.remote_request_bindings, request, resolved_inputs) + response = _post_remote_json( + remote_url=remote_url, + payload=payload, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + return _extract_remote_outputs(manifest=manifest, response=response) + + +def _invoke_remote_feature_source_component( + *, + remote_url: str, + feature_source: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> tuple[dict[str, Any], list[float]]: + result = _post_remote_json( + remote_url=remote_url, + payload={"feature_source": feature_source, "include_features": True}, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + features = result.get("features") + bbox = result.get("bbox") + if not isinstance(features, dict) or not isinstance(bbox, list): + raise RuntimeError("Remote feature_source response missing features/bbox") + return features, [float(x) for x in bbox] + + +def _invoke_remote_temporal_aggregation_component( + *, + remote_url: str, + dataset_id: str, + start: str, + end: str, + bbox: list[float], + target_period_type: str, + method: str, + timeout_sec: float, + retries: 
int, + retry_delay_sec: float, +) -> dict[str, Any]: + return _post_remote_json( + remote_url=remote_url, + payload={ + "dataset_id": dataset_id, + "start": start, + "end": end, + "bbox": bbox, + "target_period_type": target_period_type, + "method": method, + }, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + + +def _invoke_remote_spatial_aggregation_component( + *, + remote_url: str, + dataset_id: str, + start: str, + end: str, + bbox: list[float], + feature_source: dict[str, Any], + method: str, + feature_id_property: str, + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> list[dict[str, Any]]: + result = _post_remote_json( + remote_url=remote_url, + payload={ + "dataset_id": dataset_id, + "start": start, + "end": end, + "feature_source": feature_source, + "method": method, + "bbox": bbox, + "feature_id_property": feature_id_property, + "include_records": True, + }, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + records = result.get("records") + if not isinstance(records, list): + raise RuntimeError("Remote spatial_aggregation response missing records") + return records + + +def _invoke_remote_build_datavalueset_component( + *, + remote_url: str, + dataset_id: str, + period_type: str, + records: list[dict[str, Any]], + dhis2: dict[str, Any], + timeout_sec: float, + retries: int, + retry_delay_sec: float, +) -> tuple[dict[str, Any], str]: + result = _post_remote_json( + remote_url=remote_url, + payload={ + "dataset_id": dataset_id, + "period_type": period_type, + "records": records, + "dhis2": dhis2, + }, + timeout_sec=timeout_sec, + retries=retries, + retry_delay_sec=retry_delay_sec, + ) + data_value_set = result.get("data_value_set") + output_file = result.get("output_file") + if not isinstance(data_value_set, dict) or not isinstance(output_file, str): + raise RuntimeError("Remote build_datavalueset response missing data_value_set/output_file") + return 
def _resolve_runtime_bindings(
    bindings: dict[str, Any],
    request: Any,
    resolved_inputs: dict[str, Any],
) -> dict[str, Any]:
    """Resolve manifest-declared HTTP payload bindings.

    Args:
        bindings: Payload template; values may be literals, nested dicts/lists,
            or ``$request.<path>`` / ``$resolved.<path>`` reference strings.
        request: Object (or mapping) that ``$request.`` paths are read from.
        resolved_inputs: Mapping that ``$resolved.`` paths are read from.

    Returns:
        A new dict with the same keys and every reference replaced by its value.
    """
    return {
        key: _resolve_runtime_value(value, request=request, resolved_inputs=resolved_inputs)
        for key, value in bindings.items()
    }


def _resolve_runtime_value(value: Any, *, request: Any, resolved_inputs: dict[str, Any]) -> Any:
    """Resolve one runtime binding value.

    Reference strings are looked up and converted to JSON-friendly values;
    dicts and lists are resolved recursively; anything else is returned as-is.
    """
    if isinstance(value, str):
        if value.startswith("$request."):
            return _dump_runtime_value(_lookup_object_path(request, value.removeprefix("$request.")))
        if value.startswith("$resolved."):
            return _dump_runtime_value(_lookup_mapping_path(resolved_inputs, value.removeprefix("$resolved.")))
        return value
    if isinstance(value, dict):
        return {
            key: _resolve_runtime_value(item, request=request, resolved_inputs=resolved_inputs)
            for key, item in value.items()
        }
    if isinstance(value, list):
        return [_resolve_runtime_value(item, request=request, resolved_inputs=resolved_inputs) for item in value]
    return value


def _lookup_object_path(obj: Any, path: str) -> Any:
    """Resolve dotted attribute path from object or mapping.

    Each path segment is read with item access when the current value is a
    mapping and with attribute access otherwise.

    Raises:
        KeyError: If a mapping segment is missing.
        AttributeError: If an attribute segment is missing.
    """
    current = obj
    for part in path.split("."):
        if isinstance(current, Mapping):
            current = current[part]
        else:
            current = getattr(current, part)
    return current


def _lookup_mapping_path(mapping: Mapping[str, Any], path: str) -> Any:
    """Resolve dotted path from mapping.

    Raises:
        KeyError: If a segment is missing, or if the path continues past a
            value that is not a mapping (the full path is used as the key).
    """
    current: Any = mapping
    for part in path.split("."):
        if not isinstance(current, Mapping):
            raise KeyError(path)
        current = current[part]
    return current


def _dump_runtime_value(value: Any) -> Any:
    """Convert pydantic models and enums to JSON-friendly values for HTTP payloads.

    Objects exposing ``model_dump`` are dumped in JSON mode and ``Enum``
    members are replaced by their ``.value``; everything else is returned
    unchanged.
    """
    # Local import keeps this helper self-contained; ``enum`` is stdlib.
    from enum import Enum

    if hasattr(value, "model_dump"):
        return value.model_dump(mode="json")
    # Only real enum members are unwrapped. Duck-typing on a ``value``
    # attribute would also collapse unrelated objects that merely happen to
    # have a field with that name.
    if isinstance(value, Enum):
        return value.value
    return value
def _bbox_from_query(
    dataset_id: str,
    xmin: float | None,
    ymin: float | None,
    xmax: float | None,
    ymax: float | None,
    *,
    error: str,
    error_code: str,
) -> list[float] | None:
    """Build an ``[xmin, ymin, xmax, ymax]`` bbox from optional query params.

    Returns ``None`` when no corner was supplied. Raises a 422
    ``HTTPException`` (using the given ``error`` / ``error_code``) when only
    some of the four corners were supplied.
    """
    corners = (xmin, ymin, xmax, ymax)
    if all(value is None for value in corners):
        return None
    if any(value is None for value in corners):
        raise HTTPException(
            status_code=422,
            detail=api_error(
                error=error,
                error_code=error_code,
                message="Provide all of xmin, ymin, xmax, ymax together",
                resource_id=dataset_id,
            ),
        )
    # The filter is a no-op at this point; it only narrows the type for checkers.
    return [float(value) for value in corners if value is not None]


@router.get("/{dataset_id}/preview")
def get_dataset_preview(
    dataset_id: str,
    start: str | None = None,
    end: str | None = None,
    xmin: float | None = None,
    ymin: float | None = None,
    xmax: float | None = None,
    ymax: float | None = None,
    max_cells: int = 25,
) -> dict[str, Any]:
    """Return summary stats and a small raster sample for preview workflows.

    Responds with 422 when the bbox is only partially specified or when the
    accessor rejects the query with a ``ValueError``.
    """
    # Resolve the dataset first so an unknown id is reported before bbox errors.
    dataset = _get_dataset_or_404(dataset_id)
    bbox = _bbox_from_query(
        dataset_id,
        xmin,
        ymin,
        xmax,
        ymax,
        error="preview_invalid",
        error_code="PREVIEW_INVALID",
    )

    try:
        return get_preview_summary(dataset, start=start, end=end, bbox=bbox, max_cells=max_cells)
    except ValueError as exc:
        raise HTTPException(
            status_code=422,
            detail=api_error(
                error="preview_invalid",
                error_code="PREVIEW_INVALID",
                message=str(exc),
                resource_id=dataset_id,
            ),
        ) from exc


@router.get("/{dataset_id}/coverage")
def get_dataset_coverage_summary(
    dataset_id: str,
    start: str | None = None,
    end: str | None = None,
    xmin: float | None = None,
    ymin: float | None = None,
    xmax: float | None = None,
    ymax: float | None = None,
    max_cells: int = 25,
) -> dict[str, Any]:
    """Return a lightweight coverage-style response for a raster subset.

    Responds with 422 when the bbox is only partially specified or when the
    accessor rejects the query with a ``ValueError``.
    """
    # Resolve the dataset first so an unknown id is reported before bbox errors.
    dataset = _get_dataset_or_404(dataset_id)
    bbox = _bbox_from_query(
        dataset_id,
        xmin,
        ymin,
        xmax,
        ymax,
        error="coverage_invalid",
        error_code="COVERAGE_INVALID",
    )

    try:
        return get_coverage_summary(dataset, start=start, end=end, bbox=bbox, max_cells=max_cells)
    except ValueError as exc:
        raise HTTPException(
            status_code=422,
            detail=api_error(
                error="coverage_invalid",
                error_code="COVERAGE_INVALID",
                message=str(exc),
                resource_id=dataset_id,
            ),
        ) from exc
None} + + time_dim = get_time_dim(ds) + lon_dim, lat_dim = get_lon_lat_dims(ds) + + start = numpy_datetime_to_period_string(ds[time_dim].min(), dataset["period_type"]) # type: ignore[arg-type] + end = numpy_datetime_to_period_string(ds[time_dim].max(), dataset["period_type"]) # type: ignore[arg-type] + + xmin, xmax = ds[lon_dim].min().item(), ds[lon_dim].max().item() + ymin, ymax = ds[lat_dim].min().item(), ds[lat_dim].max().item() + + return { + "coverage": { + "temporal": {"start": start, "end": end}, + "spatial": {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax}, + } + } + finally: + ds.close() + + +def get_point_values( + dataset: dict[str, Any], + *, + lon: float, + lat: float, + start: str | None = None, + end: str | None = None, +) -> dict[str, Any]: + """Return dataset values at one point across the requested time range.""" + ds = get_data(dataset, start=start, end=end, bbox=None) + try: + if not ds.data_vars: + raise ValueError(f"Dataset '{dataset['id']}' has no data variables available") + + lon_dim, lat_dim = get_lon_lat_dims(ds) + time_dim = get_time_dim(ds) + lon_values = ds[lon_dim] + lat_values = ds[lat_dim] + + xmin, xmax = float(lon_values.min().item()), float(lon_values.max().item()) + ymin, ymax = float(lat_values.min().item()), float(lat_values.max().item()) + if lon < xmin or lon > xmax or lat < ymin or lat > ymax: + raise ValueError( + f"Requested point ({lon}, {lat}) is outside dataset coverage " + f"([{xmin}, {ymin}] to [{xmax}, {ymax}])" + ) + + variable_name = str(dataset.get("variable") or next(iter(ds.data_vars))) + if variable_name not in ds.data_vars: + variable_name = next(iter(ds.data_vars)) + data_array = ds[variable_name] + point = data_array.sel({lon_dim: lon, lat_dim: lat}, method="nearest") + + actual_lon = float(point.coords[lon_dim].item()) + actual_lat = float(point.coords[lat_dim].item()) + series: list[dict[str, Any]] = [] + for raw_time, raw_value in zip(point[time_dim].values, point.values.tolist(), strict=False): 
+ value = _to_float(raw_value) + series.append( + { + "period": str(numpy_datetime_to_period_string(np.asarray(raw_time), dataset["period_type"])), + "value": value, + } + ) + + if not series: + raise ValueError(f"Dataset '{dataset['id']}' returned no values for the requested time range") + + return { + "dataset_id": dataset["id"], + "variable": variable_name, + "requested": {"lon": lon, "lat": lat, "start": start, "end": end}, + "resolved_point": {"lon": actual_lon, "lat": actual_lat}, + "value_count": len(series), + "values": series, + } + finally: + ds.close() - if not ds: - return {"temporal_coverage": None, "spatial_coverage": None} - time_dim = get_time_dim(ds) - lon_dim, lat_dim = get_lon_lat_dims(ds) +def get_preview_summary( + dataset: dict[str, Any], + *, + start: str | None = None, + end: str | None = None, + bbox: list[float] | None = None, + max_cells: int = 25, +) -> dict[str, Any]: + """Return summary statistics and a small sample for preview-oriented clients.""" + ds = get_data(dataset, start=start, end=end, bbox=bbox) + try: + if not ds.data_vars: + raise ValueError(f"Dataset '{dataset['id']}' has no data variables available") - start = numpy_datetime_to_period_string(ds[time_dim].min(), dataset["period_type"]) # type: ignore[arg-type] - end = numpy_datetime_to_period_string(ds[time_dim].max(), dataset["period_type"]) # type: ignore[arg-type] + variable_name = str(dataset.get("variable") or next(iter(ds.data_vars))) + if variable_name not in ds.data_vars: + variable_name = next(iter(ds.data_vars)) + data_array = ds[variable_name] + lon_dim, lat_dim = get_lon_lat_dims(data_array) + time_dim = get_time_dim(data_array) - xmin, xmax = ds[lon_dim].min().item(), ds[lon_dim].max().item() - ymin, ymax = ds[lat_dim].min().item(), ds[lat_dim].max().item() + valid = data_array.where(~xr.apply_ufunc(np.isnan, data_array)) + sample = _build_preview_sample( + valid, + dataset=dataset, + lon_dim=lon_dim, + lat_dim=lat_dim, + time_dim=time_dim, + 
max_cells=max_cells, + ) + return { + "dataset_id": dataset["id"], + "variable": variable_name, + "requested": {"start": start, "end": end, "bbox": bbox}, + "dims": {str(k): int(v) for k, v in valid.sizes.items()}, + "stats": { + "min": _to_float(valid.min(skipna=True).item()), + "max": _to_float(valid.max(skipna=True).item()), + "mean": _to_float(valid.mean(skipna=True).item()), + "value_count": int(valid.count().item()), + }, + "sample": sample, + } + finally: + ds.close() + + +def get_coverage_summary( + dataset: dict[str, Any], + *, + start: str | None = None, + end: str | None = None, + bbox: list[float] | None = None, + max_cells: int = 25, +) -> dict[str, Any]: + """Return a lightweight coverage-style summary for a raster subset.""" + preview = get_preview_summary( + dataset, + start=start, + end=end, + bbox=bbox, + max_cells=max_cells, + ) + full_coverage = get_data_coverage(dataset).get("coverage", {}) return { + "dataset_id": preview["dataset_id"], + "variable": preview["variable"], + "requested": preview["requested"], "coverage": { - "temporal": {"start": start, "end": end}, - "spatial": {"xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax}, - } + "spatial": full_coverage.get("spatial"), + "temporal": full_coverage.get("temporal"), + }, + "subset": { + "dims": preview["dims"], + "stats": preview["stats"], + "sample": preview["sample"], + }, } @@ -91,3 +235,58 @@ def xarray_to_temporary_netcdf(ds: xr.Dataset) -> str: def cleanup_file(path: str) -> None: """Remove a file from disk.""" os.remove(path) + + +def _to_float(value: Any) -> float | None: + if value is None: + return None + scalar = np.asarray(value).item() + if np.isnan(scalar): + return None + return float(scalar) + + +def _build_preview_sample( + data_array: xr.DataArray, + *, + dataset: dict[str, Any], + lon_dim: str, + lat_dim: str, + time_dim: str, + max_cells: int, +) -> list[dict[str, Any]]: + """Build a small JSON-safe sample from a raster subset.""" + max_cells = max(1, max_cells) + 
sample_records: list[dict[str, Any]] = [] + + time_values = data_array[time_dim].values + lat_values = data_array[lat_dim].values + lon_values = data_array[lon_dim].values + + time_step = max(1, int(np.ceil(len(time_values) / max_cells))) + lat_step = max(1, int(np.ceil(len(lat_values) / max_cells))) + lon_step = max(1, int(np.ceil(len(lon_values) / max_cells))) + + for time_index in range(0, len(time_values), time_step): + for lat_index in range(0, len(lat_values), lat_step): + for lon_index in range(0, len(lon_values), lon_step): + value = data_array.isel({time_dim: time_index, lat_dim: lat_index, lon_dim: lon_index}).item() + numeric_value = _to_float(value) + if numeric_value is None: + continue + sample_records.append( + { + "period": str( + numpy_datetime_to_period_string( + np.asarray(time_values[time_index]), + dataset["period_type"], + ) + ), + "lat": float(lat_values[lat_index]), + "lon": float(lon_values[lon_index]), + "value": numeric_value, + } + ) + if len(sample_records) >= max_cells: + return sample_records + return sample_records diff --git a/src/eo_api/main.py b/src/eo_api/main.py index dd40d89..c846b50 100644 --- a/src/eo_api/main.py +++ b/src/eo_api/main.py @@ -1,15 +1,18 @@ """DHIS2 EO API -- Earth observation data API for DHIS2.""" -from fastapi import FastAPI +from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, Response from fastapi.staticfiles import StaticFiles +from rio_tiler.errors import TileOutsideBounds import eo_api.startup # noqa: F401 # pyright: ignore[reportUnusedImport] -from eo_api import analytics_viewer, components, data_accessor, data_manager, data_registry, system, workflows +from eo_api import analytics_viewer, components, data_accessor, data_manager, data_registry, raster, system, workflows from eo_api.ogc import routes as ogc_routes from eo_api.ogc_api import ogc_api_app from eo_api.publications import generated_routes as 
publication_generated_routes from eo_api.publications import routes as publication_routes +from eo_api.shared.api_errors import api_error app = FastAPI() @@ -21,10 +24,26 @@ allow_headers=["*"], ) + +@app.exception_handler(TileOutsideBounds) +async def tile_outside_bounds_handler(request: Request, exc: TileOutsideBounds) -> Response: + """Return a normal 404 when a requested tile lies outside dataset coverage.""" + if "/tiles/" in request.url.path: + return Response(status_code=404) + return JSONResponse( + status_code=404, + content=api_error( + error="tile_outside_bounds", + error_code="TILE_OUTSIDE_BOUNDS", + message=str(exc), + ), + ) + app.include_router(system.routes.router, tags=["System"]) app.include_router(data_registry.routes.router, prefix="/registry", tags=["Data registry"]) app.include_router(data_manager.routes.router, prefix="/manage", tags=["Data manager"]) app.include_router(data_accessor.routes.router, prefix="/retrieve", tags=["Data retrieval"]) +app.include_router(raster.routes.router, prefix="/raster", tags=["Raster"]) app.include_router(workflows.routes.router, prefix="/workflows", tags=["Workflows"]) app.include_router(publication_routes.router, prefix="/publications", tags=["Publications"]) app.include_router(publication_generated_routes.router, prefix="/publications", tags=["Publications"]) @@ -32,4 +51,4 @@ app.include_router(components.routes.router, tags=["Components"]) app.include_router(ogc_routes.router, prefix="/ogcapi", tags=["OGC API"]) app.mount("/data", StaticFiles(directory="data/downloads"), name="Data") -app.mount("/ogcapi", ogc_api_app) +app.mount("/pygeoapi", ogc_api_app) diff --git a/src/eo_api/ogc/routes.py b/src/eo_api/ogc/routes.py index 67eb9ec..4341bc0 100644 --- a/src/eo_api/ogc/routes.py +++ b/src/eo_api/ogc/routes.py @@ -38,8 +38,8 @@ def get_ogc_root(request: Request, f: str | None = None) -> dict[str, Any] | HTM {"rel": "self", "type": "application/json", "href": _request_href(request, f="json")}, {"rel": 
"alternate", "type": "text/html", "href": _request_href(request, f="html")}, {"rel": "service-desc", "type": "application/vnd.oai.openapi+json;version=3.0", "href": "/ogcapi/openapi"}, - {"rel": "conformance", "type": "application/json", "href": f"{base_url}/ogcapi/conformance"}, - {"rel": "data", "type": "application/json", "href": f"{base_url}/ogcapi/collections"}, + {"rel": "conformance", "type": "application/json", "href": f"{base_url}/pygeoapi/conformance"}, + {"rel": "data", "type": "application/json", "href": f"{base_url}/pygeoapi/collections"}, {"rel": "processes", "type": "application/json", "href": f"{base_url}/ogcapi/processes"}, {"rel": "jobs", "type": "application/json", "href": f"{base_url}/ogcapi/jobs"}, ], @@ -47,7 +47,7 @@ def get_ogc_root(request: Request, f: str | None = None) -> dict[str, Any] | HTM { "title": "Browse Collections", "description": "Open the OGC publication surface for collections and items.", - "href": f"{base_url}/ogcapi/collections?f=html", + "href": f"{base_url}/pygeoapi/collections?f=html", }, { "title": "List Processes", @@ -306,7 +306,7 @@ def _run_async_workflow_job( def _collection_href(request: Request, collection_id: str) -> str: - return str(request.base_url).rstrip("/") + f"/ogcapi/collections/{collection_id}" + return str(request.base_url).rstrip("/") + f"/pygeoapi/collections/{collection_id}" def _request_href(request: Request, **updates: Any) -> str: diff --git a/src/eo_api/publications/capabilities.py b/src/eo_api/publications/capabilities.py new file mode 100644 index 0000000..61987dc --- /dev/null +++ b/src/eo_api/publications/capabilities.py @@ -0,0 +1,82 @@ +"""Publication serving capability policy.""" + +from __future__ import annotations + +from dataclasses import dataclass + +from .schemas import PublishedResourceExposure, PublishedResourceKind + + +@dataclass(frozen=True) +class PublicationServingCapability: + """Serving support for one publication contract.""" + + supported: bool + asset_format: str + 
def default_asset_format_for_kind(kind: PublishedResourceKind) -> str:
    """Return the asset format assumed for a publication kind when none is given.

    Kinds without a dedicated default fall back to ``"file"``.
    """
    format_by_kind = {
        PublishedResourceKind.FEATURE_COLLECTION: "geojson",
        PublishedResourceKind.COVERAGE: "zarr",
        PublishedResourceKind.TILESET: "tiles",
        PublishedResourceKind.COLLECTION: "json",
    }
    if kind in format_by_kind:
        return format_by_kind[kind]
    return "file"


def evaluate_publication_serving(
    *,
    kind: PublishedResourceKind,
    exposure: PublishedResourceExposure,
    asset_format: str | None,
) -> PublicationServingCapability:
    """Decide whether the server can expose a publication contract.

    Args:
        kind: Kind of the published resource.
        exposure: Requested exposure level.
        asset_format: Declared asset format; when empty or ``None`` the
            default for ``kind`` is used. Compared case-insensitively after
            stripping whitespace.

    Returns:
        A capability record. Registry-only exposure is always supported; for
        other exposures only GeoJSON feature collections and Zarr coverages
        are supported, and anything else yields ``supported=False`` with an
        explanatory ``error``.
    """
    chosen_format = asset_format if asset_format else default_asset_format_for_kind(kind)
    normalized_format = chosen_format.strip().lower()

    # Registry-only publications are never served, so any format is acceptable.
    if exposure == PublishedResourceExposure.REGISTRY_ONLY:
        return PublicationServingCapability(
            supported=True,
            asset_format=normalized_format,
            served_by=("registry",),
            ogc_collection=False,
        )

    if kind == PublishedResourceKind.FEATURE_COLLECTION and normalized_format == "geojson":
        return PublicationServingCapability(
            supported=True,
            asset_format="geojson",
            served_by=("pygeoapi", "analytics"),
            ogc_collection=True,
        )

    if kind == PublishedResourceKind.COVERAGE and normalized_format == "zarr":
        return PublicationServingCapability(
            supported=True,
            asset_format="zarr",
            served_by=("pygeoapi", "raster"),
            ogc_collection=True,
        )

    unsupported_message = (
        "Unsupported publication serving contract: "
        f"kind='{kind}', asset_format='{normalized_format}', exposure='{exposure}'"
    )
    return PublicationServingCapability(
        supported=False,
        asset_format=normalized_format,
        served_by=(),
        ogc_collection=False,
        error=unsupported_message,
    )
100644 --- a/src/eo_api/publications/pygeoapi.py +++ b/src/eo_api/publications/pygeoapi.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Any +from urllib.parse import urlsplit import yaml @@ -17,7 +18,7 @@ ) from .services import collection_id_for_resource, ensure_source_dataset_publications, list_published_resources -_DEFAULT_SERVER_URL = "http://127.0.0.1:8000/ogcapi" +_DEFAULT_SERVER_URL = "http://127.0.0.1:8000/pygeoapi" _TEMPLATES_DIR = Path(__file__).resolve().parent / "pygeoapi_templates" @@ -212,12 +213,14 @@ def _description_for_resource(resource: PublishedResource) -> str: def _build_provider(resource: PublishedResource) -> dict[str, Any]: if resource.kind == PublishedResourceKind.COVERAGE: - dataset = get_dataset(str(resource.dataset_id)) - if dataset is None: - raise ValueError(f"Unknown dataset_id '{resource.dataset_id}' for resource '{resource.resource_id}'") - zarr_path = get_zarr_path(dataset) + zarr_path = Path(resource.path) if resource.path is not None else None if zarr_path is None: - raise ValueError(f"No zarr cache available for dataset '{resource.dataset_id}'") + dataset = get_dataset(str(resource.dataset_id)) + if dataset is None: + raise ValueError(f"Unknown dataset_id '{resource.dataset_id}' for resource '{resource.resource_id}'") + zarr_path = get_zarr_path(dataset) + if zarr_path is None: + raise ValueError(f"No zarr cache available for dataset '{resource.dataset_id}'") return { "name": "xarray", "type": "coverage", @@ -258,7 +261,12 @@ def _pygeoapi_links(resource: PublishedResource) -> list[dict[str, str]]: if href == "": continue link_type = "text/html" if rel == "analytics" else "application/json" - title = "Analytics Viewer" if rel == "analytics" else rel.replace("-", " ").title() + if rel == "analytics": + title = "Analytics Viewer" + elif rel == "raster-capabilities": + title = "Raster Rendering Capabilities" + else: + title = rel.replace("-", " ").title() links.append( { "type": link_type, @@ -273,4 +281,6 @@ def 
_pygeoapi_links(resource: PublishedResource) -> list[dict[str, str]]: def _absolute_ogc_href(href: str) -> str: if href.startswith("http://") or href.startswith("https://"): return href - return f"{_DEFAULT_SERVER_URL.removesuffix('/ogcapi')}{href}" + parsed = urlsplit(_DEFAULT_SERVER_URL) + origin = f"{parsed.scheme}://{parsed.netloc}" + return f"{origin}{href}" diff --git a/src/eo_api/publications/pygeoapi_templates/collections/collection.html b/src/eo_api/publications/pygeoapi_templates/collections/collection.html index e52eb94..79a5c31 100644 --- a/src/eo_api/publications/pygeoapi_templates/collections/collection.html +++ b/src/eo_api/publications/pygeoapi_templates/collections/collection.html @@ -49,11 +49,96 @@ color: #5f6c80; margin-top: -4px; } + .example-links { + display: grid; + gap: 8px; + margin-top: 10px; + } + .example-links a { + display: inline-flex; + align-items: center; + gap: 6px; + font-weight: 600; + overflow-wrap: anywhere; + } + .subtle-code { + display: inline-block; + margin-top: 8px; + color: #b83280; + } + .coverage-controls { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); + gap: 12px; + margin: 18px 0 12px; + } + .coverage-controls label { + display: block; + margin-bottom: 4px; + font-size: 0.8rem; + font-weight: 700; + color: #5f6c80; + text-transform: uppercase; + letter-spacing: 0.04em; + } + .coverage-controls input, + .coverage-controls select, + .coverage-controls button { + width: 100%; + padding: 9px 10px; + border-radius: 10px; + border: 1px solid rgba(23, 32, 51, 0.14); + background: white; + font: inherit; + } + .coverage-controls button { + background: linear-gradient(135deg, #1b5f94 0%, #dd8d55 100%); + color: white; + border: none; + font-weight: 700; + } + .coverage-status { + margin: 8px 0 14px; + color: #5f6c80; + } + .coverage-legend { + display: flex; + align-items: center; + gap: 12px; + margin-bottom: 12px; + } + .coverage-legend-bar { + flex: 1; + height: 16px; + border-radius: 
999px; + background: linear-gradient(90deg, #fff7bc 0%, #fec44f 35%, #fe9929 65%, #d95f0e 100%); + border: 1px solid rgba(23, 32, 51, 0.14); + } + .coverage-readout { + margin-top: 10px; + padding: 12px; + border-radius: 14px; + background: rgba(27, 95, 148, 0.06); + border: 1px solid rgba(27, 95, 148, 0.12); + } {% endblock %} {% block body %} -
+
+ {% set resource_config = config.get('resources', {}).get(data['id'], {}) %} + {% set resource_metadata = resource_config.get('metadata', {}) %} + {% set providers = resource_config.get('providers', []) %} + {% set raster_links = data['links'] | selectattr('rel', 'equalto', 'raster-capabilities') | list %} + {% set variable_name = resource_metadata.get('variable', '...') %} + {% set prefers_aggregated_view = data['id'].startswith('chirps') or variable_name == 'precip' %} + {% set single_preview_href = '/raster/' ~ data['id'] ~ '/preview.png?variable=' ~ variable_name ~ '&datetime=2024-01-01' %} + {% set aggregate_preview_href = '/raster/' ~ data['id'] ~ '/preview.png?variable=' ~ variable_name ~ '&aggregation=sum&start=2024-01-01&end=2024-01-31' %} + {% set tilejson_href = '/raster/' ~ data['id'] ~ '/WebMercatorQuad/tilejson.json?variable=' ~ variable_name ~ '&datetime=2024-01-01' %}

{{ data['title'] }}

@@ -67,7 +152,58 @@

{{ data['title'] }}

+ {% if providers and providers[0]['type'] == 'coverage' %} +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
Loading raster layer…
+
+ 0 +
+ 50 +
+ {% endif %}
+ {% if providers and providers[0]['type'] == 'coverage' %} +
+ Click the map to inspect the rendered raster value for the active selection. +
+ {% endif %}
@@ -131,7 +267,6 @@

{% trans %}Schema{% endtrans %}

{% trans %}Open schema{% endtrans %}
- {% set providers = config.get('resources', {}).get(data['id'], {}).get('providers', []) %} {% for provider in providers %} {% if 'tile' in provider['type'] %}
@@ -146,6 +281,43 @@

{% trans %}Tiles{% endtrans %}

{% endif %} + {% if providers and providers[0]['type'] == 'coverage' %} +
+
+

{% trans %}Raster Rendering{% endtrans %}

+

{% trans %}This collection exposes raster rendering via the backend-owned raster publication surface.{% endtrans %}

+ {% for link in raster_links %} + + {{ link['title'] or 'Raster Capabilities' }} + + {% endfor %} +
+
+

{% trans %}Temporal Contract{% endtrans %}

+

{% trans %}Temporal raster rendering requires either a single date or an aggregation window.{% endtrans %}

+ ?datetime=YYYY-MM-DD
+ ?aggregation=sum&start=YYYY-MM-DD&end=YYYY-MM-DD + +
+
+

{% trans %}Variable{% endtrans %}

+

{% trans %}Use the dataset variable when requesting preview, tilejson, or tile rendering.{% endtrans %}

+ variable={{ variable_name }} + +
+
+

{% trans %}Default Styling{% endtrans %}

+

{% trans %}Rendering uses dataset-aware defaults so previews are readable without manual rescaling.{% endtrans %}

+
CHIRPS defaults: colormap ylorrd, single date 0,50, monthly sum 0,300
+
+
+ {% endif %} + {% if 'parameter_names' in data %}

Parameters

@@ -201,6 +373,45 @@

{% trans %}Storage CRS{% endtrans %}

{% block extrafoot %} {% endblock %} diff --git a/src/eo_api/publications/services.py b/src/eo_api/publications/services.py index 45cb1d8..cd36033 100644 --- a/src/eo_api/publications/services.py +++ b/src/eo_api/publications/services.py @@ -4,16 +4,28 @@ import datetime as dt import json +import logging from pathlib import Path from typing import TYPE_CHECKING -from ..data_manager.services.downloader import DOWNLOAD_DIR +from ..data_accessor.services.accessor import get_data_coverage +from ..data_manager.services.downloader import DOWNLOAD_DIR, get_zarr_path from ..data_registry.services.datasets import list_datasets +from .capabilities import ( + PublicationServingCapability, + default_asset_format_for_kind, + evaluate_publication_serving, +) from .schemas import PublishedResource, PublishedResourceClass, PublishedResourceExposure, PublishedResourceKind if TYPE_CHECKING: from ..workflows.schemas import WorkflowExecuteResponse +logger = logging.getLogger(__name__) + +_LEGACY_PYGEOAPI_PREFIX = "/ogcapi" +_PYGEOAPI_PREFIX = "/pygeoapi" + def ensure_source_dataset_publications() -> list[PublishedResource]: """Seed published source dataset resources from the dataset registry.""" @@ -22,6 +34,7 @@ def ensure_source_dataset_publications() -> list[PublishedResource]: resource_id = f"dataset-{dataset['id']}" existing = get_published_resource(resource_id) timestamp = _utc_now() + coverage_metadata = _coverage_metadata_for_dataset(dataset) record = PublishedResource( resource_id=resource_id, resource_class=PublishedResourceClass.SOURCE, @@ -34,7 +47,7 @@ def ensure_source_dataset_publications() -> list[PublishedResource]: ), dataset_id=str(dataset["id"]), path=None, - ogc_path=f"/ogcapi/collections/{dataset['id']}", + ogc_path=f"/pygeoapi/collections/{dataset['id']}", asset_format="zarr", exposure=PublishedResourceExposure.OGC, created_at=existing.created_at if existing is not None else timestamp, @@ -47,12 +60,17 @@ def ensure_source_dataset_publications() -> 
list[PublishedResource]: "source_url": dataset.get("source_url"), "resolution": dataset.get("resolution"), "units": dataset.get("units"), + **coverage_metadata, }, links=[ { "rel": "collection", - "href": f"/ogcapi/collections/{dataset['id']}", - } + "href": f"/pygeoapi/collections/{dataset['id']}", + }, + { + "rel": "raster-capabilities", + "href": f"/raster/{dataset['id']}/capabilities", + }, ], ) _write_resource(record) @@ -60,9 +78,59 @@ def ensure_source_dataset_publications() -> list[PublishedResource]: return resources +def _coverage_metadata_for_dataset(dataset: dict[str, object]) -> dict[str, object]: + """Best-effort spatial/temporal metadata for one source dataset.""" + zarr_path = get_zarr_path(dataset) + if zarr_path is None: + logger.info( + "Skipping coverage metadata for dataset '%s': no zarr archive available", + dataset.get("id"), + ) + return {} + + try: + coverage = get_data_coverage(dataset).get("coverage") + except (OSError, RuntimeError, ValueError) as exc: + logger.warning( + "Skipping coverage metadata for dataset '%s': %s", + dataset.get("id"), + exc, + ) + return {} + except Exception: + logger.exception("Could not derive coverage metadata for dataset '%s'", dataset.get("id")) + return {} + + if not isinstance(coverage, dict): + return {} + + spatial = coverage.get("spatial") + temporal = coverage.get("temporal") + metadata: dict[str, object] = {} + + if isinstance(spatial, dict): + xmin = spatial.get("xmin") + ymin = spatial.get("ymin") + xmax = spatial.get("xmax") + ymax = spatial.get("ymax") + if all(value is not None for value in (xmin, ymin, xmax, ymax)): + metadata["bbox"] = [float(xmin), float(ymin), float(xmax), float(ymax)] + + if isinstance(temporal, dict): + start = temporal.get("start") + end = temporal.get("end") + if start is not None: + metadata["time_start"] = str(start) + if end is not None: + metadata["time_end"] = str(end) + + return metadata + + def register_workflow_output_publication( *, response: 
WorkflowExecuteResponse, + kind: PublishedResourceKind, exposure: PublishedResourceExposure, published_path: str | None = None, asset_format: str | None = None, @@ -72,30 +140,46 @@ def register_workflow_output_publication( existing = get_published_resource(resource_id) timestamp = _utc_now() publication_path = published_path or response.output_file - analytics_metadata = _analytics_metadata_for_published_asset(publication_path) + resolved_asset_format = asset_format or default_asset_format_for_kind(kind) + capability = evaluate_publication_serving( + kind=kind, + exposure=exposure, + asset_format=resolved_asset_format, + ) + if not capability.supported: + raise ValueError(capability.error or "Unsupported publication serving contract") + ogc_path = _derived_resource_ogc_path(resource_id=resource_id, capability=capability) + analytics_metadata = ( + _analytics_metadata_for_published_asset(publication_path) + if kind == PublishedResourceKind.FEATURE_COLLECTION + else {"eligible": False, "period_count": 0, "has_period_field": False} + ) links = [ {"rel": "job", "href": f"/workflows/jobs/{response.run_id}"}, {"rel": "job-result", "href": f"/workflows/jobs/{response.run_id}/result"}, - {"rel": "collection", "href": f"/ogcapi/collections/{resource_id}"}, ] - if analytics_metadata["eligible"]: + if ogc_path is not None: + links.append({"rel": "collection", "href": ogc_path}) + if "raster" in capability.served_by: + links.append({"rel": "raster-capabilities", "href": f"/raster/{resource_id}/capabilities"}) + if analytics_metadata["eligible"] and "analytics" in capability.served_by: links.append({"rel": "analytics", "href": f"/analytics/publications/{resource_id}/viewer"}) record = PublishedResource( resource_id=resource_id, resource_class=PublishedResourceClass.DERIVED, - kind=PublishedResourceKind.FEATURE_COLLECTION, + kind=kind, title=f"{response.workflow_id} output for {response.dataset_id}", description=( f"Derived workflow output from {response.workflow_id} for 
{response.dataset_id} " - f"({response.feature_count} features, {response.value_count} values)." + f"({response.feature_count or 0} features, {response.value_count or 0} values)." ), dataset_id=response.dataset_id, workflow_id=response.workflow_id, job_id=response.run_id, run_id=response.run_id, path=publication_path, - ogc_path=f"/ogcapi/collections/{resource_id}", - asset_format=asset_format or "datavalueset-json", + ogc_path=ogc_path, + asset_format=resolved_asset_format, exposure=exposure, created_at=existing.created_at if existing is not None else timestamp, updated_at=timestamp, @@ -117,6 +201,12 @@ def register_workflow_output_publication( return record +def _derived_resource_ogc_path(*, resource_id: str, capability: PublicationServingCapability) -> str | None: + if capability.ogc_collection: + return f"/pygeoapi/collections/{resource_id}" + return None + + def list_published_resources( *, resource_class: PublishedResourceClass | None = None, @@ -127,7 +217,8 @@ def list_published_resources( """List persisted published resources.""" resources: list[PublishedResource] = [] for path in _resources_dir().glob("*.json"): - resources.append(PublishedResource.model_validate_json(path.read_text(encoding="utf-8"))) + resource = PublishedResource.model_validate_json(path.read_text(encoding="utf-8")) + resources.append(_normalize_pygeoapi_resource_links(resource)) resources.sort(key=lambda item: item.created_at, reverse=True) if resource_class is not None: resources = [item for item in resources if item.resource_class == resource_class] @@ -145,7 +236,8 @@ def get_published_resource(resource_id: str) -> PublishedResource | None: path = _resource_path(resource_id) if not path.exists(): return None - return PublishedResource.model_validate_json(path.read_text(encoding="utf-8")) + resource = PublishedResource.model_validate_json(path.read_text(encoding="utf-8")) + return _normalize_pygeoapi_resource_links(resource) def delete_published_resource(resource_id: str) -> 
PublishedResource | None: @@ -196,6 +288,31 @@ def _collection_id_for_resource(resource: PublishedResource) -> str: return resource.resource_id +def _normalize_pygeoapi_resource_links(resource: PublishedResource) -> PublishedResource: + updates: dict[str, object] = {} + + if resource.ogc_path and resource.ogc_path.startswith(_LEGACY_PYGEOAPI_PREFIX): + updates["ogc_path"] = resource.ogc_path.replace(_LEGACY_PYGEOAPI_PREFIX, _PYGEOAPI_PREFIX, 1) + + normalized_links: list[dict[str, object]] = [] + links_changed = False + for link in resource.links: + normalized_link = dict(link) + href = normalized_link.get("href") + if isinstance(href, str) and href.startswith(_LEGACY_PYGEOAPI_PREFIX): + normalized_link["href"] = href.replace(_LEGACY_PYGEOAPI_PREFIX, _PYGEOAPI_PREFIX, 1) + links_changed = True + normalized_links.append(normalized_link) + + if links_changed: + updates["links"] = normalized_links + + if not updates: + return resource + + return resource.model_copy(update=updates) + + def _analytics_metadata_for_published_asset(path_value: str | None) -> dict[str, bool | int]: if path_value is None: return {"eligible": False, "period_count": 0, "has_period_field": False} diff --git a/src/eo_api/raster/__init__.py b/src/eo_api/raster/__init__.py new file mode 100644 index 0000000..fe28f23 --- /dev/null +++ b/src/eo_api/raster/__init__.py @@ -0,0 +1,3 @@ +from . 
import routes as routes + +__all__ = ["routes"] diff --git a/src/eo_api/raster/routes.py b/src/eo_api/raster/routes.py new file mode 100644 index 0000000..e3e15b9 --- /dev/null +++ b/src/eo_api/raster/routes.py @@ -0,0 +1,516 @@ +"""Raster publication routes and Zarr-backed TiTiler integration.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import attr +from fastapi import APIRouter, Depends, HTTPException, Query, Request +from rio_tiler.colormap import cmap +from rio_tiler.io.xarray import XarrayReader +from titiler.core.dependencies import ImageRenderingParams +from titiler.core.routing import EndpointScope +from titiler.xarray.dependencies import XarrayParams +from titiler.xarray.extensions import VariablesExtension +from titiler.xarray.factory import TilerFactory +from titiler.xarray.io import Reader, get_variable + +from ..data_manager.services.downloader import get_zarr_path +from ..data_registry.services.datasets import get_dataset +from ..publications.schemas import PublishedResource, PublishedResourceKind +from ..publications.services import ( + collection_id_for_resource, + ensure_source_dataset_publications, + get_published_resource, + get_published_resource_by_collection_id, +) +from ..shared.api_errors import api_error + +router = APIRouter() + +SUPPORTED_AGGREGATIONS = {"sum", "mean", "max", "min"} + +RASTER_STYLE_PROFILES: dict[str, dict[str, Any]] = { + "chirps3_precipitation_daily": { + "colormap_name": "ylorrd", + "rescale_by_mode": { + "datetime": (0.0, 50.0), + "sum": (0.0, 300.0), + "mean": (0.0, 50.0), + "max": (0.0, 100.0), + "min": (0.0, 20.0), + }, + "label": "Precipitation intensity", + }, + "era5land_precipitation_hourly": { + "colormap_name": "ylorrd", + "rescale_by_mode": { + "datetime": (0.0, 15.0), + "sum": (0.0, 150.0), + "mean": (0.0, 15.0), + "max": (0.0, 40.0), + "min": (0.0, 10.0), + }, + "label": "Precipitation intensity", + }, + 
"era5land_temperature_hourly": { + "colormap_name": "coolwarm", + "rescale_by_mode": { + "datetime": (260.0, 320.0), + "sum": (260.0, 320.0), + "mean": (260.0, 320.0), + "max": (260.0, 330.0), + "min": (240.0, 310.0), + }, + "label": "Temperature", + }, + "worldpop_population_yearly": { + "colormap_name": "viridis", + "rescale_by_mode": { + "datetime": (0.0, 500.0), + "sum": (0.0, 1000.0), + "mean": (0.0, 500.0), + "max": (0.0, 1000.0), + "min": (0.0, 100.0), + }, + "label": "Population density", + }, +} + + +@router.get("/{resource_id}/capabilities") +def get_raster_capabilities(resource_id: str) -> dict[str, Any]: + """Describe whether a published resource is TiTiler-eligible.""" + resource = _resolve_published_resource(resource_id) + capabilities = _titiler_capabilities(resource) + return { + "resource_id": resource.resource_id, + "collection_id": collection_id_for_resource(resource), + "kind": str(resource.kind), + "asset_format": resource.asset_format, + "titiler": capabilities, + } + +def _resource_path_dependency(resource_id: str) -> str: + """Resolve one published resource to a TiTiler-readable Zarr dataset path.""" + resource = _resolve_published_resource(resource_id) + capabilities = _titiler_capabilities(resource) + if not capabilities["eligible"]: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_publication_unsupported", + error_code="RASTER_PUBLICATION_UNSUPPORTED", + message=str(capabilities["reason"]), + resource_id=resource.resource_id, + ), + ) + + path = capabilities.get("path") + if not isinstance(path, str) or path == "": + raise HTTPException( + status_code=500, + detail=api_error( + error="raster_publication_invalid", + error_code="RASTER_PUBLICATION_INVALID", + message=f"Resource '{resource.resource_id}' is missing a TiTiler dataset path", + resource_id=resource.resource_id, + ), + ) + return path + + +@dataclass +class RasterReaderParams(XarrayParams): + """Reader params with a user-facing temporal selector.""" + 
+ datetime: str | None = Query( + default=None, + description="Time slice to render for temporal datasets, for example `2024-01-01`.", + ) + aggregation: str | None = Query( + default=None, + description="Temporal aggregation to apply before rendering, for example `sum` or `mean`.", + ) + start: str | None = Query( + default=None, + description="Start date for temporal aggregation, for example `2024-01-01`.", + ) + end: str | None = Query( + default=None, + description="End date for temporal aggregation, for example `2024-01-31`.", + ) + + def __post_init__(self) -> None: + selector_values = list(self.sel or []) + if self.datetime is not None: + selector_values.append(f"time={self.datetime}") + self.sel = selector_values or None + + def as_dict(self, **kwargs: Any) -> dict[str, Any]: + values = super().as_dict(**kwargs) + values.pop("datetime", None) + return values + + +@attr.s +class AggregatingReader(Reader): + """Xarray reader that can collapse a temporal dimension before rendering.""" + + aggregation: str | None = attr.ib(default=None) + start: str | None = attr.ib(default=None) + end: str | None = attr.ib(default=None) + + def __attrs_post_init__(self) -> None: + opener_options = { + "group": self.group, + "decode_times": self.decode_times, + **self.opener_options, + } + + self.ds = self.opener(self.src_path, **opener_options) + self.input = get_variable( + self.ds, + self.variable, + sel=self.sel, + ) + + if self.aggregation is not None: + self.input = _aggregate_temporal_dataarray( + self.input, + aggregation=self.aggregation, + start=self.start, + end=self.end, + ) + + XarrayReader.__attrs_post_init__(self) + + +def _require_temporal_selector_for_rendering(request: Request, resource_id: str) -> None: + """Require a time selector before rendering temporal datasets as images/tiles.""" + resource = _resolve_published_resource(resource_id) + dataset = _resolve_resource_dataset(resource) + if dataset is None or not _dataset_requires_temporal_selector(dataset): 
+ return + + datetime_value = request.query_params.get("datetime") + aggregation = request.query_params.get("aggregation") + start = request.query_params.get("start") + end = request.query_params.get("end") + + time_selectors = [selector for selector in request.query_params.getlist("sel") if selector.startswith("time=")] + + if datetime_value and aggregation: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message="Use either 'datetime' or 'aggregation' with a date range, not both.", + resource_id=resource.resource_id, + ), + ) + + if aggregation is not None: + if aggregation not in SUPPORTED_AGGREGATIONS: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message=( + f"Unsupported aggregation '{aggregation}'. " + f"Supported values: {', '.join(sorted(SUPPORTED_AGGREGATIONS))}." + ), + resource_id=resource.resource_id, + ), + ) + if not start or not end: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message="Temporal aggregation requires both 'start' and 'end' query parameters.", + resource_id=resource.resource_id, + ), + ) + if time_selectors: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message="Do not combine 'aggregation' with a direct 'sel=time=...' 
selector.", + resource_id=resource.resource_id, + ), + ) + return + + if start or end: + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_temporal_query_invalid", + error_code="RASTER_TEMPORAL_QUERY_INVALID", + message="Use 'start' and 'end' only together with an 'aggregation' query parameter.", + resource_id=resource.resource_id, + ), + ) + + if datetime_value or time_selectors: + return + + raise HTTPException( + status_code=422, + detail=api_error( + error="raster_datetime_required", + error_code="RASTER_DATETIME_REQUIRED", + message=( + f"Temporal raster rendering for dataset '{dataset['id']}' requires a time selector. " + "Use '?datetime=YYYY-MM-DD' or '?aggregation=sum&start=YYYY-MM-DD&end=YYYY-MM-DD'." + ), + resource_id=resource.resource_id, + ), + ) + + +def _colormap_dependency( + resource_id: str, + colormap_name: str | None = Query(default=None, description="Named colormap override."), + colormap: str | None = Query(default=None, description="JSON encoded custom colormap override."), + aggregation: str | None = Query(default=None), +) -> Any: + if colormap_name: + return cmap.get(colormap_name) + + if colormap: + # Delegate explicit custom colormap handling back to TiTiler callers. 
+ from titiler.core.dependencies import create_colormap_dependency + + return create_colormap_dependency(cmap)(colormap_name=None, colormap=colormap) + + profile = _style_profile_for_resource(resource_id) + if profile is None: + return None + + default_map = cmap.get(str(profile["colormap_name"])).copy() + if str(profile["colormap_name"]) in {"ylorrd", "blues", "viridis"}: + default_map[0] = (0, 0, 0, 0) + return default_map + + +def _render_params_dependency( + resource_id: str, + aggregation: str | None = Query(default=None), + rescale: list[str] | None = Query( + default=None, + description="Optional explicit min,max rescaling override.", + ), + color_formula: str | None = Query(default=None), + return_mask: bool | None = Query(default=None, alias="return_mask"), +) -> ImageRenderingParams: + params = ImageRenderingParams(rescale=rescale, color_formula=color_formula, add_mask=return_mask) + if params.rescale is not None: + return params + + profile = _style_profile_for_resource(resource_id) + default_range = _default_rescale_for_profile(profile, aggregation=aggregation) + if default_range is not None: + params.rescale = [default_range] + return params + + +_factory = TilerFactory( + reader=AggregatingReader, + router_prefix="", + path_dependency=_resource_path_dependency, + route_dependencies=[ + ( + [ + EndpointScope(path="/preview", method="GET"), + EndpointScope(path="/preview.{format}", method="GET"), + EndpointScope(path="/preview/{width}x{height}.{format}", method="GET"), + EndpointScope(path="/{tileMatrixSetId}/tilejson.json", method="GET"), + EndpointScope(path="/tiles/{tileMatrixSetId}/{z}/{x}/{y}", method="GET"), + EndpointScope(path="/tiles/{tileMatrixSetId}/{z}/{x}/{y}.{format}", method="GET"), + EndpointScope(path="/tiles/{tileMatrixSetId}/{z}/{x}/{y}@{scale}x", method="GET"), + EndpointScope(path="/tiles/{tileMatrixSetId}/{z}/{x}/{y}@{scale}x.{format}", method="GET"), + ], + [Depends(_require_temporal_selector_for_rendering)], + ) + ], + 
extensions=[VariablesExtension()], + colormap_dependency=_colormap_dependency, + render_dependency=_render_params_dependency, + reader_dependency=RasterReaderParams, + add_viewer=False, + add_ogc_maps=False, + add_preview=True, + add_part=False, +) +router.include_router(_factory.router, prefix="/{resource_id}") + + +def _resolve_published_resource(resource_id: str) -> PublishedResource: + ensure_source_dataset_publications() + resource = get_published_resource(resource_id) + if resource is None: + resource = get_published_resource_by_collection_id(resource_id) + if resource is None: + raise HTTPException( + status_code=404, + detail=api_error( + error="published_resource_not_found", + error_code="PUBLISHED_RESOURCE_NOT_FOUND", + message=f"Unknown published resource or collection '{resource_id}'", + resource_id=resource_id, + ), + ) + return resource + + +def _titiler_capabilities(resource: PublishedResource) -> dict[str, Any]: + if resource.kind not in {PublishedResourceKind.COVERAGE, PublishedResourceKind.TILESET}: + return { + "eligible": False, + "reader": None, + "reason": f"Resource kind '{resource.kind}' is not raster/tile publishable", + } + + dataset = _resolve_resource_dataset(resource) + if dataset is None: + return { + "eligible": False, + "reader": None, + "reason": ( + "No dataset registry record is linked to this resource, so a Zarr-backed raster " + "publication cannot be resolved." + ), + } + + candidate_path = _resolve_zarr_path(resource, dataset) + if candidate_path is None: + return { + "eligible": False, + "reader": "xarray", + "reason": ( + f"No Zarr archive is available yet for dataset '{dataset['id']}'. " + f"Build it first via '/manage/{dataset['id']}/build_zarr'." 
+ ), + } + + base = f"/raster/{collection_id_for_resource(resource)}" + return { + "eligible": True, + "reader": "xarray", + "reason": None, + "path": str(candidate_path), + "dataset_id": dataset["id"], + "variable_hint": dataset.get("variable"), + "render_time_selector_required": _dataset_requires_temporal_selector(dataset), + "supported_render_aggregations": sorted(SUPPORTED_AGGREGATIONS), + "style_defaults": _style_profile_for_dataset(dataset), + "endpoints": { + "variables": f"{base}/variables", + "info": f"{base}/info", + "tilejson": f"{base}/WebMercatorQuad/tilejson.json", + "tiles": f"{base}/tiles/WebMercatorQuad/{{z}}/{{x}}/{{y}}.png", + "preview": f"{base}/preview.png", + "point": f"{base}/point/{{lon}},{{lat}}", + "statistics": f"{base}/statistics", + }, + } + + +def _resolve_resource_dataset(resource: PublishedResource) -> dict[str, Any] | None: + if resource.dataset_id is None: + return None + return get_dataset(resource.dataset_id) + + +def _dataset_requires_temporal_selector(dataset: dict[str, Any]) -> bool: + return bool(dataset.get("period_type")) + + +def _style_profile_for_resource(resource_id: str) -> dict[str, Any] | None: + resource = _resolve_published_resource(resource_id) + dataset = _resolve_resource_dataset(resource) + if dataset is None: + return None + return _style_profile_for_dataset(dataset) + + +def _style_profile_for_dataset(dataset: dict[str, Any]) -> dict[str, Any] | None: + profile = RASTER_STYLE_PROFILES.get(str(dataset["id"])) + if profile is not None: + return profile + + units = str(dataset.get("units") or "").lower() + variable = str(dataset.get("variable") or "").lower() + if "mm" in units or "precip" in variable: + return { + "colormap_name": "ylorrd", + "rescale_by_mode": { + "datetime": (0.0, 50.0), + "sum": (0.0, 300.0), + "mean": (0.0, 50.0), + "max": (0.0, 100.0), + "min": (0.0, 20.0), + }, + "label": "Precipitation intensity", + } + + return None + + +def _default_rescale_for_profile( + profile: dict[str, Any] | 
None, + *, + aggregation: str | None, +) -> tuple[float, float] | None: + if profile is None: + return None + rescale_by_mode = profile.get("rescale_by_mode", {}) + mode = aggregation or "datetime" + range_value = rescale_by_mode.get(mode) or rescale_by_mode.get("datetime") + if range_value is None: + return None + return tuple(range_value) + + +def _aggregate_temporal_dataarray( + data_array: Any, + *, + aggregation: str, + start: str | None, + end: str | None, +) -> Any: + if "time" not in data_array.dims: + raise ValueError("Temporal aggregation requires a 'time' dimension") + + time_window = data_array.sel(time=slice(start, end)) + if time_window.sizes.get("time", 0) == 0: + raise ValueError("Temporal aggregation produced no time slices for the requested date range") + + aggregate_fn = getattr(time_window, aggregation, None) + if aggregate_fn is None: + raise ValueError(f"Unsupported temporal aggregation '{aggregation}'") + return aggregate_fn(dim="time", skipna=True) + + +def _resolve_zarr_path(resource: PublishedResource, dataset: dict[str, Any]) -> Path | None: + if resource.path: + resource_path = Path(resource.path) + if resource_path.exists(): + return resource_path + + native_output = resource.metadata.get("native_output_file") + if isinstance(native_output, str): + native_path = Path(native_output) + if native_path.exists(): + return native_path + + return get_zarr_path(dataset) diff --git a/src/eo_api/startup.py b/src/eo_api/startup.py index 8b48d7c..e6be460 100644 --- a/src/eo_api/startup.py +++ b/src/eo_api/startup.py @@ -6,6 +6,8 @@ import logging import os +import sys +from pathlib import Path from dotenv import load_dotenv # noqa: E402 @@ -23,15 +25,44 @@ eo_logger.propagate = False +def _configure_proj_data() -> None: + """Point PROJ at the active environment's data files.""" + candidates: list[Path] = [] + for sys_path in sys.path: + if not sys_path: + continue + candidates.append(Path(sys_path) / "rasterio" / "proj_data") + + try: + from pyproj 
import datadir + + pyproj_data_dir = datadir.get_data_dir() + if pyproj_data_dir: + candidates.append(Path(pyproj_data_dir)) + except Exception: + pass + + for candidate in candidates: + if candidate.exists(): + proj_path = str(candidate) + os.environ["PROJ_LIB"] = proj_path + os.environ["PROJ_DATA"] = proj_path + eo_logger.info("Configured PROJ data directory: %s", proj_path) + return + + eo_logger.warning("Could not locate a compatible PROJ data directory in the active environment") + + def _configure_generated_pygeoapi() -> None: """Materialize publication-driven pygeoapi documents before pygeoapi import.""" from eo_api.publications.pygeoapi import write_generated_pygeoapi_documents - server_url = os.environ.get("PYGEOAPI_SERVER_URL", "http://127.0.0.1:8000/ogcapi") + server_url = os.environ.get("PYGEOAPI_SERVER_URL", "http://127.0.0.1:8000/pygeoapi") config_path, openapi_path = write_generated_pygeoapi_documents(server_url=server_url) os.environ["PYGEOAPI_CONFIG"] = str(config_path) os.environ["PYGEOAPI_OPENAPI"] = str(openapi_path) eo_logger.info("Configured generated pygeoapi documents: %s %s", config_path, openapi_path) +_configure_proj_data() _configure_generated_pygeoapi() diff --git a/src/eo_api/workflows/routes.py b/src/eo_api/workflows/routes.py index f03c260..450a017 100644 --- a/src/eo_api/workflows/routes.py +++ b/src/eo_api/workflows/routes.py @@ -4,6 +4,7 @@ from fastapi import APIRouter, HTTPException, Request +from ..publications.capabilities import evaluate_publication_serving from ..publications.schemas import PublishedResourceExposure from ..publications.services import collection_id_for_resource, get_published_resource from ..shared.api_errors import api_error @@ -34,6 +35,33 @@ router = APIRouter() +def _workflow_publication_summary(workflow: Any) -> dict[str, Any]: + publication = workflow.publication + capability = evaluate_publication_serving( + kind=publication.intent, + exposure=publication.exposure, + 
asset_format=publication.asset_format, + ) + asset_binding = None + if publication.asset is not None: + asset_binding = {"from_step": publication.asset.from_step, "output": publication.asset.output} + publication_inputs = { + name: {"from_step": ref.from_step, "output": ref.output} for name, ref in publication.inputs.items() + } + return { + "publication_publishable": publication.publishable, + "publication_intent": str(publication.intent) if publication.publishable else None, + "publication_exposure": str(publication.exposure) if publication.publishable else None, + "publication_asset_format": publication.asset_format, + "publication_asset_binding": asset_binding, + "publication_inputs": publication_inputs, + "serving_supported": capability.supported, + "serving_asset_format": capability.asset_format, + "serving_targets": list(capability.served_by), + "serving_error": capability.error, + } + + @router.get("", response_model=WorkflowCatalogResponse) def list_workflows() -> WorkflowCatalogResponse: """List all allowlisted workflow definitions.""" @@ -53,13 +81,9 @@ def list_workflows() -> WorkflowCatalogResponse: WorkflowCatalogItem( workflow_id=definition.workflow_id, version=definition.version, - publication_publishable=definition.publication.publishable, - publication_intent=(str(definition.publication.intent) if definition.publication.publishable else None), - publication_exposure=( - str(definition.publication.exposure) if definition.publication.publishable else None - ), step_count=len(definition.steps), components=[step.component for step in definition.steps], + **_workflow_publication_summary(definition), ) for definition in definitions ] @@ -100,7 +124,7 @@ def get_workflow_job(job_id: str, request: Request) -> WorkflowJobRecord: links.append( { "rel": "collection", - "href": f"{str(request.base_url).rstrip('/')}/ogcapi/collections/{collection_id}", + "href": f"{str(request.base_url).rstrip('/')}/pygeoapi/collections/{collection_id}", } ) analytics_link = 
next((link for link in publication.links if link.get("rel") == "analytics"), None) @@ -341,6 +365,7 @@ def validate_workflow_assembly(payload: WorkflowValidateRequest) -> WorkflowVali workflow_version=0, step_count=0, components=[], + publication_publishable=False, warnings=warnings, errors=[str(exc)], ) @@ -366,6 +391,7 @@ def validate_workflow_assembly(payload: WorkflowValidateRequest) -> WorkflowVali workflow_version=workflow.version, step_count=len(workflow.steps), components=[step.component for step in workflow.steps], + **_workflow_publication_summary(workflow), resolved_steps=resolved_steps, warnings=warnings, errors=errors, diff --git a/src/eo_api/workflows/schemas.py b/src/eo_api/workflows/schemas.py index cfd63fe..b587a66 100644 --- a/src/eo_api/workflows/schemas.py +++ b/src/eo_api/workflows/schemas.py @@ -115,12 +115,14 @@ class WorkflowExecuteResponse(BaseModel): workflow_id: str workflow_version: int dataset_id: str - bbox: list[float] - feature_count: int - value_count: int - output_file: str + outputs: dict[str, Any] = Field(default_factory=dict) + primary_output_name: str | None = None + bbox: list[float] | None = None + feature_count: int | None = None + value_count: int | None = None + output_file: str | None = None run_log_file: str - data_value_set: dict[str, Any] + data_value_set: dict[str, Any] | None = None component_runs: list[ComponentRun] component_run_details_included: bool = False component_run_details_available: bool = True @@ -223,6 +225,13 @@ class WorkflowCatalogItem(BaseModel): publication_publishable: bool publication_intent: str | None = None publication_exposure: str | None = None + publication_asset_format: str | None = None + publication_asset_binding: dict[str, str] | None = None + publication_inputs: dict[str, dict[str, str]] = Field(default_factory=dict) + serving_supported: bool = False + serving_asset_format: str | None = None + serving_targets: list[str] = Field(default_factory=list) + serving_error: str | None = None 
step_count: int components: list[str] @@ -369,6 +378,16 @@ class WorkflowValidateResponse(BaseModel): workflow_version: int step_count: int components: list[str] + publication_publishable: bool = False + publication_intent: str | None = None + publication_exposure: str | None = None + publication_asset_format: str | None = None + publication_asset_binding: dict[str, str] | None = None + publication_inputs: dict[str, dict[str, str]] = Field(default_factory=dict) + serving_supported: bool = False + serving_asset_format: str | None = None + serving_targets: list[str] = Field(default_factory=list) + serving_error: str | None = None resolved_steps: list[WorkflowValidateStep] = Field(default_factory=list) warnings: list[str] = Field(default_factory=list) errors: list[str] = Field(default_factory=list) diff --git a/src/eo_api/workflows/services/definitions.py b/src/eo_api/workflows/services/definitions.py index e51de0b..98e6da0 100644 --- a/src/eo_api/workflows/services/definitions.py +++ b/src/eo_api/workflows/services/definitions.py @@ -3,58 +3,22 @@ from __future__ import annotations from pathlib import Path -from typing import Any, Final, Literal +from typing import Any, Literal import yaml from pydantic import AliasChoices, BaseModel, Field, model_validator +from ...publications.capabilities import evaluate_publication_serving from ...publications.schemas import PublishedResourceExposure, PublishedResourceKind -ComponentName = Literal[ - "feature_source", - "download_dataset", - "temporal_aggregation", - "spatial_aggregation", - "build_datavalueset", -] - -SUPPORTED_COMPONENTS: Final[set[str]] = set(ComponentName.__args__) # type: ignore[attr-defined] -SUPPORTED_COMPONENT_VERSIONS: Final[dict[str, set[str]]] = {component: {"v1"} for component in SUPPORTED_COMPONENTS} - -COMPONENT_INPUTS: Final[dict[str, set[str]]] = { - "feature_source": set(), - "download_dataset": {"bbox"}, - "temporal_aggregation": {"bbox"}, - "spatial_aggregation": {"bbox", "features"}, - 
"build_datavalueset": {"records"}, -} - -COMPONENT_OPTIONAL_INPUTS: Final[dict[str, set[str]]] = { - "feature_source": set(), - "download_dataset": set(), - "temporal_aggregation": set(), - "spatial_aggregation": {"temporal_dataset"}, - "build_datavalueset": set(), -} - -COMPONENT_OUTPUTS: Final[dict[str, set[str]]] = { - "feature_source": {"features", "bbox"}, - "download_dataset": {"status"}, - "temporal_aggregation": {"temporal_dataset"}, - "spatial_aggregation": {"records"}, - "build_datavalueset": {"data_value_set", "output_file"}, -} - SCRIPT_DIR = Path(__file__).parent.resolve() WORKFLOWS_DIR = SCRIPT_DIR.parent.parent.parent.parent / "data" / "workflows" DEFAULT_WORKFLOW_ID = "dhis2_datavalue_set_v1" - - class WorkflowStep(BaseModel): """One component step in a declarative workflow definition.""" id: str | None = None - component: ComponentName + component: str version: str = "v1" config: dict[str, Any] = Field(default_factory=dict) inputs: dict[str, "WorkflowStepInput"] = Field(default_factory=dict) @@ -62,7 +26,7 @@ class WorkflowStep(BaseModel): @model_validator(mode="after") def validate_component_version(self) -> "WorkflowStep": """Ensure component@version exists in the registered component catalog.""" - supported_versions = SUPPORTED_COMPONENT_VERSIONS.get(self.component, set()) + supported_versions = _supported_component_versions(self.component) if self.version not in supported_versions: known = ", ".join(sorted(supported_versions)) or "" raise ValueError( @@ -78,6 +42,14 @@ class WorkflowStepInput(BaseModel): output: str = Field(validation_alias=AliasChoices("output", "output_key")) +class WorkflowOutputBinding(BaseModel): + """Expose one named workflow output from a step output.""" + + from_step: str + output: str = Field(validation_alias=AliasChoices("output", "output_key")) + include_in_response: bool = True + + class WorkflowPublicationPolicy(BaseModel): """Publication policy for workflow outputs.""" @@ -92,16 +64,19 @@ class 
WorkflowPublicationPolicy(BaseModel): ) exposure: PublishedResourceExposure = PublishedResourceExposure.REGISTRY_ONLY required_output_file_suffixes: list[str] = Field(default_factory=list) + asset: WorkflowStepInput | None = None + asset_format: str | None = None + inputs: dict[str, WorkflowStepInput] = Field(default_factory=dict) @model_validator(mode="after") def validate_publication_policy(self) -> "WorkflowPublicationPolicy": - """Restrict workflow-driven publication to currently supported resource types.""" - if self.publishable and self.intent != PublishedResourceKind.FEATURE_COLLECTION: - raise ValueError("Workflow publication currently supports only intent='feature_collection'") + """Normalize workflow publication policy.""" normalized_suffixes = [] for suffix in self.required_output_file_suffixes: normalized_suffixes.append(suffix if suffix.startswith(".") else f".{suffix}") self.required_output_file_suffixes = normalized_suffixes + if self.asset_format is not None: + self.asset_format = self.asset_format.strip().lower() or None return self @@ -112,15 +87,14 @@ class WorkflowDefinition(BaseModel): version: int = 1 publication: WorkflowPublicationPolicy = Field(default_factory=WorkflowPublicationPolicy) steps: list[WorkflowStep] + outputs: dict[str, WorkflowOutputBinding] = Field(default_factory=dict) @model_validator(mode="after") def validate_steps(self) -> "WorkflowDefinition": - """Require terminal DataValueSet step and validate component compatibility.""" + """Validate component compatibility and exported workflow outputs.""" if not self.steps: raise ValueError("Workflow steps cannot be empty") _assign_step_ids(self.steps) - if self.steps[-1].component != "build_datavalueset": - raise ValueError("The last workflow step must be 'build_datavalueset'") available_outputs: dict[str, set[str]] = {} latest_producer_for_output: dict[str, str] = {} for step in self.steps: @@ -134,10 +108,37 @@ def validate_steps(self) -> "WorkflowDefinition": ) step.inputs = 
resolved_inputs - outputs = COMPONENT_OUTPUTS[step.component] + outputs = _component_outputs(step.component, step.version) available_outputs[step.id] = outputs for output_name in outputs: latest_producer_for_output[output_name] = step.id + + if not self.outputs: + raise ValueError("Workflow must declare at least one exported output") + + _validate_workflow_outputs(bindings=self.outputs, available_outputs=available_outputs, owner="Workflow outputs") + if self.publication.publishable: + if self.publication.asset is None and not self.publication.inputs: + raise ValueError("Publishable workflows must declare a publication asset or publication inputs") + if self.publication.asset is not None: + _validate_workflow_outputs( + bindings={"asset": self.publication.asset}, + available_outputs=available_outputs, + owner="Workflow publication asset", + ) + if self.publication.inputs: + _validate_workflow_outputs( + bindings=self.publication.inputs, + available_outputs=available_outputs, + owner="Workflow publication", + ) + capability = evaluate_publication_serving( + kind=self.publication.intent, + exposure=self.publication.exposure, + asset_format=self.publication.asset_format, + ) + if not capability.supported: + raise ValueError(capability.error or "Unsupported publication serving contract") return self @@ -223,8 +224,8 @@ def _normalize_step_inputs( latest_producer_for_output: dict[str, str], ) -> dict[str, WorkflowStepInput]: declared_inputs = dict(step.inputs) - required_inputs = COMPONENT_INPUTS[step.component] - optional_inputs = COMPONENT_OPTIONAL_INPUTS.get(step.component, set()) + required_inputs = _component_required_inputs(step.component, step.version) + optional_inputs = _component_optional_inputs(step.component, step.version) if not declared_inputs: for input_name in sorted(required_inputs | optional_inputs): @@ -258,3 +259,61 @@ def _normalize_step_inputs( ) return declared_inputs + + +def _validate_workflow_outputs( + *, + bindings: dict[str, WorkflowStepInput 
| WorkflowOutputBinding], + available_outputs: dict[str, set[str]], + owner: str, +) -> None: + if not bindings: + raise ValueError(f"{owner} cannot be empty") + for output_name, ref in bindings.items(): + available_for_step = available_outputs.get(ref.from_step) + if available_for_step is None: + raise ValueError(f"{owner} reference '{output_name}' points to unknown step '{ref.from_step}'") + if ref.output not in available_for_step: + raise ValueError( + f"{owner} reference '{output_name}' points to missing output " + f"'{ref.output}' from step '{ref.from_step}'" + ) + + +def _component_definition(component: str, version: str) -> tuple[set[str], set[str], set[str]]: + from ...components import services as component_services + + definition = component_services.component_registry().get(f"{component}@{version}") + if definition is None: + raise ValueError(f"Unsupported component version '{component}@{version}'. Supported versions: ") + return ( + set(definition.workflow_inputs_required), + set(definition.workflow_inputs_optional), + set(definition.outputs), + ) + + +def _supported_component_versions(component: str) -> set[str]: + from ...components import services as component_services + + versions: set[str] = set() + for key in component_services.component_registry(): + name, _, version = key.partition("@") + if name == component and version: + versions.add(version) + return versions + + +def _component_required_inputs(component: str, version: str) -> set[str]: + required, _, _ = _component_definition(component, version) + return required + + +def _component_optional_inputs(component: str, version: str) -> set[str]: + _, optional, _ = _component_definition(component, version) + return optional + + +def _component_outputs(component: str, version: str) -> set[str]: + _, _, outputs = _component_definition(component, version) + return outputs diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py index d57bc48..b6cf003 100644 --- 
a/src/eo_api/workflows/services/engine.py +++ b/src/eo_api/workflows/services/engine.py @@ -3,24 +3,26 @@ from __future__ import annotations import os -import time -from collections.abc import Callable from dataclasses import dataclass, field from pathlib import Path from typing import Any, Literal -import httpx from fastapi import HTTPException -from pydantic import BaseModel, ConfigDict, ValidationError from ...components import services as component_services from ...data_registry.services.datasets import get_dataset from ...publications.services import register_workflow_output_publication from ...shared.api_errors import api_error from ..schemas import WorkflowExecuteRequest, WorkflowExecuteResponse, WorkflowJobStatus -from .definitions import WorkflowDefinition, WorkflowPublicationPolicy, WorkflowStep, load_workflow_definition +from .definitions import ( + WorkflowDefinition, + WorkflowOutputBinding, + WorkflowPublicationPolicy, + WorkflowStep, + load_workflow_definition, +) from .job_store import initialize_job, mark_job_failed, mark_job_running, mark_job_success -from .publication_assets import build_feature_collection_asset +from .publication_assets import build_feature_collection_asset, write_feature_collection_asset, write_json_asset from .run_logs import persist_run_log from .runtime import WorkflowRuntime @@ -67,6 +69,7 @@ def require_output(self, output_name: str) -> Any: return self.latest_outputs[output_name] + def execute_workflow( request: WorkflowExecuteRequest, *, @@ -136,16 +139,14 @@ def execute_workflow( dataset=dataset, context=context, ) - features = context.require_output("features") - bbox = context.require_output("bbox") - data_value_set = context.require_output("data_value_set") - output_file = context.require_output("output_file") + exported_outputs = _resolve_workflow_outputs(workflow.outputs, context) + output_summary = _summarize_workflow_outputs(exported_outputs) run_log_file = persist_run_log( run_id=runtime.run_id, request=request, 
component_runs=runtime.component_runs, status="completed", - output_file=output_file, + output_file=output_summary["output_file"], ) response = WorkflowExecuteResponse( @@ -154,12 +155,14 @@ def execute_workflow( workflow_id=workflow.workflow_id, workflow_version=workflow.version, dataset_id=request.dataset_id, - bbox=bbox, - feature_count=len(features["features"]), - value_count=len(data_value_set["dataValues"]), - output_file=output_file, + outputs=exported_outputs, + primary_output_name=next(iter(workflow.outputs), None), + bbox=output_summary["bbox"], + feature_count=output_summary["feature_count"], + value_count=output_summary["value_count"], + output_file=output_summary["output_file"], run_log_file=run_log_file, - data_value_set=data_value_set, + data_value_set=output_summary["data_value_set"], component_runs=runtime.component_runs if include_component_run_details else [], component_run_details_included=include_component_run_details, component_run_details_available=True, @@ -175,9 +178,11 @@ def execute_workflow( request=request, publication=workflow.publication, context=context, + exported_outputs=exported_outputs, ) register_workflow_output_publication( response=response, + kind=workflow.publication.intent, exposure=workflow.publication.exposure, published_path=publication_path, asset_format=publication_asset_format, @@ -288,11 +293,27 @@ def _build_publication_artifact( request: WorkflowExecuteRequest, publication: WorkflowPublicationPolicy, context: WorkflowExecutionContext, + exported_outputs: dict[str, Any], ) -> tuple[str, str]: """Build the publication-facing artifact for a publishable workflow output.""" + if publication.asset is not None: + asset_value = context.get_step_output( + step_id=publication.asset.from_step, + output_name=publication.asset.output, + ) + return _materialize_publication_asset( + asset_value=asset_value, + dataset_id=response.dataset_id, + publication=publication, + ) + if publication.intent.value == "feature_collection": - 
features = context.require_output("features") - records = context.require_output("records") + features_ref = publication.inputs.get("features") + records_ref = publication.inputs.get("records") + if features_ref is None or records_ref is None: + raise ValueError("Feature collection publication requires declared publication inputs: features, records") + features = context.get_step_output(step_id=features_ref.from_step, output_name=features_ref.output) + records = context.get_step_output(step_id=records_ref.from_step, output_name=records_ref.output) path = build_feature_collection_asset( dataset_id=response.dataset_id, features=features, @@ -301,7 +322,94 @@ def _build_publication_artifact( feature_id_property=request.feature_source.feature_id_property, ) return path, "geojson" - return response.output_file, "datavalueset-json" + output_file_ref = publication.inputs.get("output_file") + if output_file_ref is not None: + path_value = context.get_step_output(step_id=output_file_ref.from_step, output_name=output_file_ref.output) + if not isinstance(path_value, str): + raise ValueError("Publication input 'output_file' must resolve to a filesystem path string") + return path_value, Path(path_value).suffix.lstrip(".") or "file" + if response.output_file is not None: + return response.output_file, _asset_format_for_path(response.output_file) + primary_output_name = response.primary_output_name + if primary_output_name is not None: + primary_output = exported_outputs.get(primary_output_name) + if isinstance(primary_output, str): + return primary_output, _asset_format_for_path(primary_output) + raise ValueError("Workflow publication could not resolve a publication artifact") + + +def _materialize_publication_asset( + *, + asset_value: Any, + dataset_id: str, + publication: WorkflowPublicationPolicy, +) -> tuple[str, str]: + """Resolve a declared publication asset to a persisted asset path and format.""" + if isinstance(asset_value, str): + return asset_value, 
publication.asset_format or _asset_format_for_path(asset_value) + if publication.intent.value == "feature_collection" and isinstance(asset_value, dict): + if asset_value.get("type") == "FeatureCollection": + return write_feature_collection_asset(collection=asset_value, dataset_id=dataset_id), "geojson" + if isinstance(asset_value, (dict, list)): + asset_format = publication.asset_format or "json" + return write_json_asset(payload=asset_value, dataset_id=dataset_id, suffix=asset_format), asset_format + raise ValueError("Declared publication asset must resolve to a file path or JSON-serializable value") + + +def _resolve_workflow_outputs( + bindings: dict[str, WorkflowOutputBinding], + context: WorkflowExecutionContext, +) -> dict[str, Any]: + """Resolve exported workflow outputs from step-scoped execution context.""" + resolved: dict[str, Any] = {} + for name, binding in bindings.items(): + if not binding.include_in_response: + continue + resolved[name] = context.get_step_output(step_id=binding.from_step, output_name=binding.output) + return resolved + + +def _summarize_workflow_outputs(outputs: dict[str, Any]) -> dict[str, Any]: + """Derive compatibility summary fields from declared workflow outputs.""" + features = outputs.get("features") + bbox = outputs.get("bbox") + records = outputs.get("records") + data_value_set = outputs.get("data_value_set") + output_file = outputs.get("output_file") + + if not isinstance(bbox, list): + bbox = None + if not isinstance(output_file, str): + output_file = None + + feature_count: int | None = None + if isinstance(features, dict): + feature_items = features.get("features") + if isinstance(feature_items, list): + feature_count = len(feature_items) + + value_count: int | None = None + if isinstance(data_value_set, dict): + data_values = data_value_set.get("dataValues") + if isinstance(data_values, list): + value_count = len(data_values) + elif isinstance(records, list): + value_count = len(records) + + return { + "bbox": bbox, + 
"feature_count": feature_count, + "value_count": value_count, + "output_file": output_file, + "data_value_set": data_value_set if isinstance(data_value_set, dict) else None, + } + + +def _asset_format_for_path(path_value: str) -> str: + suffix = Path(path_value).suffix.lower() + if suffix.startswith("."): + suffix = suffix[1:] + return suffix or "file" def _is_upstream_connectivity_error(exc: Exception) -> bool: @@ -328,14 +436,6 @@ def _execute_workflow_steps( context: WorkflowExecutionContext, ) -> None: """Execute workflow components using declarative YAML step order.""" - executors: dict[str, StepExecutor] = { - "feature_source": _run_feature_source, - "download_dataset": _run_download_dataset, - "temporal_aggregation": _run_temporal_aggregation, - "spatial_aggregation": _run_spatial_aggregation, - "build_datavalueset": _run_build_datavalueset, - } - for step in workflow.steps: if step.id is None: raise WorkflowComponentError( @@ -345,18 +445,18 @@ def _execute_workflow_steps( component_version=step.version, status_code=422, ) - executor = executors.get(step.component) - if executor is None: + runtime_definition = component_services.workflow_runtime_registry().get(f"{step.component}@{step.version}") + if runtime_definition is None: raise WorkflowComponentError( error_code="INPUT_VALIDATION_FAILED", - message=f"Unsupported workflow component '{step.component}'", + message=f"Unsupported workflow component '{step.component}@{step.version}'", component=step.component, component_version=step.version, status_code=422, ) try: step_config = _resolve_step_config(step.config, request_params or {}) - _validate_step_config(step.component, step.version, step_config) + component_services.validate_component_runtime_config(step.component, step.version, step_config) except ValueError as exc: raise WorkflowComponentError( error_code="CONFIG_VALIDATION_FAILED", @@ -368,7 +468,7 @@ def _execute_workflow_steps( try: resolved_inputs = _resolve_step_inputs(step=step, context=context) 
- updates = executor( + updates = runtime_definition.executor( step=step, runtime=runtime, request=request, @@ -408,7 +508,7 @@ def validate_workflow_steps( for index, step in enumerate(workflow.steps): try: resolved_config = _resolve_step_config(step.config, params) - _validate_step_config(step.component, step.version, resolved_config) + component_services.validate_component_runtime_config(step.component, step.version, resolved_config) except ValueError as exc: raise ValueError(f"Step {index + 1} ({step.component}@{step.version}) validation failed: {exc}") from exc resolved_steps.append( @@ -427,220 +527,6 @@ def validate_workflow_steps( return resolved_steps -type StepExecutor = Callable[..., dict[str, Any]] - - -def _run_feature_source( - *, - step: WorkflowStep, - runtime: WorkflowRuntime, - request: WorkflowExecuteRequest, - dataset: dict[str, Any], - resolved_inputs: dict[str, Any], - step_config: dict[str, Any], -) -> dict[str, Any]: - del dataset, resolved_inputs, step - execution_mode = str(step_config.get("execution_mode", "local")).lower() - if execution_mode == "remote": - features, bbox = runtime.run( - "feature_source", - _invoke_remote_feature_source_component, - remote_url=str(step_config["remote_url"]), - feature_source=request.feature_source.model_dump(mode="json"), - timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), - retries=int(step_config.get("remote_retries", 1)), - retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), - ) - else: - features, bbox = runtime.run( - "feature_source", - component_services.feature_source_component, - config=request.feature_source, - ) - return {"features": features, "bbox": bbox} - - -def _run_download_dataset( - *, - step: WorkflowStep, - runtime: WorkflowRuntime, - request: WorkflowExecuteRequest, - dataset: dict[str, Any], - resolved_inputs: dict[str, Any], - step_config: dict[str, Any], -) -> dict[str, Any]: - execution_mode = str(step_config.get("execution_mode", 
"local")).lower() - if execution_mode not in {"local", "remote"}: - raise ValueError("download_dataset.execution_mode must be 'local' or 'remote'") - - overwrite = request.overwrite - country_code = request.country_code - bbox = resolved_inputs["bbox"] - if execution_mode == "remote": - remote_url = step_config.get("remote_url") - if not isinstance(remote_url, str) or not remote_url: - raise ValueError("download_dataset remote mode requires non-empty 'remote_url'") - remote_timeout = float(step_config.get("remote_timeout_sec", 30.0)) - remote_retries = int(step_config.get("remote_retries", 1)) - remote_retry_delay_sec = float(step_config.get("remote_retry_delay_sec", 1.0)) - runtime.run( - "download_dataset", - _invoke_remote_download_component, - remote_url=remote_url, - dataset_id=request.dataset_id, - start=request.start, - end=request.end, - overwrite=overwrite, - country_code=country_code, - bbox=bbox, - timeout_sec=remote_timeout, - retries=remote_retries, - retry_delay_sec=remote_retry_delay_sec, - ) - else: - runtime.run( - "download_dataset", - component_services.download_dataset_component, - dataset=dataset, - start=request.start, - end=request.end, - overwrite=overwrite, - country_code=country_code, - bbox=bbox, - ) - return {"status": "downloaded"} - - -def _run_temporal_aggregation( - *, - step: WorkflowStep, - runtime: WorkflowRuntime, - request: WorkflowExecuteRequest, - dataset: dict[str, Any], - resolved_inputs: dict[str, Any], - step_config: dict[str, Any], -) -> dict[str, Any]: - del step - target_period_type = request.temporal_aggregation.target_period_type - method = request.temporal_aggregation.method - execution_mode = str(step_config.get("execution_mode", "local")).lower() - if execution_mode == "remote": - temporal_ds = runtime.run( - "temporal_aggregation", - _invoke_remote_temporal_aggregation_component, - remote_url=str(step_config["remote_url"]), - dataset_id=request.dataset_id, - start=request.start, - end=request.end, - 
bbox=resolved_inputs["bbox"], - target_period_type=target_period_type.value, - method=method.value, - timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), - retries=int(step_config.get("remote_retries", 1)), - retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), - ) - else: - temporal_ds = runtime.run( - "temporal_aggregation", - component_services.temporal_aggregation_component, - dataset=dataset, - start=request.start, - end=request.end, - bbox=resolved_inputs["bbox"], - target_period_type=target_period_type, - method=method, - ) - return {"temporal_dataset": temporal_ds} - - -def _run_spatial_aggregation( - *, - step: WorkflowStep, - runtime: WorkflowRuntime, - request: WorkflowExecuteRequest, - dataset: dict[str, Any], - resolved_inputs: dict[str, Any], - step_config: dict[str, Any], -) -> dict[str, Any]: - del step - method = request.spatial_aggregation.method - feature_id_property = request.dhis2.org_unit_property - execution_mode = str(step_config.get("execution_mode", "local")).lower() - temporal_dataset = resolved_inputs.get("temporal_dataset") - if execution_mode == "remote": - if temporal_dataset is not None: - raise ValueError( - "remote spatial_aggregation does not yet support workflow temporal_aggregation output; " - "use local spatial_aggregation for temporally aggregated workflows" - ) - records = runtime.run( - "spatial_aggregation", - _invoke_remote_spatial_aggregation_component, - remote_url=str(step_config["remote_url"]), - dataset_id=request.dataset_id, - start=request.start, - end=request.end, - bbox=resolved_inputs["bbox"], - feature_source=request.feature_source.model_dump(mode="json"), - method=method.value, - feature_id_property=feature_id_property, - timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), - retries=int(step_config.get("remote_retries", 1)), - retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), - ) - else: - records = runtime.run( - "spatial_aggregation", - 
component_services.spatial_aggregation_component, - dataset=dataset, - start=request.start, - end=request.end, - bbox=resolved_inputs["bbox"], - features=resolved_inputs["features"], - method=method, - feature_id_property=feature_id_property, - aggregated_dataset=temporal_dataset, - ) - return {"records": records} - - -def _run_build_datavalueset( - *, - step: WorkflowStep, - runtime: WorkflowRuntime, - request: WorkflowExecuteRequest, - dataset: dict[str, Any], - resolved_inputs: dict[str, Any], - step_config: dict[str, Any], -) -> dict[str, Any]: - del dataset, step - period_type = request.temporal_aggregation.target_period_type - execution_mode = str(step_config.get("execution_mode", "local")).lower() - if execution_mode == "remote": - data_value_set, output_file = runtime.run( - "build_datavalueset", - _invoke_remote_build_datavalueset_component, - remote_url=str(step_config["remote_url"]), - dataset_id=request.dataset_id, - period_type=period_type.value, - records=resolved_inputs["records"], - dhis2=request.dhis2.model_dump(mode="json"), - timeout_sec=float(step_config.get("remote_timeout_sec", 30.0)), - retries=int(step_config.get("remote_retries", 1)), - retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), - ) - else: - data_value_set, output_file = runtime.run( - "build_datavalueset", - component_services.build_datavalueset_component, - records=resolved_inputs["records"], - dataset_id=request.dataset_id, - period_type=period_type, - dhis2=request.dhis2, - ) - return {"data_value_set": data_value_set, "output_file": output_file} - - def _resolve_step_inputs(step: WorkflowStep, context: WorkflowExecutionContext) -> dict[str, Any]: """Resolve one step's declared upstream references into concrete values.""" resolved: dict[str, Any] = {} @@ -683,298 +569,3 @@ def _resolve_value(value: Any, request_params: dict[str, Any]) -> Any: if isinstance(value, list): return [_resolve_value(v, request_params) for v in value] return value - - -class 
_FeatureSourceStepConfig(BaseModel): - model_config = ConfigDict(extra="forbid") - - execution_mode: str = "local" - remote_url: str | None = None - remote_timeout_sec: float = 30.0 - remote_retries: int = 1 - remote_retry_delay_sec: float = 1.0 - - -class _DownloadDatasetStepConfig(BaseModel): - model_config = ConfigDict(extra="forbid") - - execution_mode: str = "local" - remote_url: str | None = None - remote_timeout_sec: float = 30.0 - remote_retries: int = 1 - remote_retry_delay_sec: float = 1.0 - - -class _TemporalAggregationStepConfig(BaseModel): - model_config = ConfigDict(extra="forbid") - - execution_mode: str = "local" - remote_url: str | None = None - remote_timeout_sec: float = 30.0 - remote_retries: int = 1 - remote_retry_delay_sec: float = 1.0 - - -class _SpatialAggregationStepConfig(BaseModel): - model_config = ConfigDict(extra="forbid") - - execution_mode: str = "local" - remote_url: str | None = None - remote_timeout_sec: float = 30.0 - remote_retries: int = 1 - remote_retry_delay_sec: float = 1.0 - - -class _BuildDataValueSetStepConfig(BaseModel): - model_config = ConfigDict(extra="forbid") - - execution_mode: str = "local" - remote_url: str | None = None - remote_timeout_sec: float = 30.0 - remote_retries: int = 1 - remote_retry_delay_sec: float = 1.0 - - -_STEP_CONFIG_MODELS: dict[str, type[BaseModel]] = { - "feature_source": _FeatureSourceStepConfig, - "download_dataset": _DownloadDatasetStepConfig, - "temporal_aggregation": _TemporalAggregationStepConfig, - "spatial_aggregation": _SpatialAggregationStepConfig, - "build_datavalueset": _BuildDataValueSetStepConfig, -} - - -def _validate_step_config(component: str, version: str, config: dict[str, Any]) -> None: - """Validate step config with strict Pydantic models.""" - if version != "v1": - raise ValueError(f"Unsupported component version for config validation: {component}@{version}") - model = _STEP_CONFIG_MODELS.get(component) - if model is None: - raise ValueError(f"No config schema 
registered for component '{component}'") - try: - validated = model.model_validate(config) - except ValidationError as exc: - raise ValueError(f"Invalid config for component '{component}@{version}': {exc}") from exc - mode = str(getattr(validated, "execution_mode", "local")).lower() - if mode not in {"local", "remote"}: - raise ValueError( - f"Invalid config for component '{component}@{version}': execution_mode must be local or remote" - ) - remote_url = getattr(validated, "remote_url", None) - remote_timeout_sec = getattr(validated, "remote_timeout_sec", 30.0) - remote_retries = getattr(validated, "remote_retries", 1) - remote_retry_delay_sec = getattr(validated, "remote_retry_delay_sec", 1.0) - - has_remote_config = bool( - (isinstance(remote_url, str) and remote_url.strip()) - or float(remote_timeout_sec) != 30.0 - or int(remote_retries) != 1 - or float(remote_retry_delay_sec) != 1.0 - ) - - if mode == "local" and has_remote_config: - raise ValueError( - f"Invalid config for component '{component}@{version}': " - "remote_url/remote_timeout_sec/remote_retries/remote_retry_delay_sec are only allowed in remote mode" - ) - if mode == "remote": - if not isinstance(remote_url, str) or not remote_url.strip(): - raise ValueError( - f"Invalid config for component '{component}@{version}': remote_url is required for remote mode" - ) - - -def _invoke_remote_download_component( - *, - remote_url: str, - dataset_id: str, - start: str, - end: str, - overwrite: bool, - country_code: str | None, - bbox: list[float], - timeout_sec: float, - retries: int, - retry_delay_sec: float, -) -> None: - """Invoke remote download component endpoint with retry/timeout.""" - payload = { - "dataset_id": dataset_id, - "start": start, - "end": end, - "overwrite": overwrite, - "country_code": country_code, - "bbox": bbox, - } - attempts = max(1, retries) - last_exc: Exception | None = None - for attempt in range(1, attempts + 1): - try: - with httpx.Client(timeout=timeout_sec) as client: - 
response = client.post(remote_url, json=payload) - response.raise_for_status() - return - except Exception as exc: - last_exc = exc - if attempt < attempts: - time.sleep(max(0.0, retry_delay_sec)) - if last_exc is None: - raise RuntimeError("Remote download invocation failed without exception context") - raise last_exc - - -def _invoke_remote_feature_source_component( - *, - remote_url: str, - feature_source: dict[str, Any], - timeout_sec: float, - retries: int, - retry_delay_sec: float, -) -> tuple[dict[str, Any], list[float]]: - """Invoke remote feature-source component endpoint.""" - payload = { - "feature_source": feature_source, - "include_features": True, - } - result = _post_remote_json( - remote_url=remote_url, - payload=payload, - timeout_sec=timeout_sec, - retries=retries, - retry_delay_sec=retry_delay_sec, - ) - features = result.get("features") - bbox = result.get("bbox") - if not isinstance(features, dict) or not isinstance(bbox, list): - raise RuntimeError("Remote feature_source response missing features/bbox") - return features, [float(x) for x in bbox] - - -def _invoke_remote_temporal_aggregation_component( - *, - remote_url: str, - dataset_id: str, - start: str, - end: str, - bbox: list[float], - target_period_type: str, - method: str, - timeout_sec: float, - retries: int, - retry_delay_sec: float, -) -> dict[str, Any]: - """Invoke remote temporal-aggregation component endpoint.""" - payload = { - "dataset_id": dataset_id, - "start": start, - "end": end, - "bbox": bbox, - "target_period_type": target_period_type, - "method": method, - } - return _post_remote_json( - remote_url=remote_url, - payload=payload, - timeout_sec=timeout_sec, - retries=retries, - retry_delay_sec=retry_delay_sec, - ) - - -def _invoke_remote_spatial_aggregation_component( - *, - remote_url: str, - dataset_id: str, - start: str, - end: str, - bbox: list[float], - feature_source: dict[str, Any], - method: str, - feature_id_property: str, - timeout_sec: float, - retries: int, - 
retry_delay_sec: float, -) -> list[dict[str, Any]]: - """Invoke remote spatial-aggregation component endpoint.""" - payload = { - "dataset_id": dataset_id, - "start": start, - "end": end, - "feature_source": feature_source, - "method": method, - "bbox": bbox, - "feature_id_property": feature_id_property, - "include_records": True, - } - result = _post_remote_json( - remote_url=remote_url, - payload=payload, - timeout_sec=timeout_sec, - retries=retries, - retry_delay_sec=retry_delay_sec, - ) - records = result.get("records") - if not isinstance(records, list): - raise RuntimeError("Remote spatial_aggregation response missing records") - return records - - -def _invoke_remote_build_datavalueset_component( - *, - remote_url: str, - dataset_id: str, - period_type: str, - records: list[dict[str, Any]], - dhis2: dict[str, Any], - timeout_sec: float, - retries: int, - retry_delay_sec: float, -) -> tuple[dict[str, Any], str]: - """Invoke remote build-datavalue-set component endpoint.""" - payload = { - "dataset_id": dataset_id, - "period_type": period_type, - "records": records, - "dhis2": dhis2, - } - result = _post_remote_json( - remote_url=remote_url, - payload=payload, - timeout_sec=timeout_sec, - retries=retries, - retry_delay_sec=retry_delay_sec, - ) - data_value_set = result.get("data_value_set") - output_file = result.get("output_file") - if not isinstance(data_value_set, dict) or not isinstance(output_file, str): - raise RuntimeError("Remote build_datavalueset response missing data_value_set/output_file") - return data_value_set, output_file - - -def _post_remote_json( - *, - remote_url: str, - payload: dict[str, Any], - timeout_sec: float, - retries: int, - retry_delay_sec: float, -) -> dict[str, Any]: - """POST JSON to remote component endpoint with retry and return JSON body.""" - attempts = max(1, retries) - last_exc: Exception | None = None - for attempt in range(1, attempts + 1): - try: - with httpx.Client(timeout=timeout_sec) as client: - response = 
client.post(remote_url, json=payload) - response.raise_for_status() - body = response.json() - if not isinstance(body, dict): - raise RuntimeError("Remote component returned non-object JSON response") - return body - except Exception as exc: - last_exc = exc - if attempt < attempts: - time.sleep(max(0.0, retry_delay_sec)) - if last_exc is None: - raise RuntimeError("Remote component invocation failed without exception context") - raise last_exc diff --git a/src/eo_api/workflows/services/publication_assets.py b/src/eo_api/workflows/services/publication_assets.py index a5022e2..7255404 100644 --- a/src/eo_api/workflows/services/publication_assets.py +++ b/src/eo_api/workflows/services/publication_assets.py @@ -45,14 +45,25 @@ def build_feature_collection_asset( ) collection = {"type": "FeatureCollection", "features": output_features} - return _write_feature_collection(collection=collection, dataset_id=dataset_id) + return write_feature_collection_asset(collection=collection, dataset_id=dataset_id) -def _write_feature_collection(*, collection: dict[str, Any], dataset_id: str) -> str: +def write_feature_collection_asset(*, collection: dict[str, Any], dataset_id: str) -> str: + """Persist a ready-made GeoJSON FeatureCollection as a publication asset.""" + return _write_json_asset(payload=collection, dataset_id=dataset_id, suffix="geojson") + + +def write_json_asset(*, payload: Any, dataset_id: str, suffix: str = "json") -> str: + """Persist a JSON-serializable publication payload to disk.""" + normalized_suffix = suffix.lstrip(".") or "json" + return _write_json_asset(payload=payload, dataset_id=dataset_id, suffix=normalized_suffix) + + +def _write_json_asset(*, payload: Any, dataset_id: str, suffix: str) -> str: DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True) now = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ") - path = DOWNLOAD_DIR / f"{dataset_id}_feature_collection_{now}.geojson" - path.write_text(json.dumps(collection, indent=2), encoding="utf-8") + 
path = DOWNLOAD_DIR / f"{dataset_id}_publication_{now}.{suffix}" + path.write_text(json.dumps(payload, indent=2), encoding="utf-8") return str(path) diff --git a/tests/test_data_accessor.py b/tests/test_data_accessor.py new file mode 100644 index 0000000..cb58199 --- /dev/null +++ b/tests/test_data_accessor.py @@ -0,0 +1,166 @@ +from __future__ import annotations + +import numpy as np +import xarray as xr +from fastapi.testclient import TestClient + +from eo_api.data_accessor.services.accessor import ( + get_coverage_summary, + get_point_values, + get_preview_summary, +) +from eo_api.main import app + + +def test_get_point_values_returns_time_series(monkeypatch) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), np.array([[[1.0, 2.0]], [[3.0, 4.0]]]))}, + coords={ + "time": np.array(["2024-01-01", "2024-02-01"], dtype="datetime64[ns]"), + "lat": [8.0], + "lon": [1.0, 2.0], + }, + ) + monkeypatch.setattr("eo_api.data_accessor.services.accessor.get_data", lambda *args, **kwargs: ds) + + result = get_point_values( + {"id": "chirps3_precipitation_daily", "variable": "precip", "period_type": "monthly"}, + lon=1.9, + lat=8.0, + start="2024-01", + end="2024-02", + ) + + assert result["dataset_id"] == "chirps3_precipitation_daily" + assert result["variable"] == "precip" + assert result["value_count"] == 2 + assert result["resolved_point"] == {"lon": 2.0, "lat": 8.0} + assert result["values"] == [{"period": "2024-01", "value": 2.0}, {"period": "2024-02", "value": 4.0}] + + +def test_point_query_outside_coverage_returns_typed_error(monkeypatch) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), np.array([[[1.0, 2.0]], [[3.0, 4.0]]]))}, + coords={ + "time": np.array(["2024-01-01", "2024-02-01"], dtype="datetime64[ns]"), + "lat": [8.0], + "lon": [1.0, 2.0], + }, + ) + monkeypatch.setattr( + "eo_api.data_registry.services.datasets.get_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip", "period_type": "monthly"}, + ) + 
monkeypatch.setattr("eo_api.data_accessor.services.accessor.get_data", lambda *args, **kwargs: ds) + + client = TestClient(app) + response = client.get( + "/retrieve/chirps3_precipitation_daily/point", + params={"lon": 99.0, "lat": 99.0, "start": "2024-01", "end": "2024-02"}, + ) + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "point_query_invalid" + assert body["error_code"] == "POINT_QUERY_INVALID" + assert body["resource_id"] == "chirps3_precipitation_daily" + + +def test_get_preview_summary_returns_stats_and_sample(monkeypatch) -> None: + ds = xr.Dataset( + {"precip": (("time", "lat", "lon"), np.array([[[1.0, 2.0]], [[3.0, 4.0]]]))}, + coords={ + "time": np.array(["2024-01-01", "2024-02-01"], dtype="datetime64[ns]"), + "lat": [8.0], + "lon": [1.0, 2.0], + }, + ) + monkeypatch.setattr("eo_api.data_accessor.services.accessor.get_data", lambda *args, **kwargs: ds) + + result = get_preview_summary( + {"id": "chirps3_precipitation_daily", "variable": "precip", "period_type": "monthly"}, + start="2024-01", + end="2024-02", + bbox=[1.0, 8.0, 2.0, 8.0], + max_cells=3, + ) + + assert result["dataset_id"] == "chirps3_precipitation_daily" + assert result["stats"] == {"min": 1.0, "max": 4.0, "mean": 2.5, "value_count": 4} + assert result["dims"] == {"time": 2, "lat": 1, "lon": 2} + assert len(result["sample"]) == 3 + assert result["sample"][0]["period"] == "2024-01" + + +def test_preview_endpoint_requires_complete_bbox(monkeypatch) -> None: + monkeypatch.setattr( + "eo_api.data_registry.services.datasets.get_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip", "period_type": "monthly"}, + ) + + client = TestClient(app) + response = client.get( + "/retrieve/chirps3_precipitation_daily/preview", + params={"start": "2024-01", "end": "2024-02", "xmin": 1.0, "ymin": 8.0}, + ) + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "preview_invalid" + assert 
body["error_code"] == "PREVIEW_INVALID" + assert body["resource_id"] == "chirps3_precipitation_daily" + + +def test_get_coverage_summary_wraps_preview_and_full_coverage(monkeypatch) -> None: + monkeypatch.setattr( + "eo_api.data_accessor.services.accessor.get_preview_summary", + lambda *args, **kwargs: { + "dataset_id": "chirps3_precipitation_daily", + "variable": "precip", + "requested": {"start": "2024-01", "end": "2024-02", "bbox": [1.0, 8.0, 2.0, 8.0]}, + "dims": {"time": 2, "lat": 1, "lon": 2}, + "stats": {"min": 1.0, "max": 4.0, "mean": 2.5, "value_count": 4}, + "sample": [{"period": "2024-01", "lat": 8.0, "lon": 1.0, "value": 1.0}], + }, + ) + monkeypatch.setattr( + "eo_api.data_accessor.services.accessor.get_data_coverage", + lambda dataset: { + "coverage": { + "temporal": {"start": "2024-01", "end": "2024-12"}, + "spatial": {"xmin": 1.0, "ymin": 8.0, "xmax": 2.0, "ymax": 9.0}, + } + }, + ) + + result = get_coverage_summary( + {"id": "chirps3_precipitation_daily", "variable": "precip", "period_type": "monthly"}, + start="2024-01", + end="2024-02", + bbox=[1.0, 8.0, 2.0, 8.0], + max_cells=3, + ) + + assert result["coverage"]["temporal"] == {"start": "2024-01", "end": "2024-12"} + assert result["coverage"]["spatial"] == {"xmin": 1.0, "ymin": 8.0, "xmax": 2.0, "ymax": 9.0} + assert result["subset"]["stats"]["mean"] == 2.5 + assert result["subset"]["sample"][0]["value"] == 1.0 + + +def test_coverage_endpoint_requires_complete_bbox(monkeypatch) -> None: + monkeypatch.setattr( + "eo_api.data_registry.services.datasets.get_dataset", + lambda dataset_id: {"id": dataset_id, "variable": "precip", "period_type": "monthly"}, + ) + + client = TestClient(app) + response = client.get( + "/retrieve/chirps3_precipitation_daily/coverage", + params={"start": "2024-01", "end": "2024-02", "xmin": 1.0, "ymin": 8.0}, + ) + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "coverage_invalid" + assert body["error_code"] == 
"COVERAGE_INVALID" + assert body["resource_id"] == "chirps3_precipitation_daily" diff --git a/tests/test_raster_routes.py b/tests/test_raster_routes.py new file mode 100644 index 0000000..67a1c55 --- /dev/null +++ b/tests/test_raster_routes.py @@ -0,0 +1,180 @@ +from __future__ import annotations + +import numpy as np +import xarray as xr +from fastapi.testclient import TestClient + +from eo_api.main import app +from eo_api.raster import routes as raster_routes + + +def test_raster_capabilities_report_missing_zarr_archive(monkeypatch) -> None: + client = TestClient(app) + with monkeypatch.context() as patcher: + patcher.setattr(raster_routes, "get_zarr_path", lambda dataset: None) + response = client.get("/raster/chirps3_precipitation_daily/capabilities") + + assert response.status_code == 200 + body = response.json() + assert body["collection_id"] == "chirps3_precipitation_daily" + assert body["kind"] == "coverage" + assert body["titiler"]["eligible"] is False + assert body["titiler"]["reader"] == "xarray" + assert "build_zarr" in body["titiler"]["reason"] + + +def test_raster_variables_route_rejects_resource_without_zarr_archive(monkeypatch) -> None: + client = TestClient(app) + with monkeypatch.context() as patcher: + patcher.setattr(raster_routes, "get_zarr_path", lambda dataset: None) + response = client.get("/raster/chirps3_precipitation_daily/variables") + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "raster_publication_unsupported" + assert body["error_code"] == "RASTER_PUBLICATION_UNSUPPORTED" + + +def test_raster_variables_route_uses_zarr_backed_xarray_reader(tmp_path, monkeypatch) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(4, dtype=float).reshape(1, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, 
mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get("/raster/chirps3_precipitation_daily/variables") + + assert response.status_code == 200 + assert response.json() == ["precip"] + + +def test_raster_preview_requires_datetime_for_temporal_dataset(tmp_path, monkeypatch) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get("/raster/chirps3_precipitation_daily/preview.png?variable=precip") + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "raster_datetime_required" + assert body["error_code"] == "RASTER_DATETIME_REQUIRED" + + +def test_raster_preview_with_datetime_renders_single_time_slice(tmp_path, monkeypatch) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get( + "/raster/chirps3_precipitation_daily/preview.png?variable=precip&datetime=2024-01-01" + ) + + assert response.status_code == 200 + assert response.headers["content-type"] == "image/png" + assert response.content + + +def test_raster_preview_with_aggregation_renders_time_reduced_image(tmp_path, monkeypatch) -> 
None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get( + "/raster/chirps3_precipitation_daily/preview.png" + "?variable=precip&aggregation=sum&start=2024-01-01&end=2024-01-02" + ) + + assert response.status_code == 200 + assert response.headers["content-type"] == "image/png" + assert response.content + + +def test_raster_preview_rejects_aggregation_without_range(tmp_path, monkeypatch) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + ).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get("/raster/chirps3_precipitation_daily/preview.png?variable=precip&aggregation=sum") + + assert response.status_code == 422 + body = response.json()["detail"] + assert body["error"] == "raster_temporal_query_invalid" + assert body["error_code"] == "RASTER_TEMPORAL_QUERY_INVALID" + + +def test_raster_tile_outside_bounds_returns_404(tmp_path, monkeypatch) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "lat", "lon"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "lat": [9.5, 10.5], + "lon": [39.5, 40.5], + }, + 
).to_zarr(zarr_path, mode="w") + + monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) + + client = TestClient(app) + response = client.get( + "/raster/chirps3_precipitation_daily/tiles/WebMercatorQuad/6/30/31.png" + "?variable=precip&aggregation=sum&start=2024-01-01&end=2024-01-02" + ) + + assert response.status_code == 404 diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 5df9d2e..ccab313 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import os from pathlib import Path from typing import Any, cast @@ -45,6 +46,34 @@ def _valid_public_payload() -> dict[str, Any]: } +def _standard_workflow_outputs( + *, + feature_step: str = "feature_source", + spatial_step: str = "spatial_aggregation", + build_step: str = "build_datavalueset", +) -> dict[str, dict[str, str]]: + return { + "bbox": {"from_step": feature_step, "output": "bbox"}, + "features": {"from_step": feature_step, "output": "features"}, + "records": {"from_step": spatial_step, "output": "records"}, + "data_value_set": {"from_step": build_step, "output": "data_value_set"}, + "output_file": {"from_step": build_step, "output": "output_file"}, + } + + +def _standard_publication_inputs( + *, + feature_step: str = "feature_source", + spatial_step: str = "spatial_aggregation", + build_step: str = "build_datavalueset", +) -> dict[str, dict[str, str]]: + return { + "features": {"from_step": feature_step, "output": "features"}, + "records": {"from_step": spatial_step, "output": "records"}, + "output_file": {"from_step": build_step, "output": "output_file"}, + } + + def _patch_successful_execution(monkeypatch: pytest.MonkeyPatch) -> None: ds = xr.Dataset( {"precip": (("time", "lat", "lon"), [[[1.0]]])}, @@ -235,6 +264,13 @@ def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClie assert default["publication_publishable"] is True assert default["publication_intent"] == 
"feature_collection" assert default["publication_exposure"] == "ogc" + assert default["publication_asset_format"] is None + assert default["publication_asset_binding"] is None + assert default["publication_inputs"]["features"]["from_step"] == "get_features" + assert default["serving_supported"] is True + assert default["serving_asset_format"] == "geojson" + assert default["serving_targets"] == ["pygeoapi", "analytics"] + assert default["serving_error"] is None assert default["step_count"] == 5 assert default["components"] == [ "feature_source", @@ -249,6 +285,13 @@ def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClie assert fast["publication_publishable"] is False assert fast["publication_intent"] is None assert fast["publication_exposure"] is None + assert fast["publication_asset_format"] is None + assert fast["publication_asset_binding"] is None + assert fast["publication_inputs"] == {} + assert fast["serving_supported"] is True + assert fast["serving_asset_format"] == "geojson" + assert fast["serving_targets"] == ["registry"] + assert fast["serving_error"] is None assert fast["step_count"] == 4 assert fast["components"] == [ "feature_source", @@ -258,6 +301,134 @@ def test_workflow_catalog_endpoint_returns_allowlisted_workflow(client: TestClie ] +def test_workflow_definition_allows_non_datavalueset_terminal_step_when_outputs_declared() -> None: + definition = WorkflowDefinition.model_validate( + { + "workflow_id": "generic_records_v1", + "version": 1, + "steps": [ + {"id": "get_features", "component": "feature_source", "version": "v1"}, + { + "id": "spatial_agg", + "component": "spatial_aggregation", + "version": "v1", + "inputs": { + "bbox": {"from_step": "get_features", "output": "bbox"}, + "features": {"from_step": "get_features", "output": "features"}, + }, + }, + ], + "outputs": { + "features": {"from_step": "get_features", "output": "features"}, + "records": {"from_step": "spatial_agg", "output": "records"}, + }, + } + ) + + 
assert [step.component for step in definition.steps] == ["feature_source", "spatial_aggregation"] + assert set(definition.outputs) == {"features", "records"} + + +def test_workflow_definition_requires_explicit_outputs() -> None: + with pytest.raises(ValueError, match="declare at least one exported output"): + WorkflowDefinition.model_validate( + { + "workflow_id": "missing_outputs_v1", + "version": 1, + "steps": [ + {"component": "feature_source", "version": "v1"}, + {"component": "download_dataset", "version": "v1"}, + {"component": "temporal_aggregation", "version": "v1"}, + {"component": "spatial_aggregation", "version": "v1"}, + {"component": "build_datavalueset", "version": "v1"}, + ], + } + ) + + +def test_publishable_workflow_can_declare_publication_asset_without_builder_inputs() -> None: + definition = WorkflowDefinition.model_validate( + { + "workflow_id": "coverage_publish_v1", + "version": 1, + "publication": { + "publishable": True, + "intent": "coverage", + "asset": {"from_step": "build", "output": "output_file"}, + "asset_format": "zarr", + }, + "steps": [ + {"id": "feature_source", "component": "feature_source", "version": "v1"}, + {"id": "download_dataset", "component": "download_dataset", "version": "v1"}, + { + "id": "spatial_aggregation", + "component": "spatial_aggregation", + "version": "v1", + "inputs": { + "bbox": {"from_step": "feature_source", "output": "bbox"}, + "features": {"from_step": "feature_source", "output": "features"}, + }, + }, + { + "id": "build", + "component": "build_datavalueset", + "version": "v1", + "inputs": {"records": {"from_step": "spatial_aggregation", "output": "records"}}, + }, + ], + "outputs": _standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build", + ), + } + ) + + assert definition.publication.asset is not None + assert definition.publication.asset.from_step == "build" + + +def test_publishable_workflow_rejects_unsupported_serving_contract() -> 
None: + with pytest.raises(ValueError, match="Unsupported publication serving contract"): + WorkflowDefinition.model_validate( + { + "workflow_id": "tileset_publish_v1", + "version": 1, + "publication": { + "publishable": True, + "intent": "tileset", + "exposure": "ogc", + "asset": {"from_step": "build", "output": "output_file"}, + "asset_format": "tiles", + }, + "steps": [ + {"id": "feature_source", "component": "feature_source", "version": "v1"}, + {"id": "download_dataset", "component": "download_dataset", "version": "v1"}, + { + "id": "spatial_aggregation", + "component": "spatial_aggregation", + "version": "v1", + "inputs": { + "bbox": {"from_step": "feature_source", "output": "bbox"}, + "features": {"from_step": "feature_source", "output": "features"}, + }, + }, + { + "id": "build", + "component": "build_datavalueset", + "version": "v1", + "inputs": {"records": {"from_step": "spatial_aggregation", "output": "records"}}, + }, + ], + "outputs": _standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build", + ), + } + ) + + def test_components_catalog_endpoint_returns_five_components(client: TestClient) -> None: response = client.get("/components") assert response.status_code == 200 @@ -362,7 +533,7 @@ def test_workflow_job_result_missing_uses_typed_error_envelope(client: TestClien def test_pygeoapi_collection_missing_returns_not_found(client: TestClient) -> None: - response = client.get("/ogcapi/collections/does-not-exist", params={"f": "json"}) + response = client.get("/pygeoapi/collections/does-not-exist", params={"f": "json"}) assert response.status_code == 404 @@ -481,6 +652,7 @@ def _execute_stub( {"component": "spatial_aggregation", "version": "v1", "config": {}}, {"component": "build_datavalueset", "version": "v1", "config": {}}, ], + "outputs": _standard_workflow_outputs(), }, "request": { "workflow_id": "adhoc_dhis2_v1", @@ -511,6 +683,10 @@ def 
test_inline_workflow_execute_endpoint_rejects_bad_component_chain(client: Te {"component": "download_dataset", "version": "v1", "config": {}}, {"component": "build_datavalueset", "version": "v1", "config": {}}, ], + "outputs": { + "data_value_set": {"from_step": "build_datavalueset", "output": "data_value_set"}, + "output_file": {"from_step": "build_datavalueset", "output": "output_file"}, + }, }, "request": { "workflow_id": "bad_adhoc_v1", @@ -538,6 +714,7 @@ def test_workflow_validate_endpoint_accepts_valid_inline_workflow(client: TestCl {"component": "spatial_aggregation", "version": "v1", "config": {}}, {"component": "build_datavalueset", "version": "v1", "config": {}}, ], + "outputs": _standard_workflow_outputs(), }, "request": { "workflow_id": "adhoc_validate_v1", @@ -553,6 +730,12 @@ def test_workflow_validate_endpoint_accepts_valid_inline_workflow(client: TestCl body = response.json() assert body["valid"] is True assert body["workflow_id"] == "adhoc_validate_v1" + assert body["publication_publishable"] is False + assert body["publication_intent"] is None + assert body["publication_inputs"] == {} + assert body["serving_supported"] is True + assert body["serving_asset_format"] == "geojson" + assert body["serving_targets"] == ["registry"] assert body["step_count"] == 4 assert len(body["resolved_steps"]) == 4 assert body["errors"] == [] @@ -571,6 +754,7 @@ def test_workflow_validate_endpoint_rejects_runtime_knobs_in_step_config(client: {"component": "spatial_aggregation", "version": "v1", "config": {}}, {"component": "build_datavalueset", "version": "v1", "config": {}}, ], + "outputs": _standard_workflow_outputs(), }, "request": { "workflow_id": "adhoc_invalid_config_v1", @@ -585,6 +769,8 @@ def test_workflow_validate_endpoint_rejects_runtime_knobs_in_step_config(client: assert response.status_code == 200 body = response.json() assert body["valid"] is False + assert body["publication_publishable"] is False + assert body["serving_supported"] is True assert 
body["resolved_steps"] == [] assert len(body["errors"]) == 1 assert "validation failed" in body["errors"][0].lower() @@ -596,6 +782,7 @@ def test_workflow_validate_endpoint_unknown_workflow_id(client: TestClient) -> N body = response.json() assert body["valid"] is False assert body["step_count"] == 0 + assert body["publication_publishable"] is False assert len(body["errors"]) == 1 assert "Unknown workflow_id" in body["errors"][0] @@ -642,7 +829,7 @@ def test_workflow_job_endpoints_return_persisted_result( assert links["self"].endswith(f"/workflows/jobs/{run_id}") assert links["result"].endswith(f"/workflows/jobs/{run_id}/result") assert links["trace"].endswith(f"/workflows/jobs/{run_id}/trace") - assert links["collection"].endswith(f"/ogcapi/collections/workflow-output-{run_id}") + assert links["collection"].endswith(f"/pygeoapi/collections/workflow-output-{run_id}") assert "analytics" not in links assert "result" not in job_body @@ -848,6 +1035,84 @@ def test_generated_pygeoapi_config_contains_collection_detail( assert collection["type"] == "collection" assert collection["title"]["en"] assert collection["providers"][0]["type"] == "coverage" + raster_link = next(link for link in collection["links"] if link["rel"] == "raster-capabilities") + assert raster_link["href"].endswith("/raster/chirps3_precipitation_daily/capabilities") + assert raster_link["title"] == "Raster Rendering Capabilities" + + +def test_generated_pygeoapi_config_uses_real_source_coverage_extent( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "y", "x"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "y": xr.Variable(("y",), 
[9.5, 10.5], attrs={"units": "degrees_north"}), + "x": xr.Variable(("x",), [39.5, 40.5], attrs={"units": "degrees_east"}), + }, + ).rio.write_crs("EPSG:4326").to_zarr(zarr_path, mode="w") + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + monkeypatch.setattr(publication_services, "list_datasets", lambda: [ + { + "id": "chirps3_precipitation_daily", + "name": "Total precipitation (CHIRPS3)", + "variable": "precip", + "period_type": "daily", + "source": "CHIRPS v3", + "source_url": "https://example.test/chirps", + "resolution": "5 km x 5 km", + "units": "mm", + } + ]) + monkeypatch.setattr( + publication_services, + "get_data_coverage", + lambda dataset: { + "coverage": { + "spatial": {"xmin": 39.5, "ymin": 9.5, "xmax": 40.5, "ymax": 10.5}, + "temporal": {"start": "2024-01-01", "end": "2024-01-02"}, + } + }, + ) + + response = client.get("/publications/pygeoapi/config") + assert response.status_code == 200 + collection = response.json()["resources"]["chirps3_precipitation_daily"] + assert collection["extents"]["spatial"]["bbox"] == [[39.5, 9.5, 40.5, 10.5]] + assert collection["extents"]["temporal"]["begin"] == "2024-01-01" + assert collection["extents"]["temporal"]["end"] == "2024-01-02" + + +def test_ogc_collection_html_for_coverage_includes_raster_controls( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + xr.Dataset( + data_vars={ + "precip": (("time", "y", "x"), np.arange(8, dtype=float).reshape(2, 2, 2)), + }, + coords={ + "time": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[ns]"), + "y": xr.Variable(("y",), [9.5, 10.5], attrs={"units": "degrees_north"}), + "x": xr.Variable(("x",), [39.5, 40.5], attrs={"units": "degrees_east"}), + }, + ).rio.write_crs("EPSG:4326").to_zarr(zarr_path, mode="w") + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + + response = 
client.get("/pygeoapi/collections/chirps3_precipitation_daily?f=html") + assert response.status_code == 200 + assert "Update raster map" in response.text + assert "Single-date preview example" in response.text + assert "TileJSON example" in response.text def test_workflow_success_registers_derived_publication( @@ -870,7 +1135,7 @@ def test_workflow_success_registers_derived_publication( derived = next(item for item in resources if item["resource_id"] == f"workflow-output-{run_id}") assert derived["resource_class"] == "derived" assert derived["job_id"] == run_id - assert derived["ogc_path"] == f"/ogcapi/collections/workflow-output-{run_id}" + assert derived["ogc_path"] == f"/pygeoapi/collections/workflow-output-{run_id}" assert derived["exposure"] == "ogc" assert derived["asset_format"] == "geojson" assert derived["path"].endswith(".geojson") @@ -896,20 +1161,20 @@ def test_dynamic_ogc_collection_routes_reflect_new_publication_without_restart( run_id = response.json()["run_id"] collection_id = f"workflow-output-{run_id}" - collections_response = client.get("/ogcapi/collections", params={"f": "json"}) + collections_response = client.get("/pygeoapi/collections", params={"f": "json"}) assert collections_response.status_code == 200 collections = collections_response.json()["collections"] derived = next(item for item in collections if item["id"] == collection_id) assert derived["itemType"] == "feature" - detail_response = client.get(f"/ogcapi/collections/{collection_id}", params={"f": "json"}) + detail_response = client.get(f"/pygeoapi/collections/{collection_id}", params={"f": "json"}) assert detail_response.status_code == 200 detail = detail_response.json() detail_links = {link["rel"]: link["href"] for link in detail["links"]} assert detail["id"] == collection_id assert "analytics" not in detail_links - items_response = client.get(f"/ogcapi/collections/{collection_id}/items", params={"f": "json", "limit": 5}) + items_response = 
client.get(f"/pygeoapi/collections/{collection_id}/items", params={"f": "json", "limit": 5}) assert items_response.status_code == 200 items = items_response.json() assert items["type"] == "FeatureCollection" @@ -929,13 +1194,13 @@ def test_dynamic_ogc_collection_routes_drop_deleted_publication_without_restart( run_id = response.json()["run_id"] collection_id = f"workflow-output-{run_id}" - before_delete = client.get(f"/ogcapi/collections/{collection_id}", params={"f": "json"}) + before_delete = client.get(f"/pygeoapi/collections/{collection_id}", params={"f": "json"}) assert before_delete.status_code == 200 delete_response = client.delete(f"/workflows/jobs/{run_id}") assert delete_response.status_code == 200 - after_delete = client.get(f"/ogcapi/collections/{collection_id}", params={"f": "json"}) + after_delete = client.get(f"/pygeoapi/collections/{collection_id}", params={"f": "json"}) assert after_delete.status_code == 404 @@ -955,7 +1220,7 @@ def test_analytics_viewer_config_and_html_for_publication( config = config_response.json() assert config["resource_id"] == resource_id assert config["data_url"].startswith("/data/") - assert config["links"]["collection"] == f"/ogcapi/collections/{resource_id}" + assert config["links"]["collection"] == f"/pygeoapi/collections/{resource_id}" viewer_response = client.get(f"/analytics/publications/{resource_id}/viewer") assert viewer_response.status_code == 200 @@ -1046,6 +1311,11 @@ def test_inline_workflow_publication_intent_is_blocked_by_server_guardrail( "publishable": True, "strategy": "on_success", "intent": "feature_collection", + "inputs": _standard_publication_inputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), }, "steps": [ {"component": "feature_source", "version": "v1"}, @@ -1054,6 +1324,11 @@ def test_inline_workflow_publication_intent_is_blocked_by_server_guardrail( {"component": "spatial_aggregation", "version": "v1"}, {"component": 
"build_datavalueset", "version": "v1"}, ], + "outputs": _standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), }, "request": _valid_public_payload()["request"] | {"workflow_id": "adhoc_chirps_mixed_exec_v1"}, } @@ -1087,6 +1362,11 @@ def test_inline_workflow_publication_intent_can_be_enabled_by_server_policy( "publishable": True, "strategy": "on_success", "intent": "feature_collection", + "inputs": _standard_publication_inputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), }, "steps": [ {"component": "feature_source", "version": "v1"}, @@ -1095,6 +1375,11 @@ def test_inline_workflow_publication_intent_can_be_enabled_by_server_policy( {"component": "spatial_aggregation", "version": "v1"}, {"component": "build_datavalueset", "version": "v1"}, ], + "outputs": _standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), }, "request": _valid_public_payload()["request"] | {"workflow_id": "adhoc_chirps_mixed_exec_v1"}, } @@ -1126,7 +1411,7 @@ def test_ogc_process_sync_execution_links_to_collection( body = response.json() collection_links = [item for item in body["links"] if item["rel"] == "collection"] assert len(collection_links) == 1 - assert "/ogcapi/collections/workflow-output-" in collection_links[0]["href"] + assert "/pygeoapi/collections/workflow-output-" in collection_links[0]["href"] def test_generated_pygeoapi_config_reflects_publication_registry( @@ -1206,6 +1491,75 @@ def test_generated_pygeoapi_config_includes_geojson_derived_resource( assert not any(link["rel"] == "analytics" for link in derived["links"]) +def test_generated_pygeoapi_config_includes_derived_coverage_resource( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + 
monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "derived_coverage.zarr" + zarr_path.mkdir(parents=True) + + response = WorkflowExecuteResponse( + status="completed", + run_id="coverage-run-1", + workflow_id="coverage_publish_v1", + workflow_version=1, + dataset_id="chirps3_precipitation_daily", + outputs={"output_file": str(zarr_path)}, + primary_output_name="output_file", + output_file=str(zarr_path), + run_log_file="/tmp/data/workflow_runs/coverage-run-1.json", + component_runs=[], + ) + publication_services.register_workflow_output_publication( + response=response, + kind=publication_services.PublishedResourceKind.COVERAGE, + exposure=publication_services.PublishedResourceExposure.OGC, + published_path=str(zarr_path), + asset_format="zarr", + ) + + config_response = client.get("/publications/pygeoapi/config") + assert config_response.status_code == 200 + resources = config_response.json()["resources"] + derived = resources["workflow-output-coverage-run-1"] + assert derived["providers"][0]["type"] == "coverage" + assert derived["providers"][0]["data"] == str(zarr_path) + link_rels = {link["rel"] for link in derived["links"]} + assert "collection" in link_rels + assert "raster-capabilities" in link_rels + + +def test_register_workflow_output_publication_rejects_unsupported_serving_contract( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + response = WorkflowExecuteResponse( + status="completed", + run_id="tiles-run-1", + workflow_id="tiles_publish_v1", + workflow_version=1, + dataset_id="chirps3_precipitation_daily", + outputs={"output_file": "/tmp/tiles"}, + primary_output_name="output_file", + output_file="/tmp/tiles", + run_log_file="/tmp/data/workflow_runs/tiles-run-1.json", + component_runs=[], + ) + + with pytest.raises(ValueError, match="Unsupported publication serving contract"): + 
publication_services.register_workflow_output_publication( + response=response, + kind=publication_services.PublishedResourceKind.TILESET, + exposure=publication_services.PublishedResourceExposure.OGC, + published_path="/tmp/tiles", + asset_format="tiles", + ) + + def test_materialize_generated_pygeoapi_documents_writes_files( client: TestClient, monkeypatch: pytest.MonkeyPatch, @@ -1213,6 +1567,9 @@ def test_materialize_generated_pygeoapi_documents_writes_files( ) -> None: monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) response = client.post("/publications/pygeoapi/materialize") assert response.status_code == 200 @@ -1221,7 +1578,50 @@ def test_materialize_generated_pygeoapi_documents_writes_files( openapi_path = Path(body["openapi_path"]) assert config_path.exists() assert openapi_path.exists() - assert "resources:" in config_path.read_text(encoding="utf-8") + config_text = config_path.read_text(encoding="utf-8") + openapi_text = openapi_path.read_text(encoding="utf-8") + assert "resources:" in config_text + assert "http://127.0.0.1:8000/pygeoapi" in config_text + assert "http://127.0.0.1:8000/pygeoapi" in openapi_text + assert "http://127.0.0.1:8000/pygeoapi/collections/chirps3_precipitation_daily" in config_text + + +def test_get_published_resource_normalizes_legacy_pygeoapi_links( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + resources_dir = tmp_path / "published_resources" + resources_dir.mkdir(parents=True) + legacy_resource = { + "resource_id": "workflow-output-legacy", + "resource_class": "derived", + "kind": "feature_collection", + "title": "Legacy workflow output", + "description": "Legacy 
collection", + "dataset_id": "chirps3_precipitation_daily", + "workflow_id": "dhis2_datavalue_set_v1", + "job_id": "legacy", + "run_id": "legacy", + "path": "data/downloads/legacy.geojson", + "ogc_path": "/ogcapi/collections/workflow-output-legacy", + "asset_format": "geojson", + "exposure": "ogc", + "created_at": "2026-03-20T00:00:00+00:00", + "updated_at": "2026-03-20T00:00:00+00:00", + "metadata": {}, + "links": [ + {"rel": "collection", "href": "/ogcapi/collections/workflow-output-legacy"}, + {"rel": "job", "href": "/workflows/jobs/legacy"}, + ], + } + (resources_dir / "workflow-output-legacy.json").write_text(json.dumps(legacy_resource), encoding="utf-8") + + resource = publication_services.get_published_resource("workflow-output-legacy") + + assert resource is not None + assert resource.ogc_path == "/pygeoapi/collections/workflow-output-legacy" + assert resource.links[0]["href"] == "/pygeoapi/collections/workflow-output-legacy" def test_component_spatial_aggregation_serializes_numpy_datetime64( @@ -1496,6 +1896,11 @@ def test_engine_rejects_remote_spatial_after_temporal_aggregation(monkeypatch: p }, {"component": "build_datavalueset"}, ], + "outputs": _standard_workflow_outputs( + feature_step="feature_source", + spatial_step="spatial_aggregation", + build_step="build_datavalueset", + ), } ) @@ -1662,6 +2067,13 @@ def test_engine_follows_declarative_workflow_order(monkeypatch: pytest.MonkeyPat "inputs": {"records": {"from_step": "aggregate", "output": "records"}}, }, ], + "outputs": { + "bbox": {"from_step": "features", "output": "bbox"}, + "features": {"from_step": "features", "output": "features"}, + "records": {"from_step": "aggregate", "output": "records"}, + "data_value_set": {"from_step": "build", "output": "data_value_set"}, + "output_file": {"from_step": "build", "output": "output_file"}, + }, } ), ) @@ -1708,6 +2120,13 @@ def test_validate_workflow_reports_explicit_input_wiring(client: TestClient) -> assert response.status_code == 200 body = 
response.json() assert body["valid"] is True + assert body["publication_publishable"] is True + assert body["publication_intent"] == "feature_collection" + assert body["publication_exposure"] == "ogc" + assert body["publication_inputs"]["records"]["from_step"] == "spatial_agg" + assert body["serving_supported"] is True + assert body["serving_asset_format"] == "geojson" + assert body["serving_targets"] == ["pygeoapi", "analytics"] assert body["resolved_steps"][0]["id"] == "get_features" assert body["resolved_steps"][1]["resolved_inputs"]["bbox"] == { "from_step": "get_features", @@ -1814,6 +2233,7 @@ def test_engine_resolves_step_config_from_request_params(monkeypatch: pytest.Mon {"component": "spatial_aggregation"}, {"component": "build_datavalueset"}, ], + "outputs": _standard_workflow_outputs(), } ), ) @@ -1883,6 +2303,7 @@ def test_engine_rejects_invalid_step_config(monkeypatch: pytest.MonkeyPatch) -> {"component": "spatial_aggregation"}, {"component": "build_datavalueset"}, ], + "outputs": _standard_workflow_outputs(), } ), ) @@ -1955,6 +2376,7 @@ def test_engine_download_dataset_remote_mode_uses_remote_adapter(monkeypatch: py {"component": "spatial_aggregation"}, {"component": "build_datavalueset"}, ], + "outputs": _standard_workflow_outputs(), } ), ) @@ -1973,10 +2395,11 @@ def test_engine_download_dataset_remote_mode_uses_remote_adapter(monkeypatch: py ) remote_called: dict[str, Any] = {} - def _remote_adapter(**kwargs: Any) -> None: + def _remote_adapter(**kwargs: Any) -> dict[str, Any]: remote_called.update(kwargs) + return {"status": "downloaded"} - monkeypatch.setattr(engine, "_invoke_remote_download_component", _remote_adapter) + monkeypatch.setattr(component_services, "_invoke_registered_remote_component", _remote_adapter) monkeypatch.setattr( engine.component_services, "spatial_aggregation_component", @@ -1991,8 +2414,9 @@ def _remote_adapter(**kwargs: Any) -> None: response = engine.execute_workflow(request) assert response.status == "completed" + 
assert remote_called["component_key"] == "download_dataset@v1" assert remote_called["remote_url"] == "http://component-host/components/download-dataset" - assert remote_called["dataset_id"] == "chirps3_precipitation_daily" + assert remote_called["request"].dataset_id == "chirps3_precipitation_daily" def test_engine_rejects_remote_download_without_remote_url(monkeypatch: pytest.MonkeyPatch) -> None: @@ -2020,6 +2444,7 @@ def test_engine_rejects_remote_download_without_remote_url(monkeypatch: pytest.M {"component": "spatial_aggregation"}, {"component": "build_datavalueset"}, ], + "outputs": _standard_workflow_outputs(), } ), ) @@ -2078,6 +2503,7 @@ def test_engine_rejects_remote_fields_in_local_mode(monkeypatch: pytest.MonkeyPa {"component": "spatial_aggregation"}, {"component": "build_datavalueset"}, ], + "outputs": _standard_workflow_outputs(), } ), ) @@ -2153,6 +2579,7 @@ def test_engine_supports_remote_mode_for_remote_compatible_component_chain( }, }, ], + "outputs": _standard_workflow_outputs(), } ), ) @@ -2169,36 +2596,26 @@ def test_engine_supports_remote_mode_for_remote_compatible_component_chain( "build": False, } - monkeypatch.setattr( - engine, - "_invoke_remote_feature_source_component", - lambda **kwargs: ( - called.__setitem__("feature", True), - {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, - [0, 0, 1, 1], - )[1:], - ) - monkeypatch.setattr( - engine, - "_invoke_remote_download_component", - lambda **kwargs: called.__setitem__("download", True), - ) - monkeypatch.setattr( - engine, - "_invoke_remote_spatial_aggregation_component", - lambda **kwargs: ( - called.__setitem__("spatial", True), - [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}], - )[1], - ) - monkeypatch.setattr( - engine, - "_invoke_remote_build_datavalueset_component", - lambda **kwargs: ( - called.__setitem__("build", True), - ({"dataValues": [{"value": "10.0"}]}, "/tmp/data/out.json"), - )[1], - ) + def _remote_adapter(**kwargs: 
Any) -> dict[str, Any]: + component_key = kwargs["component_key"] + if component_key == "feature_source@v1": + called["feature"] = True + return { + "features": {"type": "FeatureCollection", "features": [{"id": "OU_1", "properties": {"id": "OU_1"}}]}, + "bbox": [0, 0, 1, 1], + } + if component_key == "download_dataset@v1": + called["download"] = True + return {"status": "downloaded"} + if component_key == "spatial_aggregation@v1": + called["spatial"] = True + return {"records": [{"org_unit": "OU_1", "time": "2024-01-01", "value": 10.0}]} + if component_key == "build_datavalueset@v1": + called["build"] = True + return {"data_value_set": {"dataValues": [{"value": "10.0"}]}, "output_file": "/tmp/data/out.json"} + raise AssertionError(f"Unexpected remote component key: {component_key}") + + monkeypatch.setattr(component_services, "_invoke_registered_remote_component", _remote_adapter) monkeypatch.setattr(engine, "persist_run_log", lambda **kwargs: "/tmp/data/workflow_runs/run.json") response = engine.execute_workflow(request) diff --git a/uv.lock b/uv.lock index 6399c0c..4f11d71 100644 --- a/uv.lock +++ b/uv.lock @@ -814,6 +814,7 @@ dependencies = [ { name = "pygeoapi" }, { name = "python-dotenv" }, { name = "titiler-core" }, + { name = "titiler-xarray" }, { name = "uvicorn" }, { name = "zarr" }, ] @@ -841,6 +842,7 @@ requires-dist = [ { name = "pygeoapi", specifier = ">=0.22.0" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "titiler-core", specifier = ">=1.2.0" }, + { name = "titiler-xarray", specifier = ">=1.2.0" }, { name = "uvicorn", specifier = ">=0.41.0" }, { name = "zarr", specifier = "==3.1.5" }, ] @@ -1917,6 +1919,53 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" }, ] +[[package]] +name = 
"obstore" +version = "0.9.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/18/cab734edaeb495a861cfbdced9fecdc0866ed1a85aa5a9202ec77cf4723e/obstore-0.9.2.tar.gz", hash = "sha256:7ef94323127a971c9dea2484109d6c706eb2b2594a2df13c2dd0a6d21a9a69ae", size = 123731, upload-time = "2026-03-11T19:10:18.19Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/d2/b98058a552849719df56d59a53f7d97e6507b37fca0399a866534800f9fa/obstore-0.9.2-cp311-abi3-macosx_10_12_x86_64.whl", hash = "sha256:50d9c9d6de601ad4805a5a76a1a3d731f7b899383f96ef57276f97bc35202f95", size = 4105494, upload-time = "2026-03-11T19:09:06.573Z" }, + { url = "https://files.pythonhosted.org/packages/ec/55/4386622b94fd028cb2298b4780d5a8e2d959fc4c71e599fb63be869aa83d/obstore-0.9.2-cp311-abi3-macosx_11_0_arm64.whl", hash = "sha256:4c6dcd9b76b802a2278e1cd88ad7305caf3c3c16f800b2bf5f86a606e9e83d96", size = 3878429, upload-time = "2026-03-11T19:09:07.962Z" }, + { url = "https://files.pythonhosted.org/packages/91/8d/0bfad11f1ee5fb1fbdb7833607212ad2586dbd1824b30cf328af63fe92fc/obstore-0.9.2-cp311-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8d46e629beb47565fa67b6ef05919434258d72ef848efa340f911af5de2536da", size = 4041157, upload-time = "2026-03-11T19:09:09.278Z" }, + { url = "https://files.pythonhosted.org/packages/eb/98/bfde825f61a8b2541be9185cd6a4ddbb820de94c79750edc32f9f9dfb795/obstore-0.9.2-cp311-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:350d8cc1cd9564369291396e160ebfa133d705ec349d8c0d444a39158d6ef3e7", size = 4144757, upload-time = "2026-03-11T19:09:10.938Z" }, + { url = "https://files.pythonhosted.org/packages/19/35/1c101f6660ef91e5280c824677d8b5ab11ee25ed52e59b075cd795a86e69/obstore-0.9.2-cp311-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dddd38c9f98fd8eaf11a9805464f0bec7e57d8e04a5e0b0cb17582ec58d2fe41", size = 4427897, upload-time = "2026-03-11T19:09:12.137Z" }, 
+ { url = "https://files.pythonhosted.org/packages/fb/eb/a9bdb64474d4e0ab4e4c0105c959090d6bd7ce38d4a945cae3679ead8c52/obstore-0.9.2-cp311-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca872e88e5c719faf1581632e348a6b01331b4f838d7ac29aff226107088dc35", size = 4336227, upload-time = "2026-03-11T19:09:13.822Z" }, + { url = "https://files.pythonhosted.org/packages/b2/ec/e6d39aa311afec2241adb6f2067d7d6ca2eb4e0aab5a95c47796edadd524/obstore-0.9.2-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ee61ac2af5c32c5282fc13b9eba7ffa332f268cb65bc29134ad8ac45e069871", size = 4229010, upload-time = "2026-03-11T19:09:15.503Z" }, + { url = "https://files.pythonhosted.org/packages/1c/fb/a24fd972b66b2d83829e2e89ccf236a759a82f881f909bf4fbe0b6c398ae/obstore-0.9.2-cp311-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:2f430cf8af76985e7ebb8d5f20c8ccef858c608103af6ea95c870f5380cd62f7", size = 4103835, upload-time = "2026-03-11T19:09:16.729Z" }, + { url = "https://files.pythonhosted.org/packages/d0/d4/c8cc60c8afc597712bf6c5059d629e050de521d901dad0f554b268c2d77f/obstore-0.9.2-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1df403f80feef7ac483ed66a2a5a964a469f3756ded533935640c4baf986dd49", size = 4292174, upload-time = "2026-03-11T19:09:18.461Z" }, + { url = "https://files.pythonhosted.org/packages/a7/80/dcf8f31814f25c390aa5501a95b78b9f6456d30cd4625109c2a6a5105ad1/obstore-0.9.2-cp311-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:c20f62b7c2f57c6f449215c36af4a8d502082ced2185c0b28f07a5e7c9698181", size = 4276266, upload-time = "2026-03-11T19:09:19.787Z" }, + { url = "https://files.pythonhosted.org/packages/16/71/5f5369fba652c5f83b44381d9e7a3cfe00793301d01802059b52b8663f2c/obstore-0.9.2-cp311-abi3-musllinux_1_2_i686.whl", hash = "sha256:c296e7d60ee132babb7fd01eab946396fa28eb0d88264b9e60320922174e6010", size = 4264118, upload-time = "2026-03-11T19:09:21.081Z" }, + { url = 
"https://files.pythonhosted.org/packages/c5/50/a5bd1948f2b2efb1039852542829a33a198be0586da7d4247996d3f15d26/obstore-0.9.2-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:76f274a170731a4461d0fe3eefde38f3bdaf346011ae020c94a0bd18bfd3c4bc", size = 4446876, upload-time = "2026-03-11T19:09:22.401Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d6/bcc266e391403163ed12dd8cab53012f4db8f5020fb49e3b0a505d7a1bba/obstore-0.9.2-cp311-abi3-win_amd64.whl", hash = "sha256:f644fef2a91973b6c055623692524baf830abb1f8bb3ad348611f0e25224e160", size = 4190639, upload-time = "2026-03-11T19:09:23.637Z" }, + { url = "https://files.pythonhosted.org/packages/9a/da/ea7c5095cf15c026819958f74d3ab7b69aff7ce5bf74188e5df5bba4c252/obstore-0.9.2-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:7161a977e94a94dfd2c4ef66846371bdff46bb8b5f9b91dc29c912deb88a5bb2", size = 4087051, upload-time = "2026-03-11T19:09:24.944Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9f/16d6f41ab87e75a6400959a4708343eaca782b78a5f9de7846c70e2b1381/obstore-0.9.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e3a31fbd68bbe7e061272420337d5ccaf2df7927c2b44ff768531dda02196746", size = 3869338, upload-time = "2026-03-11T19:09:26.404Z" }, + { url = "https://files.pythonhosted.org/packages/99/61/5f13cc91b054d8c93db77e9113ca4924c4320e988284840c8a98238709e6/obstore-0.9.2-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:928da0d131ea33d0b88aa8c3a0dd3f7423261e0c9495444cc14ce0cf62808558", size = 4037703, upload-time = "2026-03-11T19:09:27.743Z" }, + { url = "https://files.pythonhosted.org/packages/58/a2/669620821881559819b8911c4820defa3ffc30a9e49e9d5aca05bd57da45/obstore-0.9.2-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79667de1f0c7eed64b658b3e696bb0565fba4069f6134db502bf4f5f5835aeee", size = 4135488, upload-time = "2026-03-11T19:09:29.232Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/12/019e523e97415b4fcfc35b230b270d452fdf5578a7612034c8043c8f2cbf/obstore-0.9.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7318253bc8d03b64473150dad31e611f5bd70a3cc945e3e1d6ac59a901f397c0", size = 4412922, upload-time = "2026-03-11T19:09:30.462Z" }, + { url = "https://files.pythonhosted.org/packages/a6/52/d4a8c1bf588a10bfd17a5a11ebc6af834850fe174a0369648d534a2acb81/obstore-0.9.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:133507229632fde08bc202ca2c81119b2314662dab7a96f8348e97f8e97ae36a", size = 4337193, upload-time = "2026-03-11T19:09:31.773Z" }, + { url = "https://files.pythonhosted.org/packages/aa/59/46c1bdaeae2904bb1edddbfc78e35cb0521ab7c58fe92b147a981873fcdc/obstore-0.9.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c73f208abcddcd3edb7a739d5cac777bdb6fac12a358c9b251654ec7df7866", size = 4221641, upload-time = "2026-03-11T19:09:33.067Z" }, + { url = "https://files.pythonhosted.org/packages/44/9c/b0203594666d11da31e4a7f25ace0718cb1591792e3c1de5225fbd7c8246/obstore-0.9.2-cp313-cp313t-manylinux_2_24_aarch64.whl", hash = "sha256:857b2e7d78c8fb36dcb7c6f1fa89401429667195186ced746a500e54a6aaecdb", size = 4103500, upload-time = "2026-03-11T19:09:34.687Z" }, + { url = "https://files.pythonhosted.org/packages/95/bc/b215712ef24a21247d6e8a4049a76d95e2dca517b8b24efb496600c333c7/obstore-0.9.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:24c24fdba5080524ce79b36782a11563ea40d9ae5aa26bb6b81a6d089184e4eb", size = 4290492, upload-time = "2026-03-11T19:09:35.936Z" }, + { url = "https://files.pythonhosted.org/packages/ad/28/5aa0ecdc6c01b6e020f1ff8efcca35493e0c6091a0b72ec1bbb16b5b18a8/obstore-0.9.2-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:778785266aaaf3a73d44ee15e33b72c7ecf0585efeaf8745a1889cc02930ae59", size = 4272220, upload-time = "2026-03-11T19:09:37.223Z" }, + { url = 
"https://files.pythonhosted.org/packages/06/65/c47b0f972bc7acd64385a964dfbc2efc7361207f490b4d16da789da26fd5/obstore-0.9.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:305c415fdb2230a1e096f6f290cf524d030329ad5c5e1c9c41f121e7d2fb27d7", size = 4256524, upload-time = "2026-03-11T19:09:38.592Z" }, + { url = "https://files.pythonhosted.org/packages/e6/1d/9f826fd49cd17cdbc8d2a7a75698d1cc9d731ca98d645f1ca9366ac93781/obstore-0.9.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1a544aad84ae774fac339c686f8a4d7b187c4927b6e33ebb9758c58991d4f27f", size = 4440986, upload-time = "2026-03-11T19:09:40.231Z" }, + { url = "https://files.pythonhosted.org/packages/b9/24/0af1af62239c539975b6c9095428f7597e8f5f9617e897e58dbf7b63f1c5/obstore-0.9.2-cp313-cp313t-win_amd64.whl", hash = "sha256:52da6bd719c4962fdfb3c7504e790a89a9b5d27703ee872db01e2075162706fd", size = 4175182, upload-time = "2026-03-11T19:09:41.617Z" }, + { url = "https://files.pythonhosted.org/packages/fa/63/02ca0378938efd1111aa5d689b527c6f3f0c59f4ee440a7b0bf36c528f46/obstore-0.9.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:1bd4790eaa2bb384b58e1c430b2c8816edd7e60216e813c8120014f742e5d280", size = 4087916, upload-time = "2026-03-11T19:09:43.162Z" }, + { url = "https://files.pythonhosted.org/packages/86/9b/604bfb0ec9f117dbb8e936d64e45d95cd9a1fcb63640453566fb3dc66e9d/obstore-0.9.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e6417ac0b5cb32498490ceb7034ea357ea2ea965c855590496d64b2d7808a621", size = 3869703, upload-time = "2026-03-11T19:09:44.673Z" }, + { url = "https://files.pythonhosted.org/packages/44/6a/04bcb394f2a6bb12c4325e6ff3f7ead24592582a593c70669d9cdb5b4e9c/obstore-0.9.2-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:dc07d71e2f9cd30d2db6ac15c2b162d5b14f6a0e7f575ad66676335c256b1a80", size = 4038164, upload-time = "2026-03-11T19:09:45.922Z" }, + { url = 
"https://files.pythonhosted.org/packages/34/39/2cc1c2c2a7027dd32ae010ac2ae4491b5f653f86c499e6ec20a6a54e799d/obstore-0.9.2-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7606d5f5c682cc8be9f55d3b07d282dfc0e0262ddfd31b8a26b0a6a3787e5b78", size = 4135199, upload-time = "2026-03-11T19:09:47.242Z" }, + { url = "https://files.pythonhosted.org/packages/e7/4c/defabe9c19bddf44f22591bcf0fffbc3b2b3202eb5ab99a0d894562f56de/obstore-0.9.2-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80e870ab402ac0f93799049a6680faacbfc2995c60fa87fd683807ce1366e544", size = 4413291, upload-time = "2026-03-11T19:09:48.934Z" }, + { url = "https://files.pythonhosted.org/packages/10/ce/fcfd0436834657a6617d06f07de7630889036c722d35ed9df7913e6caac7/obstore-0.9.2-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:534049c4b970e1e49c33b47a3e2a051fdc9727f844c3d4737aac4e4c89939fe4", size = 4337512, upload-time = "2026-03-11T19:09:50.13Z" }, + { url = "https://files.pythonhosted.org/packages/70/12/565d0cd60f7ae6bb65bde745e182f745a0520f314b32cb802d5f445ad10a/obstore-0.9.2-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c903949b9994003bda82b57f938ab88f458e75fd27eed809547533bffad99a77", size = 4221955, upload-time = "2026-03-11T19:09:51.499Z" }, + { url = "https://files.pythonhosted.org/packages/0e/27/3fb7f28277fbc929168ff7e02a36a64a56e1288936ac10fce49420c343f4/obstore-0.9.2-cp314-cp314t-manylinux_2_24_aarch64.whl", hash = "sha256:3f07a060702c8b1af51ca15a92658a34bb3ff2e38625173c5592c5aae7fdbfcd", size = 4103438, upload-time = "2026-03-11T19:09:52.748Z" }, + { url = "https://files.pythonhosted.org/packages/67/8f/53ed223ee069da797b09f45e9dbf4a1ed24743081be1ec1411ab6baf8ce9/obstore-0.9.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:462a864782a8d7a1a60c55ac19ce4ad53668a39e35d16b98b787fe97d3fec193", size = 4290842, upload-time = "2026-03-11T19:09:54.3Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/cd/fc94afca13776c4eb8b7a2f27ecb9ee964156d20d699100b719c6c8b6246/obstore-0.9.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:afe36e0452e753c2fece5e6849dd13f209400d5feca668514c0cca2242b0eee8", size = 4273457, upload-time = "2026-03-11T19:09:55.715Z" }, + { url = "https://files.pythonhosted.org/packages/7a/8e/fb02a7a8d4f966af5e069315075bc4388eb63d9cff1c2f3283f3c5781919/obstore-0.9.2-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3bfae2c634bca903141ef09d6d65e343402de0470e595799881a47ac7c08b2bd", size = 4256979, upload-time = "2026-03-11T19:09:56.983Z" }, + { url = "https://files.pythonhosted.org/packages/c0/87/5621ea304d39b4099d36bfa50dce901eb37b3861e2592d76baa26031d407/obstore-0.9.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:71d4059b5e948fe6e8cfc2b77da9c2fc944dfe0ee98090d985e60dd6ebecd7f6", size = 4441545, upload-time = "2026-03-11T19:09:58.59Z" }, + { url = "https://files.pythonhosted.org/packages/30/44/5a7b98d5d92a2267df7a9a905b3cc4f0ca98fbf207b9fae5179a6838a80b/obstore-0.9.2-cp314-cp314t-win_amd64.whl", hash = "sha256:e75295c9c522dde5020d4ff763315af75a165a8a6b8d7f9ed247ce17b7d7f7b0", size = 4175247, upload-time = "2026-03-11T19:10:00.111Z" }, +] + [[package]] name = "opentelemetry-api" version = "1.39.1" @@ -3324,6 +3373,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/d8/20d2982580c1e13025f7e54391f0b2bbf669cb2b1462f42b64d8fe3cf50c/titiler_core-1.2.0-py3-none-any.whl", hash = "sha256:ba7f34f83b3dab0cae612b88ad087be230bbce2043562e17b8ed9182484c4642", size = 88373, upload-time = "2026-02-09T14:37:52.263Z" }, ] +[[package]] +name = "titiler-xarray" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "obstore" }, + { name = "rioxarray" }, + { name = "titiler-core" }, + { name = "xarray" }, + { name = "zarr" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/00/b2/e6aec77d4160f610b49e95b9edd2ef585c7f8c83900a0ca66b5c6a02acfc/titiler_xarray-1.2.0.tar.gz", hash = "sha256:7e13b753e636ee5af4db1d7fbc84e8dfb58ba0ae0fdcccefb01d4ffdae82ba8d", size = 32428, upload-time = "2026-02-09T14:37:55.718Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/d3/a3238916c0016a349f309e4ff4ab119c02063317c26d9eacdf4da136c27a/titiler_xarray-1.2.0-py3-none-any.whl", hash = "sha256:781489360d4562e33dd782187b10706ed619b7e0a0ce13c6ff7f459e6ff75915", size = 34150, upload-time = "2026-02-09T14:37:54.446Z" }, +] + [[package]] name = "toml" version = "0.10.2" From 123da7d5ed95c7e3fc12e8c66789bb2c72d1867e Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Sat, 21 Mar 2026 13:07:25 +0100 Subject: [PATCH 12/15] feat: align native ogc process results and conformance --- src/eo_api/ogc/routes.py | 353 +++++++++++++++++++++++++++++++++++++- src/eo_api/ogc/schemas.py | 48 ++++++ tests/test_workflows.py | 108 +++++++++++- 3 files changed, 501 insertions(+), 8 deletions(-) create mode 100644 src/eo_api/ogc/schemas.py diff --git a/src/eo_api/ogc/routes.py b/src/eo_api/ogc/routes.py index 4341bc0..3257acf 100644 --- a/src/eo_api/ogc/routes.py +++ b/src/eo_api/ogc/routes.py @@ -4,24 +4,39 @@ import uuid from html import escape +from pathlib import Path from typing import Any from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request, Response -from fastapi.responses import HTMLResponse +from fastapi.responses import FileResponse, HTMLResponse +from ..data_manager.services.downloader import DOWNLOAD_DIR from ..publications.schemas import PublishedResourceExposure from ..publications.services import collection_id_for_resource, get_published_resource from ..shared.api_errors import api_error -from ..workflows.schemas import WorkflowExecuteEnvelopeRequest, WorkflowJobStatus +from ..workflows.schemas import WorkflowExecuteEnvelopeRequest, WorkflowJobStatus, WorkflowRequest from 
..workflows.services.definitions import load_workflow_definition from ..workflows.services.engine import execute_workflow from ..workflows.services.job_store import get_job, get_job_result, initialize_job, list_jobs from ..workflows.services.simple_mapper import normalize_simple_request +from .schemas import ( + OGCJobResultsExtended, + OGCJobResultsResponse, + OGCOutputFormatInfo, + OGCOutputReference, + OGCOutputValue, +) router = APIRouter() _PROCESS_ID = "generic-dhis2-workflow" _PROCESS_TITLE = "Generic DHIS2 workflow" +_OGC_PROCESSES_CONFORMANCE = [ + "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/core", + "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/oas30", + "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/json", + "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/job-list", +] @router.get("", response_model=None) @@ -38,7 +53,7 @@ def get_ogc_root(request: Request, f: str | None = None) -> dict[str, Any] | HTM {"rel": "self", "type": "application/json", "href": _request_href(request, f="json")}, {"rel": "alternate", "type": "text/html", "href": _request_href(request, f="html")}, {"rel": "service-desc", "type": "application/vnd.oai.openapi+json;version=3.0", "href": "/ogcapi/openapi"}, - {"rel": "conformance", "type": "application/json", "href": f"{base_url}/pygeoapi/conformance"}, + {"rel": "conformance", "type": "application/json", "href": f"{base_url}/ogcapi/conformance"}, {"rel": "data", "type": "application/json", "href": f"{base_url}/pygeoapi/collections"}, {"rel": "processes", "type": "application/json", "href": f"{base_url}/ogcapi/processes"}, {"rel": "jobs", "type": "application/json", "href": f"{base_url}/ogcapi/jobs"}, @@ -61,7 +76,7 @@ def get_ogc_root(request: Request, f: str | None = None) -> dict[str, Any] | HTM }, { "title": "Conformance", - "description": "See the standards conformance declarations for the mounted OGC publication layer.", + "description": "See the native OGC API - Processes conformance 
declarations.", "href": f"{base_url}/ogcapi/conformance", }, ], @@ -71,6 +86,157 @@ def get_ogc_root(request: Request, f: str | None = None) -> dict[str, Any] | HTM return body +@router.get("/conformance") +def get_ogc_conformance(request: Request) -> dict[str, Any]: + """Return native OGC API - Processes conformance declarations.""" + return { + "conformsTo": _OGC_PROCESSES_CONFORMANCE, + "links": [ + {"rel": "self", "type": "application/json", "href": str(request.url)}, + { + "rel": "service-desc", + "type": "application/vnd.oai.openapi+json;version=3.0", + "href": str(request.base_url).rstrip("/") + "/ogcapi/openapi", + }, + ], + } + + +@router.get("/openapi") +def get_ogc_openapi(request: Request) -> dict[str, Any]: + """Return a minimal native service description for the process/job surface.""" + base_url = str(request.base_url).rstrip("/") + return { + "openapi": "3.0.2", + "info": { + "title": "DHIS2 EO API - Native OGC Processes", + "version": "0.1.0", + "description": ( + "Native OGC API - Processes service description " + "for the FastAPI-owned process and job surface." 
+ ), + }, + "servers": [{"url": f"{base_url}/ogcapi"}], + "paths": { + "/": {"get": {"summary": "Landing page", "responses": {"200": {"description": "Landing page"}}}}, + "/conformance": { + "get": {"summary": "Conformance", "responses": {"200": {"description": "Conformance classes"}}} + }, + "/processes": { + "get": {"summary": "List processes", "responses": {"200": {"description": "Process list"}}} + }, + "/processes/{process_id}": { + "get": { + "summary": "Describe process", + "parameters": [ + { + "name": "process_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + } + ], + "responses": {"200": {"description": "Process description"}}, + } + }, + "/processes/{process_id}/execution": { + "post": { + "summary": "Execute process", + "parameters": [ + { + "name": "process_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + }, + { + "name": "Prefer", + "in": "header", + "required": False, + "schema": {"type": "string"}, + }, + ], + "requestBody": { + "required": True, + "content": { + "application/json": { + "schema": {"$ref": "#/components/schemas/WorkflowExecuteEnvelopeRequest"} + } + }, + }, + "responses": { + "200": {"description": "Synchronous execution result"}, + "202": {"description": "Accepted asynchronous execution"}, + }, + } + }, + "/jobs": {"get": {"summary": "List jobs", "responses": {"200": {"description": "Job list"}}}}, + "/jobs/{job_id}": { + "get": { + "summary": "Get job", + "parameters": [ + { + "name": "job_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + } + ], + "responses": {"200": {"description": "Job status"}}, + } + }, + "/jobs/{job_id}/results": { + "get": { + "summary": "Get job results", + "parameters": [ + { + "name": "job_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + }, + { + "name": "extended", + "in": "query", + "required": False, + "schema": {"type": "boolean", "default": False}, + }, + ], + "responses": {"200": {"description": 
"OGC-compliant process results"}}, + } + }, + "/jobs/{job_id}/download": { + "get": { + "summary": "Download output artifact", + "parameters": [ + { + "name": "job_id", + "in": "path", + "required": True, + "schema": {"type": "string"}, + } + ], + "responses": {"200": {"description": "Artifact download"}}, + } + }, + }, + "components": { + "schemas": { + "WorkflowRequest": WorkflowRequest.model_json_schema(ref_template="#/components/schemas/{model}"), + "WorkflowExecuteEnvelopeRequest": WorkflowExecuteEnvelopeRequest.model_json_schema( + ref_template="#/components/schemas/{model}" + ), + "OGCJobResultsResponse": OGCJobResultsResponse.model_json_schema( + ref_template="#/components/schemas/{model}" + ), + "OGCJobResultsExtended": OGCJobResultsExtended.model_json_schema( + ref_template="#/components/schemas/{model}" + ), + } + }, + } + + @router.get("/processes") def list_processes(request: Request) -> dict[str, Any]: """List exposed OGC processes.""" @@ -98,12 +264,29 @@ def list_processes(request: Request) -> dict[str, Any]: def describe_process(process_id: str, request: Request) -> dict[str, Any]: """Describe the single exposed generic workflow process.""" _require_process(process_id) + request_schema = WorkflowRequest.model_json_schema() return { "id": _PROCESS_ID, "title": _PROCESS_TITLE, "description": "OGC-facing adapter over the reusable native workflow engine.", "jobControlOptions": ["sync-execute", "async-execute"], "outputTransmission": ["value", "reference"], + "inputs": { + "request": { + "title": "Workflow Request", + "description": "Flat request contract normalized by the native workflow mapper.", + "schema": request_schema, + "minOccurs": 1, + "maxOccurs": 1, + } + }, + "outputs": { + "outputs": { + "title": "Workflow Outputs", + "description": "Declared workflow outputs returned as OGC output objects on the job results endpoint.", + "schema": OGCJobResultsResponse.model_json_schema(), + } + }, "links": [ { "rel": "execute", @@ -245,8 +428,37 @@ def 
get_ogc_job(job_id: str, request: Request) -> dict[str, Any]: @router.get("/jobs/{job_id}/results", name="get_ogc_job_results") -def get_ogc_job_results(job_id: str) -> dict[str, Any]: - """Return persisted results for a completed OGC job.""" +def get_ogc_job_results(job_id: str, request: Request, extended: bool = False) -> dict[str, Any]: + """Return OGC API - Processes compliant results for a completed job.""" + job = get_job(job_id) + if job is None: + raise HTTPException( + status_code=404, + detail=api_error( + error="job_not_found", + error_code="JOB_NOT_FOUND", + message=f"Unknown job_id '{job_id}'", + job_id=job_id, + ), + ) + result = get_job_result(job_id) + if result is None: + raise HTTPException( + status_code=409, + detail=api_error( + error="job_result_unavailable", + error_code="JOB_RESULT_UNAVAILABLE", + message=f"Result is not available for job '{job_id}'", + job_id=job_id, + status=str(job.status), + ), + ) + return _to_ogc_results(result=result, job_id=job_id, request=request, include_extended=extended) + + +@router.get("/jobs/{job_id}/download", name="download_ogc_job_output") +def download_ogc_job_output(job_id: str) -> FileResponse: + """Download the native artifact for a completed OGC job when available.""" job = get_job(job_id) if job is None: raise HTTPException( @@ -270,7 +482,30 @@ def get_ogc_job_results(job_id: str) -> dict[str, Any]: status=str(job.status), ), ) - return result + output_file = result.get("output_file") + if not isinstance(output_file, str) or not output_file: + raise HTTPException( + status_code=404, + detail=api_error( + error="job_output_unavailable", + error_code="JOB_OUTPUT_UNAVAILABLE", + message=f"No downloadable output artifact is available for job '{job_id}'", + job_id=job_id, + ), + ) + output_path = Path(output_file).resolve() + downloads_root = DOWNLOAD_DIR.resolve() + if downloads_root not in output_path.parents or not output_path.exists(): + raise HTTPException( + status_code=404, + detail=api_error( + 
error="job_output_unavailable", + error_code="JOB_OUTPUT_UNAVAILABLE", + message=f"Output artifact for job '{job_id}' is not available for download", + job_id=job_id, + ), + ) + return FileResponse(output_path) def _require_process(process_id: str) -> None: @@ -328,6 +563,110 @@ def _wants_html(request: Request, f: str | None) -> bool: return "text/html" in accept and "application/json" not in accept +def _to_ogc_results( + *, + result: dict[str, Any], + job_id: str, + request: Request, + include_extended: bool = False, +) -> dict[str, Any]: + """Transform native workflow results to an OGC API - Processes results envelope.""" + outputs: list[OGCOutputValue | OGCOutputReference] = [] + + native_outputs = result.get("outputs") + if isinstance(native_outputs, dict): + for output_id, output_value in native_outputs.items(): + if output_id in {"output_file", "data_value_set"}: + continue + outputs.append( + OGCOutputValue( + id=output_id, + value=output_value, + format=OGCOutputFormatInfo(media_type=_media_type_for_output_value(output_value)), + title=output_id.replace("_", " ").title(), + description=f"Process output: {output_id}", + ) + ) + + data_value_set = result.get("data_value_set") + if isinstance(data_value_set, dict): + outputs.append( + OGCOutputValue( + id="data_value_set", + value=data_value_set, + format=OGCOutputFormatInfo( + media_type="application/vnd.dhis2+json", + schema_url="https://dhis2.github.io/dhis2-api-specification/schemas/dataValueSet.json", + ), + title="DHIS2 DataValueSet", + description="Import-ready DHIS2 DataValueSet payload", + ) + ) + + download_href = _job_output_download_href(result=result, job_id=job_id, request=request) + if download_href is not None: + outputs.append( + OGCOutputReference( + id="output_file", + href=download_href, + format=OGCOutputFormatInfo(media_type=_media_type_for_path(str(result["output_file"]))), + title="Output File", + description="Downloadable native workflow artifact", + rel="related", + ) + ) + + if 
include_extended: + return OGCJobResultsExtended( + outputs=outputs, + metadata={ + "job_id": job_id, + "status": result.get("status"), + "run_id": result.get("run_id"), + "workflow_id": result.get("workflow_id"), + "workflow_version": result.get("workflow_version"), + "dataset_id": result.get("dataset_id"), + "bbox": result.get("bbox"), + "feature_count": result.get("feature_count"), + "value_count": result.get("value_count"), + "run_log_file": result.get("run_log_file"), + "component_runs": result.get("component_runs", []), + }, + ).model_dump(mode="json") + + return OGCJobResultsResponse(outputs=outputs).model_dump(mode="json") + + +def _job_output_download_href(*, result: dict[str, Any], job_id: str, request: Request) -> str | None: + output_file = result.get("output_file") + if not isinstance(output_file, str) or not output_file: + return None + output_path = Path(output_file).resolve() + downloads_root = DOWNLOAD_DIR.resolve() + if downloads_root not in output_path.parents or not output_path.exists(): + return None + return str(request.url_for("download_ogc_job_output", job_id=job_id)) + + +def _media_type_for_output_value(value: Any) -> str: + if isinstance(value, (dict, list, bool, int, float)) or value is None: + return "application/json" + return "text/plain" + + +def _media_type_for_path(path_value: str) -> str: + suffix = Path(path_value).suffix.lower() + if suffix == ".json": + return "application/json" + if suffix == ".geojson": + return "application/geo+json" + if suffix in {".tif", ".tiff"}: + return "image/tiff" + if suffix == ".zarr": + return "application/vnd+zarr" + return "application/octet-stream" + + def _render_ogc_root_html(body: dict[str, Any]) -> str: # Map icon SVGs to navigation items by title # noqa: E501 icons_map = { # noqa: E501 diff --git a/src/eo_api/ogc/schemas.py b/src/eo_api/ogc/schemas.py new file mode 100644 index 0000000..754b9b3 --- /dev/null +++ b/src/eo_api/ogc/schemas.py @@ -0,0 +1,48 @@ +"""Native OGC API - Processes 
schemas.""" + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + + +class OGCOutputFormatInfo(BaseModel): + """Format descriptor for one OGC process output.""" + + media_type: str = Field(description="IANA media type for the output payload") + schema_url: str | None = Field(default=None, description="Optional schema or specification URL") + encoding: str | None = Field(default="UTF-8", description="Character encoding when applicable") + + +class OGCOutputValue(BaseModel): + """Inline OGC process output.""" + + id: str = Field(description="Output identifier") + value: Any = Field(description="Inline output value") + format: OGCOutputFormatInfo = Field(description="Format metadata") + title: str | None = Field(default=None) + description: str | None = Field(default=None) + + +class OGCOutputReference(BaseModel): + """Referenced OGC process output.""" + + id: str = Field(description="Output identifier") + href: str = Field(description="Absolute URL to the referenced output") + format: OGCOutputFormatInfo = Field(description="Format metadata") + title: str | None = Field(default=None) + description: str | None = Field(default=None) + rel: str = Field(default="related", description="Relationship type") + + +class OGCJobResultsResponse(BaseModel): + """Strict OGC API - Processes results envelope.""" + + outputs: list[OGCOutputValue | OGCOutputReference] = Field(default_factory=list) + + +class OGCJobResultsExtended(OGCJobResultsResponse): + """Extended OGC results with native metadata.""" + + metadata: dict[str, Any] | None = Field(default=None) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index ccab313..3d62b69 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -162,12 +162,15 @@ def test_ogc_process_routes_exist() -> None: route.path for route in app.routes if isinstance(route, APIRoute) and route.path.startswith("/ogcapi") } assert "/ogcapi" in ogc_routes + assert 
"/ogcapi/conformance" in ogc_routes + assert "/ogcapi/openapi" in ogc_routes assert "/ogcapi/processes" in ogc_routes assert "/ogcapi/processes/{process_id}" in ogc_routes assert "/ogcapi/processes/{process_id}/execution" in ogc_routes assert "/ogcapi/jobs" in ogc_routes assert "/ogcapi/jobs/{job_id}" in ogc_routes assert "/ogcapi/jobs/{job_id}/results" in ogc_routes + assert "/ogcapi/jobs/{job_id}/download" in ogc_routes def test_publication_generated_pygeoapi_routes_exist() -> None: @@ -209,6 +212,35 @@ def test_pygeoapi_mount_serves_landing_page(client: TestClient) -> None: assert {"self", "alternate", "data", "processes", "jobs"} <= rels +def test_native_ogc_conformance_exists(client: TestClient) -> None: + response = client.get("/ogcapi/conformance") + assert response.status_code == 200 + body = response.json() + conforms_to = set(body["conformsTo"]) + assert "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/core" in conforms_to + assert "http://www.opengis.net/spec/ogcapi-processes-1/1.0/conf/job-list" in conforms_to + + +def test_native_ogc_openapi_exists(client: TestClient) -> None: + response = client.get("/ogcapi/openapi") + assert response.status_code == 200 + body = response.json() + assert body["openapi"] == "3.0.2" + assert "/jobs/{job_id}/results" in body["paths"] + assert "/processes/{process_id}/execution" in body["paths"] + + +def test_native_ogc_process_description_exposes_inputs_and_outputs(client: TestClient) -> None: + response = client.get("/ogcapi/processes/generic-dhis2-workflow") + assert response.status_code == 200 + body = response.json() + assert body["id"] == "generic-dhis2-workflow" + assert "request" in body["inputs"] + assert "schema" in body["inputs"]["request"] + assert "outputs" in body["outputs"] + assert "schema" in body["outputs"]["outputs"] + + def test_publication_endpoint_missing_uses_typed_error_envelope(client: TestClient) -> None: response = client.get("/publications/does-not-exist") assert response.status_code == 
404 @@ -962,9 +994,17 @@ def test_ogc_async_execution_creates_job_and_results( monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: + monkeypatch.setattr("eo_api.ogc.routes.DOWNLOAD_DIR", tmp_path) monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) _patch_successful_execution(monkeypatch) + artifact_path = tmp_path / "out.json" + artifact_path.write_text('{"dataValues":[{"value":"10.0"}]}', encoding="utf-8") + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, str(artifact_path)), + ) response = client.post( "/ogcapi/processes/generic-dhis2-workflow/execution", @@ -982,7 +1022,73 @@ def test_ogc_async_execution_creates_job_and_results( results_response = client.get(f"/ogcapi/jobs/{job_id}/results") assert results_response.status_code == 200 - assert results_response.json()["run_id"] == job_id + body = results_response.json() + assert "outputs" in body + assert isinstance(body["outputs"], list) + output_ids = {item["id"] for item in body["outputs"]} + assert "data_value_set" in output_ids + assert "output_file" in output_ids + + +def test_ogc_job_results_extended_exposes_native_metadata( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr("eo_api.ogc.routes.DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + artifact_path = tmp_path / "out.json" + artifact_path.write_text('{"dataValues":[{"value":"10.0"}]}', encoding="utf-8") + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, str(artifact_path)), + ) + + response = client.post( + "/ogcapi/processes/generic-dhis2-workflow/execution", + headers={"Prefer": "respond-async"}, + 
json=_valid_public_payload(), + ) + assert response.status_code == 202 + job_id = response.json()["jobID"] + + results_response = client.get(f"/ogcapi/jobs/{job_id}/results", params={"extended": "true"}) + assert results_response.status_code == 200 + body = results_response.json() + assert body["metadata"]["job_id"] == job_id + assert body["metadata"]["workflow_id"] == "dhis2_datavalue_set_v1" + + +def test_ogc_job_download_serves_native_output( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr("eo_api.ogc.routes.DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + _patch_successful_execution(monkeypatch) + artifact_path = tmp_path / "out.json" + artifact_path.write_text('{"dataValues":[{"value":"10.0"}]}', encoding="utf-8") + monkeypatch.setattr( + engine.component_services, + "build_datavalueset_component", + lambda **kwargs: ({"dataValues": [{"value": "10.0"}]}, str(artifact_path)), + ) + + response = client.post( + "/ogcapi/processes/generic-dhis2-workflow/execution", + headers={"Prefer": "respond-async"}, + json=_valid_public_payload(), + ) + assert response.status_code == 202 + job_id = response.json()["jobID"] + + download_response = client.get(f"/ogcapi/jobs/{job_id}/download") + assert download_response.status_code == 200 def test_publications_endpoint_seeds_source_datasets( From 3658f9a710b7474fa4854526779d6ea3a535a054 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Sat, 21 Mar 2026 13:56:49 +0100 Subject: [PATCH 13/15] Fix type-check lint failures --- src/eo_api/components/services.py | 3 ++ src/eo_api/data_accessor/routes.py | 2 + src/eo_api/data_accessor/services/accessor.py | 11 ++-- src/eo_api/main.py | 1 + src/eo_api/ogc/routes.py | 7 +-- src/eo_api/publications/services.py | 1 + src/eo_api/raster/routes.py | 54 +++++++++---------- src/eo_api/workflows/services/definitions.py | 8 +-- 
src/eo_api/workflows/services/engine.py | 3 +- tests/test_data_accessor.py | 13 ++--- tests/test_raster_routes.py | 27 ++++++---- tests/test_workflows.py | 29 +++++----- 12 files changed, 88 insertions(+), 71 deletions(-) diff --git a/src/eo_api/components/services.py b/src/eo_api/components/services.py index d502eb2..b9c803d 100644 --- a/src/eo_api/components/services.py +++ b/src/eo_api/components/services.py @@ -40,6 +40,7 @@ class ComponentRuntimeDefinition: executor: WorkflowStepExecutor config_model: type[BaseModel] + _ERROR_CODES_V1: Final[list[str]] = [ "INPUT_VALIDATION_FAILED", "CONFIG_VALIDATION_FAILED", @@ -463,6 +464,8 @@ def run_download_dataset_step( retries=int(step_config.get("remote_retries", 1)), retry_delay_sec=float(step_config.get("remote_retry_delay_sec", 1.0)), ) + if not isinstance(outputs, dict): + raise RuntimeError("download_dataset remote mode must return a mapping of outputs") return outputs else: runtime.run( diff --git a/src/eo_api/data_accessor/routes.py b/src/eo_api/data_accessor/routes.py index 033b4c5..6afe16d 100644 --- a/src/eo_api/data_accessor/routes.py +++ b/src/eo_api/data_accessor/routes.py @@ -108,6 +108,7 @@ def get_dataset_preview( resource_id=dataset_id, ), ) + assert xmin is not None and ymin is not None and xmax is not None and ymax is not None bbox = [float(xmin), float(ymin), float(xmax), float(ymax)] else: bbox = None @@ -151,6 +152,7 @@ def get_dataset_coverage_summary( resource_id=dataset_id, ), ) + assert xmin is not None and ymin is not None and xmax is not None and ymax is not None bbox = [float(xmin), float(ymin), float(xmax), float(ymax)] else: bbox = None diff --git a/src/eo_api/data_accessor/services/accessor.py b/src/eo_api/data_accessor/services/accessor.py index 0d411ef..81cdc65 100644 --- a/src/eo_api/data_accessor/services/accessor.py +++ b/src/eo_api/data_accessor/services/accessor.py @@ -105,13 +105,12 @@ def get_point_values( ymin, ymax = float(lat_values.min().item()), 
float(lat_values.max().item()) if lon < xmin or lon > xmax or lat < ymin or lat > ymax: raise ValueError( - f"Requested point ({lon}, {lat}) is outside dataset coverage " - f"([{xmin}, {ymin}] to [{xmax}, {ymax}])" + f"Requested point ({lon}, {lat}) is outside dataset coverage ([{xmin}, {ymin}] to [{xmax}, {ymax}])" ) - variable_name = str(dataset.get("variable") or next(iter(ds.data_vars))) + variable_name = str(dataset.get("variable") or str(next(iter(ds.data_vars)))) if variable_name not in ds.data_vars: - variable_name = next(iter(ds.data_vars)) + variable_name = str(next(iter(ds.data_vars))) data_array = ds[variable_name] point = data_array.sel({lon_dim: lon, lat_dim: lat}, method="nearest") @@ -156,9 +155,9 @@ def get_preview_summary( if not ds.data_vars: raise ValueError(f"Dataset '{dataset['id']}' has no data variables available") - variable_name = str(dataset.get("variable") or next(iter(ds.data_vars))) + variable_name = str(dataset.get("variable") or str(next(iter(ds.data_vars)))) if variable_name not in ds.data_vars: - variable_name = next(iter(ds.data_vars)) + variable_name = str(next(iter(ds.data_vars))) data_array = ds[variable_name] lon_dim, lat_dim = get_lon_lat_dims(data_array) time_dim = get_time_dim(data_array) diff --git a/src/eo_api/main.py b/src/eo_api/main.py index c846b50..3824738 100644 --- a/src/eo_api/main.py +++ b/src/eo_api/main.py @@ -39,6 +39,7 @@ async def tile_outside_bounds_handler(request: Request, exc: TileOutsideBounds) ), ) + app.include_router(system.routes.router, tags=["System"]) app.include_router(data_registry.routes.router, prefix="/registry", tags=["Data registry"]) app.include_router(data_manager.routes.router, prefix="/manage", tags=["Data manager"]) diff --git a/src/eo_api/ogc/routes.py b/src/eo_api/ogc/routes.py index 3257acf..5261b21 100644 --- a/src/eo_api/ogc/routes.py +++ b/src/eo_api/ogc/routes.py @@ -112,8 +112,7 @@ def get_ogc_openapi(request: Request) -> dict[str, Any]: "title": "DHIS2 EO API - Native OGC 
Processes", "version": "0.1.0", "description": ( - "Native OGC API - Processes service description " - "for the FastAPI-owned process and job surface." + "Native OGC API - Processes service description for the FastAPI-owned process and job surface." ), }, "servers": [{"url": f"{base_url}/ogcapi"}], @@ -122,9 +121,7 @@ def get_ogc_openapi(request: Request) -> dict[str, Any]: "/conformance": { "get": {"summary": "Conformance", "responses": {"200": {"description": "Conformance classes"}}} }, - "/processes": { - "get": {"summary": "List processes", "responses": {"200": {"description": "Process list"}}} - }, + "/processes": {"get": {"summary": "List processes", "responses": {"200": {"description": "Process list"}}}}, "/processes/{process_id}": { "get": { "summary": "Describe process", diff --git a/src/eo_api/publications/services.py b/src/eo_api/publications/services.py index cd36033..ee2392f 100644 --- a/src/eo_api/publications/services.py +++ b/src/eo_api/publications/services.py @@ -114,6 +114,7 @@ def _coverage_metadata_for_dataset(dataset: dict[str, object]) -> dict[str, obje xmax = spatial.get("xmax") ymax = spatial.get("ymax") if all(value is not None for value in (xmin, ymin, xmax, ymax)): + assert xmin is not None and ymin is not None and xmax is not None and ymax is not None metadata["bbox"] = [float(xmin), float(ymin), float(xmax), float(ymax)] if isinstance(temporal, dict): diff --git a/src/eo_api/raster/routes.py b/src/eo_api/raster/routes.py index e3e15b9..7ad4306 100644 --- a/src/eo_api/raster/routes.py +++ b/src/eo_api/raster/routes.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, cast import attr from fastapi import APIRouter, Depends, HTTPException, Query, Request @@ -93,6 +93,7 @@ def get_raster_capabilities(resource_id: str) -> dict[str, Any]: "titiler": capabilities, } + def _resource_path_dependency(resource_id: str) -> str: """Resolve one published resource to a 
TiTiler-readable Zarr dataset path.""" resource = _resolve_published_resource(resource_id) @@ -149,12 +150,29 @@ def __post_init__(self) -> None: selector_values.append(f"time={self.datetime}") self.sel = selector_values or None - def as_dict(self, **kwargs: Any) -> dict[str, Any]: - values = super().as_dict(**kwargs) + def as_dict(self, exclude_none: bool = True) -> dict[Any, Any]: + values = super().as_dict(exclude_none=exclude_none) values.pop("datetime", None) return values +@dataclass +class RasterImageRenderingParams(ImageRenderingParams): + """Image rendering params with dataset-aware default rescaling.""" + + resource_id: str = Query() + aggregation: str | None = Query(default=None) + + def __post_init__(self) -> None: + raw_rescale = cast(Any, self.__dict__.get("rescale")) + if raw_rescale is None: + profile = _style_profile_for_resource(self.resource_id) + default_range = _default_rescale_for_profile(profile, aggregation=self.aggregation) + if default_range is not None: + self.__dict__["rescale"] = [f"{default_range[0]},{default_range[1]}"] + super().__post_init__() + + @attr.s class AggregatingReader(Reader): """Xarray reader that can collapse a temporal dimension before rendering.""" @@ -296,33 +314,15 @@ def _colormap_dependency( if profile is None: return None - default_map = cmap.get(str(profile["colormap_name"])).copy() + base_colormap = cmap.get(str(profile["colormap_name"])) + if not isinstance(base_colormap, dict): + return base_colormap + default_map = cast(dict[Any, Any], base_colormap.copy()) if str(profile["colormap_name"]) in {"ylorrd", "blues", "viridis"}: default_map[0] = (0, 0, 0, 0) return default_map -def _render_params_dependency( - resource_id: str, - aggregation: str | None = Query(default=None), - rescale: list[str] | None = Query( - default=None, - description="Optional explicit min,max rescaling override.", - ), - color_formula: str | None = Query(default=None), - return_mask: bool | None = Query(default=None, 
alias="return_mask"), -) -> ImageRenderingParams: - params = ImageRenderingParams(rescale=rescale, color_formula=color_formula, add_mask=return_mask) - if params.rescale is not None: - return params - - profile = _style_profile_for_resource(resource_id) - default_range = _default_rescale_for_profile(profile, aggregation=aggregation) - if default_range is not None: - params.rescale = [default_range] - return params - - _factory = TilerFactory( reader=AggregatingReader, router_prefix="", @@ -344,7 +344,7 @@ def _render_params_dependency( ], extensions=[VariablesExtension()], colormap_dependency=_colormap_dependency, - render_dependency=_render_params_dependency, + render_dependency=RasterImageRenderingParams, reader_dependency=RasterReaderParams, add_viewer=False, add_ogc_maps=False, @@ -478,7 +478,7 @@ def _default_rescale_for_profile( range_value = rescale_by_mode.get(mode) or rescale_by_mode.get("datetime") if range_value is None: return None - return tuple(range_value) + return cast(tuple[float, float], tuple(range_value)) def _aggregate_temporal_dataarray( diff --git a/src/eo_api/workflows/services/definitions.py b/src/eo_api/workflows/services/definitions.py index 98e6da0..2e17779 100644 --- a/src/eo_api/workflows/services/definitions.py +++ b/src/eo_api/workflows/services/definitions.py @@ -2,6 +2,7 @@ from __future__ import annotations +from collections.abc import Mapping from pathlib import Path from typing import Any, Literal @@ -14,6 +15,8 @@ SCRIPT_DIR = Path(__file__).parent.resolve() WORKFLOWS_DIR = SCRIPT_DIR.parent.parent.parent.parent / "data" / "workflows" DEFAULT_WORKFLOW_ID = "dhis2_datavalue_set_v1" + + class WorkflowStep(BaseModel): """One component step in a declarative workflow definition.""" @@ -263,7 +266,7 @@ def _normalize_step_inputs( def _validate_workflow_outputs( *, - bindings: dict[str, WorkflowStepInput | WorkflowOutputBinding], + bindings: Mapping[str, WorkflowStepInput | WorkflowOutputBinding], available_outputs: dict[str, 
set[str]], owner: str, ) -> None: @@ -275,8 +278,7 @@ def _validate_workflow_outputs( raise ValueError(f"{owner} reference '{output_name}' points to unknown step '{ref.from_step}'") if ref.output not in available_for_step: raise ValueError( - f"{owner} reference '{output_name}' points to missing output " - f"'{ref.output}' from step '{ref.from_step}'" + f"{owner} reference '{output_name}' points to missing output '{ref.output}' from step '{ref.from_step}'" ) diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py index b6cf003..4adae3d 100644 --- a/src/eo_api/workflows/services/engine.py +++ b/src/eo_api/workflows/services/engine.py @@ -69,7 +69,6 @@ def require_output(self, output_name: str) -> Any: return self.latest_outputs[output_name] - def execute_workflow( request: WorkflowExecuteRequest, *, @@ -275,6 +274,8 @@ def _should_publish_workflow_output( if not _server_allows_workflow_publication(workflow_definition_source=workflow_definition_source): return False if publication.required_output_file_suffixes: + if response.output_file is None: + return False suffix = Path(response.output_file).suffix.lower() return suffix in publication.required_output_file_suffixes return True diff --git a/tests/test_data_accessor.py b/tests/test_data_accessor.py index cb58199..d55adcc 100644 --- a/tests/test_data_accessor.py +++ b/tests/test_data_accessor.py @@ -1,6 +1,7 @@ from __future__ import annotations import numpy as np +import pytest import xarray as xr from fastapi.testclient import TestClient @@ -12,7 +13,7 @@ from eo_api.main import app -def test_get_point_values_returns_time_series(monkeypatch) -> None: +def test_get_point_values_returns_time_series(monkeypatch: pytest.MonkeyPatch) -> None: ds = xr.Dataset( {"precip": (("time", "lat", "lon"), np.array([[[1.0, 2.0]], [[3.0, 4.0]]]))}, coords={ @@ -38,7 +39,7 @@ def test_get_point_values_returns_time_series(monkeypatch) -> None: assert result["values"] == [{"period": "2024-01", 
"value": 2.0}, {"period": "2024-02", "value": 4.0}] -def test_point_query_outside_coverage_returns_typed_error(monkeypatch) -> None: +def test_point_query_outside_coverage_returns_typed_error(monkeypatch: pytest.MonkeyPatch) -> None: ds = xr.Dataset( {"precip": (("time", "lat", "lon"), np.array([[[1.0, 2.0]], [[3.0, 4.0]]]))}, coords={ @@ -66,7 +67,7 @@ def test_point_query_outside_coverage_returns_typed_error(monkeypatch) -> None: assert body["resource_id"] == "chirps3_precipitation_daily" -def test_get_preview_summary_returns_stats_and_sample(monkeypatch) -> None: +def test_get_preview_summary_returns_stats_and_sample(monkeypatch: pytest.MonkeyPatch) -> None: ds = xr.Dataset( {"precip": (("time", "lat", "lon"), np.array([[[1.0, 2.0]], [[3.0, 4.0]]]))}, coords={ @@ -92,7 +93,7 @@ def test_get_preview_summary_returns_stats_and_sample(monkeypatch) -> None: assert result["sample"][0]["period"] == "2024-01" -def test_preview_endpoint_requires_complete_bbox(monkeypatch) -> None: +def test_preview_endpoint_requires_complete_bbox(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr( "eo_api.data_registry.services.datasets.get_dataset", lambda dataset_id: {"id": dataset_id, "variable": "precip", "period_type": "monthly"}, @@ -111,7 +112,7 @@ def test_preview_endpoint_requires_complete_bbox(monkeypatch) -> None: assert body["resource_id"] == "chirps3_precipitation_daily" -def test_get_coverage_summary_wraps_preview_and_full_coverage(monkeypatch) -> None: +def test_get_coverage_summary_wraps_preview_and_full_coverage(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr( "eo_api.data_accessor.services.accessor.get_preview_summary", lambda *args, **kwargs: { @@ -147,7 +148,7 @@ def test_get_coverage_summary_wraps_preview_and_full_coverage(monkeypatch) -> No assert result["subset"]["sample"][0]["value"] == 1.0 -def test_coverage_endpoint_requires_complete_bbox(monkeypatch) -> None: +def test_coverage_endpoint_requires_complete_bbox(monkeypatch: 
pytest.MonkeyPatch) -> None: monkeypatch.setattr( "eo_api.data_registry.services.datasets.get_dataset", lambda dataset_id: {"id": dataset_id, "variable": "precip", "period_type": "monthly"}, diff --git a/tests/test_raster_routes.py b/tests/test_raster_routes.py index 67a1c55..4c212dd 100644 --- a/tests/test_raster_routes.py +++ b/tests/test_raster_routes.py @@ -1,6 +1,9 @@ from __future__ import annotations +from pathlib import Path + import numpy as np +import pytest import xarray as xr from fastapi.testclient import TestClient @@ -8,7 +11,7 @@ from eo_api.raster import routes as raster_routes -def test_raster_capabilities_report_missing_zarr_archive(monkeypatch) -> None: +def test_raster_capabilities_report_missing_zarr_archive(monkeypatch: pytest.MonkeyPatch) -> None: client = TestClient(app) with monkeypatch.context() as patcher: patcher.setattr(raster_routes, "get_zarr_path", lambda dataset: None) @@ -23,7 +26,7 @@ def test_raster_capabilities_report_missing_zarr_archive(monkeypatch) -> None: assert "build_zarr" in body["titiler"]["reason"] -def test_raster_variables_route_rejects_resource_without_zarr_archive(monkeypatch) -> None: +def test_raster_variables_route_rejects_resource_without_zarr_archive(monkeypatch: pytest.MonkeyPatch) -> None: client = TestClient(app) with monkeypatch.context() as patcher: patcher.setattr(raster_routes, "get_zarr_path", lambda dataset: None) @@ -35,7 +38,7 @@ def test_raster_variables_route_rejects_resource_without_zarr_archive(monkeypatc assert body["error_code"] == "RASTER_PUBLICATION_UNSUPPORTED" -def test_raster_variables_route_uses_zarr_backed_xarray_reader(tmp_path, monkeypatch) -> None: +def test_raster_variables_route_uses_zarr_backed_xarray_reader(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" xr.Dataset( data_vars={ @@ -57,7 +60,7 @@ def test_raster_variables_route_uses_zarr_backed_xarray_reader(tmp_path, monkeyp assert response.json() == ["precip"] 
-def test_raster_preview_requires_datetime_for_temporal_dataset(tmp_path, monkeypatch) -> None: +def test_raster_preview_requires_datetime_for_temporal_dataset(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" xr.Dataset( data_vars={ @@ -81,7 +84,9 @@ def test_raster_preview_requires_datetime_for_temporal_dataset(tmp_path, monkeyp assert body["error_code"] == "RASTER_DATETIME_REQUIRED" -def test_raster_preview_with_datetime_renders_single_time_slice(tmp_path, monkeypatch) -> None: +def test_raster_preview_with_datetime_renders_single_time_slice( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" xr.Dataset( data_vars={ @@ -97,16 +102,16 @@ def test_raster_preview_with_datetime_renders_single_time_slice(tmp_path, monkey monkeypatch.setattr(raster_routes, "get_zarr_path", lambda dataset: zarr_path) client = TestClient(app) - response = client.get( - "/raster/chirps3_precipitation_daily/preview.png?variable=precip&datetime=2024-01-01" - ) + response = client.get("/raster/chirps3_precipitation_daily/preview.png?variable=precip&datetime=2024-01-01") assert response.status_code == 200 assert response.headers["content-type"] == "image/png" assert response.content -def test_raster_preview_with_aggregation_renders_time_reduced_image(tmp_path, monkeypatch) -> None: +def test_raster_preview_with_aggregation_renders_time_reduced_image( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" xr.Dataset( data_vars={ @@ -132,7 +137,7 @@ def test_raster_preview_with_aggregation_renders_time_reduced_image(tmp_path, mo assert response.content -def test_raster_preview_rejects_aggregation_without_range(tmp_path, monkeypatch) -> None: +def test_raster_preview_rejects_aggregation_without_range(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: zarr_path = tmp_path / 
"chirps3_precipitation_daily.zarr" xr.Dataset( data_vars={ @@ -156,7 +161,7 @@ def test_raster_preview_rejects_aggregation_without_range(tmp_path, monkeypatch) assert body["error_code"] == "RASTER_TEMPORAL_QUERY_INVALID" -def test_raster_tile_outside_bounds_returns_404(tmp_path, monkeypatch) -> None: +def test_raster_tile_outside_bounds_returns_404(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" xr.Dataset( data_vars={ diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 3d62b69..f4b0042 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1165,18 +1165,22 @@ def test_generated_pygeoapi_config_uses_real_source_coverage_extent( }, ).rio.write_crs("EPSG:4326").to_zarr(zarr_path, mode="w") monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) - monkeypatch.setattr(publication_services, "list_datasets", lambda: [ - { - "id": "chirps3_precipitation_daily", - "name": "Total precipitation (CHIRPS3)", - "variable": "precip", - "period_type": "daily", - "source": "CHIRPS v3", - "source_url": "https://example.test/chirps", - "resolution": "5 km x 5 km", - "units": "mm", - } - ]) + monkeypatch.setattr( + publication_services, + "list_datasets", + lambda: [ + { + "id": "chirps3_precipitation_daily", + "name": "Total precipitation (CHIRPS3)", + "variable": "precip", + "period_type": "daily", + "source": "CHIRPS v3", + "source_url": "https://example.test/chirps", + "resolution": "5 km x 5 km", + "units": "mm", + } + ], + ) monkeypatch.setattr( publication_services, "get_data_coverage", @@ -1917,6 +1921,7 @@ def _spatial_aggregation_component(**kwargs: Any) -> list[dict[str, Any]]: response = engine.execute_workflow(request, include_component_run_details=True) assert response.status == "completed" + assert response.data_value_set is not None assert response.data_value_set["dataValues"][0]["period"] == "202401" From 
e6181a97b9e23388b92eef4c5ea0a85153a050de Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Mon, 23 Mar 2026 08:45:41 +0100 Subject: [PATCH 14/15] fix: use path param for raster resource_id dependency --- src/eo_api/raster/routes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eo_api/raster/routes.py b/src/eo_api/raster/routes.py index 7ad4306..aa9db3a 100644 --- a/src/eo_api/raster/routes.py +++ b/src/eo_api/raster/routes.py @@ -7,7 +7,7 @@ from typing import Any, cast import attr -from fastapi import APIRouter, Depends, HTTPException, Query, Request +from fastapi import APIRouter, Depends, HTTPException, Path, Query, Request from rio_tiler.colormap import cmap from rio_tiler.io.xarray import XarrayReader from titiler.core.dependencies import ImageRenderingParams @@ -160,7 +160,7 @@ def as_dict(self, exclude_none: bool = True) -> dict[Any, Any]: class RasterImageRenderingParams(ImageRenderingParams): """Image rendering params with dataset-aware default rescaling.""" - resource_id: str = Query() + resource_id: str = Path() aggregation: str | None = Query(default=None) def __post_init__(self) -> None: From b59d662a7b95fcaf1a6a580f14eb248c9228bd33 Mon Sep 17 00:00:00 2001 From: Abyot Asalefew Gizaw Date: Wed, 25 Mar 2026 06:02:49 +0100 Subject: [PATCH 15/15] feat: make workflow publication explicit per run --- docs/internal/OGC_WORKFLOW_DEMO_REFERENCE.md | 352 ++++++++++++++++++ .../PYGEOAPI_MAINTAINABILITY_ARGUMENT.md | 138 +++++++ src/eo_api/components/routes.py | 12 +- src/eo_api/main.py | 2 +- src/eo_api/raster/routes.py | 5 +- src/eo_api/workflows/schemas.py | 2 + src/eo_api/workflows/services/engine.py | 4 + .../workflows/services/simple_mapper.py | 1 + tests/test_workflows.py | 28 ++ 9 files changed, 535 insertions(+), 9 deletions(-) create mode 100644 docs/internal/OGC_WORKFLOW_DEMO_REFERENCE.md create mode 100644 docs/internal/PYGEOAPI_MAINTAINABILITY_ARGUMENT.md diff --git a/docs/internal/OGC_WORKFLOW_DEMO_REFERENCE.md 
b/docs/internal/OGC_WORKFLOW_DEMO_REFERENCE.md new file mode 100644 index 0000000..f883607 --- /dev/null +++ b/docs/internal/OGC_WORKFLOW_DEMO_REFERENCE.md @@ -0,0 +1,352 @@ +# OGC Workflow Demo Reference + +## Purpose + +This document is a stable operator/demo reference for the current branch shape. + +It answers: + +1. how to fetch CHIRPS3 and WorldPop data +2. how to run workflow-backed executions +3. how to inspect jobs and results +4. what the difference is between processes and jobs +5. how to browse published collections +6. why both `/ogcapi` and `/pygeoapi` currently exist + +## Current Route Model + +The current runtime split is: + +1. `/workflows` = native workflow control plane +2. `/ogcapi` = native OGC API - Processes / Jobs adapter +3. `/pygeoapi` = mounted collection/items browse shell for published resources +4. `/publications` = publication registry +5. `/raster` = raster capabilities / rendering layer + +Important current note: + +Some older docs still refer to `/ogcapi/collections`. In the current codebase, collection browsing is under `/pygeoapi/collections`. + +## Dataset IDs Used In Examples + +1. `chirps3_precipitation_daily` +2. `worldpop_population_yearly` + +WorldPop usually needs `country_code`. + +## A. Direct Dataset Fetch Examples + +These examples fetch/cache source data directly. They do not create workflow jobs. + +### A1. Fetch CHIRPS3 + +```bash +curl -X POST http://localhost:8000/components/download-dataset \ + -H 'Content-Type: application/json' \ + -d '{ + "dataset_id": "chirps3_precipitation_daily", + "start": "2024-01-01", + "end": "2024-01-31", + "overwrite": false + }' +``` + +### A2. Fetch WorldPop + +```bash +curl -X POST http://localhost:8000/components/download-dataset \ + -H 'Content-Type: application/json' \ + -d '{ + "dataset_id": "worldpop_population_yearly", + "start": "2020", + "end": "2020", + "country_code": "ETH", + "overwrite": false + }' +``` + +### A3. 
Alternative Legacy Download Routes + +These older routes still exist, but they are not job-backed and are less aligned with the newer component/workflow model. + +```bash +curl "http://localhost:8000/manage/chirps3_precipitation_daily/download?start=2024-01-01&end=2024-01-31" +curl "http://localhost:8000/manage/worldpop_population_yearly/download?start=2020&end=2020" +``` + +## B. Workflow Execution Example + +This is the canonical native workflow execution shape: + +Important current note: + +`publishable` in workflow policy does not automatically publish every successful run. +The caller must explicitly opt in with `"publish": true`, and the workflow must also allow publication. + +```bash +curl -X POST http://localhost:8000/workflows/dhis2-datavalue-set \ + -H 'Content-Type: application/json' \ + -d '{ + "request": { + "workflow_id": "dhis2_datavalue_set_v1", + "dataset_id": "chirps3_precipitation_daily", + "start_date": "2024-01-01", + "end_date": "2024-01-31", + "org_unit_level": 3, + "data_element": "abc123def45", + "temporal_resolution": "monthly", + "temporal_reducer": "sum", + "spatial_reducer": "mean", + "publish": true, + "dry_run": true + } + }' +``` + +If this succeeds, the response includes a `run_id`. That `run_id` is the native workflow job ID. + +Important current note: + +This path depends on DHIS2-backed feature resolution. If DHIS2 is unavailable or misconfigured, this request can fail with `503`. + +## C. Processes vs Jobs + +A `process` is a reusable executable capability. + +A `job` is one concrete execution of a process. + +In the current OGC layer, there is one exposed generic process: + +1. `generic-dhis2-workflow` + +So the model is: + +1. process = "what can this server execute?" +2. job = "one recorded run of that execution capability" + +The native workflow layer also persists jobs, but it does not use OGC process terminology in its route naming. + +## D. How To Inspect Jobs And Results + +### D1. 
Native Workflow View + +List jobs: + +```bash +curl http://localhost:8000/workflows/jobs +``` + +Get one job: + +```bash +curl http://localhost:8000/workflows/jobs/{job_id} +``` + +Get persisted result payload: + +```bash +curl http://localhost:8000/workflows/jobs/{job_id}/result +``` + +Get persisted trace/log payload: + +```bash +curl http://localhost:8000/workflows/jobs/{job_id}/trace +``` + +Delete one job and cascade its owned artifacts: + +```bash +curl -X DELETE http://localhost:8000/workflows/jobs/{job_id} +``` + +### D2. OGC View Over The Same Execution Layer + +List OGC processes: + +```bash +curl http://localhost:8000/ogcapi/processes +``` + +Describe the exposed generic process: + +```bash +curl http://localhost:8000/ogcapi/processes/generic-dhis2-workflow +``` + +List OGC jobs: + +```bash +curl http://localhost:8000/ogcapi/jobs +``` + +Get one OGC job: + +```bash +curl http://localhost:8000/ogcapi/jobs/{job_id} +``` + +Get OGC job results: + +```bash +curl http://localhost:8000/ogcapi/jobs/{job_id}/results +``` + +Get OGC job results plus extra native metadata: + +```bash +curl "http://localhost:8000/ogcapi/jobs/{job_id}/results?extended=true" +``` + +Download the native output artifact if available: + +```bash +curl -OJ http://localhost:8000/ogcapi/jobs/{job_id}/download +``` + +## E. How To Browse Published Collections + +### E1. Inspect Publication Registry + +List publications: + +```bash +curl http://localhost:8000/publications +``` + +Get CHIRPS3 source publication: + +```bash +curl http://localhost:8000/publications/dataset-chirps3_precipitation_daily +``` + +Get WorldPop source publication: + +```bash +curl http://localhost:8000/publications/dataset-worldpop_population_yearly +``` + +### E2. 
Browse Published Collections + +List collections: + +```bash +curl http://localhost:8000/pygeoapi/collections +curl "http://localhost:8000/pygeoapi/collections?f=html" +``` + +Open CHIRPS3 collection: + +```bash +curl http://localhost:8000/pygeoapi/collections/chirps3_precipitation_daily +``` + +Open WorldPop collection: + +```bash +curl http://localhost:8000/pygeoapi/collections/worldpop_population_yearly +``` + +### E3. Raster-Specific Checks + +These two source datasets are primarily coverage/raster resources, so their raster capabilities are also important: + +```bash +curl http://localhost:8000/raster/chirps3_precipitation_daily/capabilities +curl http://localhost:8000/raster/worldpop_population_yearly/capabilities +``` + +### E4. Derived Workflow Output Collections + +If a publishable workflow run succeeds, the system can register a derived publication: + +1. the workflow result is persisted as a job +2. a `PublishedResource` may be created +3. that publication may become visible under `/pygeoapi/collections/{collection_id}` + +The easiest path is: + +1. run a publishable workflow +2. open `/workflows/jobs/{job_id}` +3. follow its `collection` link if present + +## F. Why Both `/ogcapi` And `/pygeoapi` Exist + +Current intent: + +1. `/ogcapi` is the native canonical OGC process/job surface +2. `/pygeoapi` is the current generic browse shell for collection/items publication + +This means: + +1. execution semantics stay FastAPI-owned +2. publication truth stays FastAPI-owned through `PublishedResource` +3. generic collection/items browsing is still delegated to `pygeoapi` where it adds value + +This is a pragmatic transition state, not necessarily the final public shape. + +## G. Publication Bridge Lifecycle + +The publication bridge is file-backed and explicit. + +Current lifecycle: + +1. a source dataset or workflow output becomes eligible for publication +2. the backend registers publication truth as a `PublishedResource` JSON record +3. 
the pygeoapi projection layer reads those publication records +4. the system generates pygeoapi YAML/OpenAPI documents from that publication state +5. the mounted `/pygeoapi` app serves collections/items from that generated configuration + +In short: + +```text +workflow/source dataset + -> publication registration + -> PublishedResource JSON + -> generated pygeoapi YAML + -> /pygeoapi collection +``` + +Current storage/projection locations: + +1. publication state: + - `data/downloads/published_resources/*.json` +2. generated pygeoapi projection: + - `data/downloads/pygeoapi/pygeoapi-config.generated.yml` + - `data/downloads/pygeoapi/pygeoapi-openapi.generated.yml` + +Important architectural point: + +The JSON publication record is the source of truth. +The pygeoapi YAML is generated serving configuration, not the primary publication database. + +## H. Can The System Eventually Live With Just One? + +Yes. + +The accepted architectural direction is: + +1. `/ogcapi` remains canonical +2. `/pygeoapi` is secondary and potentially transitional + +That means the long-term convergence options are: + +1. move collection/resource routes into native FastAPI under `/ogcapi` +2. or keep pygeoapi as an internal implementation component while exposing one canonical `/ogcapi` surface + +For now, keeping both is reasonable because it preserves a clean native process/job model while still reusing pygeoapi's browse capabilities. + +## I. Short Demo Sequence + +Good operator/demo flow: + +1. `GET /workflows` +2. `GET /ogcapi/processes` +3. `GET /publications` +4. `GET /pygeoapi/collections?f=html` +5. `GET /pygeoapi/collections/chirps3_precipitation_daily` +6. `GET /raster/chirps3_precipitation_daily/capabilities` +7. run one workflow if DHIS2 is available +8. `GET /workflows/jobs/{job_id}` +9. `GET /ogcapi/jobs/{job_id}/results` +10. 
browse the derived collection if the run published one diff --git a/docs/internal/PYGEOAPI_MAINTAINABILITY_ARGUMENT.md b/docs/internal/PYGEOAPI_MAINTAINABILITY_ARGUMENT.md new file mode 100644 index 0000000..72b6433 --- /dev/null +++ b/docs/internal/PYGEOAPI_MAINTAINABILITY_ARGUMENT.md @@ -0,0 +1,138 @@ +# Pygeoapi Maintainability Argument + +## Purpose + +This note captures the justification for the current publication bridge architecture, especially in response to the critique that an earlier integration used only a few lines of code via: + +```python +from pygeoapi.starlette_app import APP as pygeoapi_app +from pygeoapi.starlette_app import CONFIG +``` + +The short version is: + +- the old approach looked simpler because it leaned on pygeoapi runtime globals +- the current approach is more maintainable because EO API now owns publication truth explicitly +- pygeoapi is still used for standard OGC serving, but it is no longer the owner of backend publication state + +## The Real Question + +The architectural question is not: + +- "Should we use the standard pygeoapi implementation or build our own?" + +The real question is: + +- "Which part of the system should own publication state and execution-linked publication lifecycle?" + +The current branch does **not** replace pygeoapi as a standards-oriented serving component. + +Instead, it moves ownership of publication truth into the EO API backend and uses pygeoapi as one serving projection of that truth. + +## What The Thin Old Approach Did Well + +The old approach had a real strength: + +- it made pygeoapi integration look very small and direct + +That is useful for: + +- quick demos +- minimal bootstrapping +- proving that pygeoapi can be mounted and served successfully + +For a spike or early prototype, this is a good move. + +## What The Thin Old Approach Hid + +The small wrapper did not remove complexity. +It mostly relocated complexity into pygeoapi runtime globals and config handling. 
+ +That creates ambiguity around: + +1. where publication truth actually lives +2. how dynamic publication updates should happen +3. how job/output lifecycle links to collections +4. how cleanup and deletion should remove published resources +5. how source and derived resources should share one publication model +6. how nightly refreshes should update collection extents and metadata + +So the old approach was shorter, but it encouraged a hidden ownership model: + +- pygeoapi `APP` and `CONFIG` start to feel like the publication database + +That is the maintainability problem. + +## Why The Current Approach Is Cleaner + +The current branch introduces an explicit publication bridge: + +```text +workflow/source dataset + -> publication registration + -> PublishedResource JSON + -> generated pygeoapi YAML + -> /pygeoapi collection +``` + +This gives each layer one responsibility: + +1. EO API backend owns execution truth +2. EO API backend owns publication truth through `PublishedResource` +3. generated pygeoapi config is a serving projection of that truth +4. pygeoapi remains the standards-oriented collection/items serving layer + +This is cleaner because: + +- publication state is explicit +- job-to-publication linkage is explicit +- cleanup semantics are explicit +- source and derived resources share one model +- pygeoapi is replaceable because it consumes projection rather than owning truth + +## Why This Is More Maintainable Long Term + +Long-term maintainability improves because the current architecture: + +1. avoids treating pygeoapi runtime globals as the domain model +2. gives EO API a backend-owned publication record that can evolve independently +3. supports dynamic workflow outputs without hardcoding unknown future collections +4. allows retention/cleanup to remove publications coherently +5. allows metadata refresh and extent updates to happen in backend-owned publication state first +6. 
reduces framework lock-in because pygeoapi becomes an adapter, not the publication brain + +In short: + +- the old approach optimized for a short integration seam +- the current approach optimizes for explicit ownership and lifecycle clarity + +That is the better long-term tradeoff. + +## What We Still Reuse From Pygeoapi + +This architecture does **not** argue against pygeoapi. + +We still rely on pygeoapi for: + +1. standards-oriented collection/items serving +2. provider-backed feature/coverage publication +3. generic browse behavior +4. HTML/JSON OGC browse surfaces while that still adds value + +So the current model is not: + +- "replace the standard implementation" + +It is: + +- "keep the standard implementation for serving, but stop using it as the owner of backend publication state" + +## Short Defense Statement + +Use this if you need a short explanation: + +> The earlier pygeoapi integration was shorter because it outsourced state handling to pygeoapi’s app/config layer. The current approach is more maintainable because EO API now owns workflow-aware publication truth explicitly, while pygeoapi remains the standard serving layer fed by generated configuration. + +## Even Shorter Version + +> We still use pygeoapi for standard OGC serving. What changed is that publication truth now lives in EO API instead of pygeoapi runtime globals. That is why the current design is more maintainable. 
diff --git a/src/eo_api/components/routes.py b/src/eo_api/components/routes.py index 3754535..feab9c4 100644 --- a/src/eo_api/components/routes.py +++ b/src/eo_api/components/routes.py @@ -40,13 +40,13 @@ def _json_safe_records(records: list[dict[str, Any]]) -> list[dict[str, Any]]: return [{key: _to_jsonable_scalar(value) for key, value in record.items()} for record in records] -@router.get("/components", response_model=ComponentCatalogResponse, response_model_exclude_none=True) +@router.get("", response_model=ComponentCatalogResponse, response_model_exclude_none=True) def list_components(include_internal: bool = Query(default=False)) -> ComponentCatalogResponse: """List all discoverable reusable components.""" return ComponentCatalogResponse(components=services.component_catalog(include_internal=include_internal)) -@router.post("/components/feature-source", response_model=FeatureSourceRunResponse) +@router.post("/feature-source", response_model=FeatureSourceRunResponse) def run_feature_source(payload: FeatureSourceRunRequest) -> FeatureSourceRunResponse: """Resolve feature source to features and bbox.""" features, bbox = services.feature_source_component(payload.feature_source) @@ -57,7 +57,7 @@ def run_feature_source(payload: FeatureSourceRunRequest) -> FeatureSourceRunResp ) -@router.post("/components/download-dataset", response_model=DownloadDatasetRunResponse) +@router.post("/download-dataset", response_model=DownloadDatasetRunResponse) def run_download_dataset(payload: DownloadDatasetRunRequest) -> DownloadDatasetRunResponse: """Download dataset files for the selected period/scope.""" dataset = services.require_dataset(payload.dataset_id) @@ -78,7 +78,7 @@ def run_download_dataset(payload: DownloadDatasetRunRequest) -> DownloadDatasetR ) -@router.post("/components/temporal-aggregation", response_model=TemporalAggregationRunResponse) +@router.post("/temporal-aggregation", response_model=TemporalAggregationRunResponse) def run_temporal_aggregation(payload: 
TemporalAggregationRunRequest) -> TemporalAggregationRunResponse: """Aggregate a dataset temporally.""" dataset = services.require_dataset(payload.dataset_id) @@ -97,7 +97,7 @@ def run_temporal_aggregation(payload: TemporalAggregationRunRequest) -> Temporal ) -@router.post("/components/spatial-aggregation", response_model=SpatialAggregationRunResponse) +@router.post("/spatial-aggregation", response_model=SpatialAggregationRunResponse) def run_spatial_aggregation(payload: SpatialAggregationRunRequest) -> SpatialAggregationRunResponse: """Aggregate a dataset spatially to features.""" dataset = services.require_dataset(payload.dataset_id) @@ -120,7 +120,7 @@ def run_spatial_aggregation(payload: SpatialAggregationRunRequest) -> SpatialAgg ) -@router.post("/components/build-datavalue-set", response_model=BuildDataValueSetRunResponse) +@router.post("/build-datavalue-set", response_model=BuildDataValueSetRunResponse) def run_build_datavalueset(payload: BuildDataValueSetRunRequest) -> BuildDataValueSetRunResponse: """Build and serialize a DHIS2 DataValueSet from records.""" data_value_set, output_file = services.build_datavalueset_component( diff --git a/src/eo_api/main.py b/src/eo_api/main.py index 3824738..06fcbc1 100644 --- a/src/eo_api/main.py +++ b/src/eo_api/main.py @@ -49,7 +49,7 @@ async def tile_outside_bounds_handler(request: Request, exc: TileOutsideBounds) app.include_router(publication_routes.router, prefix="/publications", tags=["Publications"]) app.include_router(publication_generated_routes.router, prefix="/publications", tags=["Publications"]) app.include_router(analytics_viewer.routes.router, prefix="/analytics", tags=["Analytics"]) -app.include_router(components.routes.router, tags=["Components"]) +app.include_router(components.routes.router, prefix="/components", tags=["Components"]) app.include_router(ogc_routes.router, prefix="/ogcapi", tags=["OGC API"]) app.mount("/data", StaticFiles(directory="data/downloads"), name="Data") app.mount("/pygeoapi", 
ogc_api_app) diff --git a/src/eo_api/raster/routes.py b/src/eo_api/raster/routes.py index aa9db3a..a105055 100644 --- a/src/eo_api/raster/routes.py +++ b/src/eo_api/raster/routes.py @@ -7,7 +7,8 @@ from typing import Any, cast import attr -from fastapi import APIRouter, Depends, HTTPException, Path, Query, Request +from fastapi import APIRouter, Depends, HTTPException, Query, Request +from fastapi import Path as FastAPIPath from rio_tiler.colormap import cmap from rio_tiler.io.xarray import XarrayReader from titiler.core.dependencies import ImageRenderingParams @@ -160,7 +161,7 @@ def as_dict(self, exclude_none: bool = True) -> dict[Any, Any]: class RasterImageRenderingParams(ImageRenderingParams): """Image rendering params with dataset-aware default rescaling.""" - resource_id: str = Path() + resource_id: str = FastAPIPath() aggregation: str | None = Query(default=None) def __post_init__(self) -> None: diff --git a/src/eo_api/workflows/schemas.py b/src/eo_api/workflows/schemas.py index b587a66..b75d2fd 100644 --- a/src/eo_api/workflows/schemas.py +++ b/src/eo_api/workflows/schemas.py @@ -86,6 +86,7 @@ class WorkflowExecuteRequest(BaseModel): dataset_id: str start: str end: str + publish: bool = False overwrite: bool = False country_code: str | None = None feature_source: FeatureSourceConfig @@ -257,6 +258,7 @@ class WorkflowRequest(BaseModel): temporal_resolution: PeriodType = PeriodType.MONTHLY temporal_reducer: AggregationMethod = AggregationMethod.SUM spatial_reducer: AggregationMethod = AggregationMethod.MEAN + publish: bool = False overwrite: bool = False dry_run: bool = True feature_id_property: str = "id" diff --git a/src/eo_api/workflows/services/engine.py b/src/eo_api/workflows/services/engine.py index 4adae3d..5a63314 100644 --- a/src/eo_api/workflows/services/engine.py +++ b/src/eo_api/workflows/services/engine.py @@ -168,6 +168,7 @@ def execute_workflow( ) mark_job_success(job_id=runtime.run_id, response=response) if _should_publish_workflow_output( + 
request=request, response=response, publication=workflow.publication, workflow_definition_source=workflow_definition_source, @@ -262,11 +263,14 @@ def execute_workflow( def _should_publish_workflow_output( *, + request: WorkflowExecuteRequest, response: WorkflowExecuteResponse, publication: WorkflowPublicationPolicy, workflow_definition_source: Literal["catalog", "inline"], ) -> bool: """Apply workflow-level publication policy to a concrete workflow output.""" + if not request.publish: + return False if not publication.publishable: return False if publication.strategy != "on_success": diff --git a/src/eo_api/workflows/services/simple_mapper.py b/src/eo_api/workflows/services/simple_mapper.py index 07fb368..164b9e6 100644 --- a/src/eo_api/workflows/services/simple_mapper.py +++ b/src/eo_api/workflows/services/simple_mapper.py @@ -78,6 +78,7 @@ def normalize_simple_request(payload: WorkflowRequest) -> tuple[WorkflowExecuteR dataset_id=dataset_id, start=start, end=end, + publish=inputs.publish, overwrite=inputs.overwrite, country_code=inputs.country_code, feature_source=feature_source, diff --git a/tests/test_workflows.py b/tests/test_workflows.py index f4b0042..a291ae9 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -40,6 +40,7 @@ def _valid_public_payload() -> dict[str, Any]: "temporal_resolution": "monthly", "temporal_reducer": "sum", "spatial_reducer": "mean", + "publish": True, "dry_run": True, "include_component_run_details": False, } @@ -1546,6 +1547,33 @@ def test_generated_pygeoapi_config_reflects_publication_registry( assert chirps["metadata"]["dataset_id"] == "chirps3_precipitation_daily" +def test_publishable_workflow_requires_request_publish_flag( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setattr(run_logs, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(job_store, "DOWNLOAD_DIR", tmp_path) + monkeypatch.setattr(publication_services, "DOWNLOAD_DIR", tmp_path) + 
monkeypatch.setattr(publication_pygeoapi, "DOWNLOAD_DIR", tmp_path) + zarr_path = tmp_path / "chirps3_precipitation_daily.zarr" + zarr_path.mkdir(parents=True) + monkeypatch.setattr(publication_pygeoapi, "get_zarr_path", lambda dataset: zarr_path) + _patch_successful_execution(monkeypatch) + + payload = _valid_public_payload() + payload["request"]["publish"] = False + + workflow_response = client.post("/workflows/dhis2-datavalue-set", json=payload) + assert workflow_response.status_code == 200 + run_id = workflow_response.json()["run_id"] + + publications_response = client.get("/publications", params={"workflow_id": "dhis2_datavalue_set_v1"}) + assert publications_response.status_code == 200 + resources = publications_response.json()["resources"] + assert all(item["resource_id"] != f"workflow-output-{run_id}" for item in resources) + + def test_generated_pygeoapi_openapi_includes_derived_collection( client: TestClient, monkeypatch: pytest.MonkeyPatch,