diff --git a/Cargo.toml b/Cargo.toml index f7f607c..bac442c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -96,7 +96,7 @@ config-reload = ["config", "parking_lot", "tokio", "tracing"] config-postgres = ["config", "sqlx", "tokio", "serde_json"] # Transport features -transport = ["tokio", "serde_json", "rmp-serde", "chrono", "async-trait"] +transport = ["tokio", "serde_json", "rmp-serde", "chrono", "async-trait", "regex", "memchr"] transport-memory = ["transport"] transport-kafka = ["transport", "rdkafka", "regex", "tokio-util"] transport-grpc = ["transport", "dep:tonic", "dep:tonic-prost", "dep:prost", "dep:prost-types", "dep:tonic-prost-build", "dep:prost-build"] @@ -232,6 +232,7 @@ sonic-rs = { version = ">=0.5, <1", optional = true } # Regex (topic resolver include/exclude filters) regex = { version = ">=1.11, <2", optional = true } +memchr = { version = ">=2.7", optional = true } # Async trait (for tiered-sink Sink trait) async-trait = { version = ">=0.1.88, <0.2", optional = true } @@ -306,3 +307,8 @@ harness = false name = "engine_benchmark" harness = false required-features = ["worker"] + +[[bench]] +name = "filter_benchmark" +harness = false +required-features = ["transport-memory"] diff --git a/STATE.md b/STATE.md index b116cc6..54b9564 100644 --- a/STATE.md +++ b/STATE.md @@ -39,6 +39,7 @@ Modular library with feature-gated components. Each module can be enabled/disabl 15. **memory** - MemoryGuard: cgroup-aware memory backpressure with auto-detection 16. **scaling** - ScalingPressure: KEDA autoscaling signal calculation 17. **cli** - DfeApp trait, ServiceRuntime (pre-wired metrics + worker pool + batch engine + memory guard + shutdown) +18. **transport-filter** - TransportFilterEngine: CEL-syntax message filtering embedded in every transport. Tier 1 SIMD field ops (~50-100ns), Tier 2 compiled CEL (opt-in), Tier 3 complex CEL with regex/iteration (opt-in). Inbound/outbound, drop/dlq, first-match-wins. ### Tech Stack @@ -154,6 +155,7 @@ spool, cache, secrets, HTTP client, DLQ — with zero additional wiring. - **DFE parallelisation pattern** — split sequential hot loops into parallel (pure `&self` computation via rayon) and sequential (mutable state: buffer push, mark_pending, stats, DLQ) phases. The `BatchProcessor` trait + `BatchPipeline` struct in rustlib provide the common framework. Each DFE app implements `BatchProcessor` for its domain. See `src/pipeline/` module. - **ServiceRuntime** — pre-built infrastructure for DFE service apps. Created by `run_app()` before `run_service()`. Contains MetricsManager, DfeMetrics, MemoryGuard (optional), shutdown token (with K8s pre-stop delay), worker pool (optional), batch engine (optional), scaling pressure (optional), RuntimeContext. Apps receive it fully wired. See `src/cli/runtime.rs`. - **BatchEngine** — SIMD-optimised batch processing for DFE pipelines. Two modes: `process_mid_tier()` (parse JSON via sonic-rs + parallel transform via rayon) and `process_raw()` (skip parsing, parallel transform on raw bytes). Transport-wired: `run_async()` / `run_raw_async()` with async sink, sink-managed commit tokens, and optional ticker callback. See `src/worker/engine/`. +- **TransportFilterEngine** — CEL-syntax message filtering embedded in every transport (Kafka, gRPC, Memory, File, Pipe, HTTP, Redis). Three performance tiers: Tier 1 (SIMD field ops via sonic_rs::get_from_slice, ~50-200ns/msg, always enabled), Tier 2 (compiled CEL with extracted fields, requires `expression.allow_cel_filters_in/out`), Tier 3 (CEL with regex/iteration/time, requires `expression.allow_complex_filters_in/out`). Operators write CEL syntax — engine classifies via text pattern matching and bypasses CEL engine entirely for Tier 1. First-match-wins, drop/dlq actions, fail-fast at startup. Zero downstream code changes — config-only activation. See `src/transport/filter/`. - **RuntimeContext** — rich runtime metadata detected once at startup (pod_name, namespace, node_name, container_id, memory_limit_bytes, cpu_quota_cores). Global singleton via OnceLock. All modules read from this instead of doing their own env var lookups. No-ops on bare metal. See `src/env.rs`. - **K8s pre-stop compliance** — shutdown handler sleeps `PRESTOP_DELAY_SECS` (default 5 in K8s, 0 elsewhere) before cancelling the token. Prevents traffic routing to a draining pod. - **Deployment contract CI bridge** — `container-manifest.json` (minimal CI subset), `Dockerfile.runtime` (runtime stage fragment for CI composition), OCI labels (static from contract, dynamic injected by CI), `from_cargo_toml()` for auto-detecting native deps, `schema_version` field. diff --git a/benches/filter_benchmark.rs b/benches/filter_benchmark.rs new file mode 100644 index 0000000..a3e6fec --- /dev/null +++ b/benches/filter_benchmark.rs @@ -0,0 +1,233 @@ +// Project: hyperi-rustlib +// File: benches/filter_benchmark.rs +// Purpose: Criterion benchmarks for transport filter engine performance +// Language: Rust +// +// License: FSL-1.1-ALv2 +// Copyright: (c) 2026 HYPERI PTY LIMITED + +//! Benchmarks for the transport filter engine. +//! +//! Validates the design assumption: Tier 1 filters are ~50-100ns/msg via SIMD, +//! and the no-filter overhead is negligible. + +use criterion::{Criterion, Throughput, criterion_group, criterion_main}; + +use hyperi_rustlib::transport::filter::{ + FilterAction, FilterDisposition, FilterRule, TransportFilterEngine, TransportFilterTierConfig, +}; + +const SAMPLE_PAYLOAD: &[u8] = br#"{"_table":"events","host":"prod-web01","source_type":"syslog","severity":3,"id":12345,"timestamp":"2026-04-10T12:00:00Z","message":"Sample log event with some realistic padding for benchmarking"}"#; + +const POISON_PAYLOAD: &[u8] = br#"{"_table":"events","status":"poison","data":"x"}"#; + +fn bench_no_filters_baseline(c: &mut Criterion) { + let engine = TransportFilterEngine::empty(); + + let mut group = c.benchmark_group("filter_no_filters"); + group.throughput(Throughput::Elements(1)); + group.bench_function("apply_inbound_no_filters", |b| { + b.iter(|| std::hint::black_box(engine.apply_inbound(SAMPLE_PAYLOAD))); + }); + group.finish(); +} + +fn bench_tier1_field_exists(c: &mut Criterion) { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(_table)".into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let mut group = c.benchmark_group("filter_tier1_field_exists"); + group.throughput(Throughput::Elements(1)); + group.bench_function("match", |b| { + b.iter(|| std::hint::black_box(engine.apply_inbound(SAMPLE_PAYLOAD))); + }); + group.finish(); +} + +fn bench_tier1_field_equals(c: &mut Criterion) { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"status == "poison""#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let mut group = c.benchmark_group("filter_tier1_field_equals"); + group.throughput(Throughput::Elements(1)); + group.bench_function("no_match_pass", |b| { + b.iter(|| { + let result = engine.apply_inbound(SAMPLE_PAYLOAD); + assert_eq!(result, FilterDisposition::Pass); + std::hint::black_box(result) + }); + }); + group.bench_function("match_drop", |b| { + b.iter(|| { + let result = engine.apply_inbound(POISON_PAYLOAD); + assert_eq!(result, FilterDisposition::Drop); + std::hint::black_box(result) + }); + }); + group.finish(); +} + +fn bench_tier1_starts_with(c: &mut Criterion) { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"host.startsWith("prod-")"#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let mut group = c.benchmark_group("filter_tier1_starts_with"); + group.throughput(Throughput::Elements(1)); + group.bench_function("match", |b| { + b.iter(|| std::hint::black_box(engine.apply_inbound(SAMPLE_PAYLOAD))); + }); + group.finish(); +} + +fn bench_tier1_dotted_path(c: &mut Criterion) { + let nested_payload = + br#"{"metadata":{"source":"aws","region":"ap-southeast-2"},"event":"login"}"#; + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"metadata.source == "aws""#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let mut group = c.benchmark_group("filter_tier1_dotted_path"); + group.throughput(Throughput::Elements(1)); + group.bench_function("nested_match", |b| { + b.iter(|| std::hint::black_box(engine.apply_inbound(nested_payload))); + }); + group.finish(); +} + +fn bench_tier1_first_match_wins(c: &mut Criterion) { + // 5 filters, message matches the third one + let rules = vec![ + FilterRule { + expression: "has(no_match_1)".into(), + action: FilterAction::Drop, + }, + FilterRule { + expression: "has(no_match_2)".into(), + action: FilterAction::Drop, + }, + FilterRule { + expression: "has(_table)".into(), + action: FilterAction::Drop, + }, + FilterRule { + expression: "has(no_match_3)".into(), + action: FilterAction::Drop, + }, + FilterRule { + expression: "has(no_match_4)".into(), + action: FilterAction::Drop, + }, + ]; + let engine = + TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()).unwrap(); + + let mut group = c.benchmark_group("filter_tier1_first_match_wins"); + group.throughput(Throughput::Elements(1)); + group.bench_function("match_at_position_3", |b| { + b.iter(|| std::hint::black_box(engine.apply_inbound(SAMPLE_PAYLOAD))); + }); + group.finish(); +} + +#[cfg(feature = "expression")] +fn bench_tier2_compound_cel(c: &mut Criterion) { + let tier_config = TransportFilterTierConfig { + allow_cel_filters_in: true, + ..Default::default() + }; + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"severity > 3 && source != "internal""#.into(), + action: FilterAction::Drop, + }], + &[], + &tier_config, + ) + .unwrap(); + + let payload = br#"{"severity":5,"source":"external","data":"x"}"#; + + let mut group = c.benchmark_group("filter_tier2_compound_cel"); + group.throughput(Throughput::Elements(1)); + group.bench_function("compound_cel_match", |b| { + b.iter(|| std::hint::black_box(engine.apply_inbound(payload))); + }); + group.finish(); +} + +#[cfg(feature = "expression")] +fn bench_tier3_regex_cel(c: &mut Criterion) { + let tier_config = TransportFilterTierConfig { + allow_complex_filters_in: true, + ..Default::default() + }; + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"host.matches("^prod-.*$")"#.into(), + action: FilterAction::Drop, + }], + &[], + &tier_config, + ) + .unwrap(); + + let mut group = c.benchmark_group("filter_tier3_regex_cel"); + group.throughput(Throughput::Elements(1)); + group.bench_function("regex_match", |b| { + b.iter(|| std::hint::black_box(engine.apply_inbound(SAMPLE_PAYLOAD))); + }); + group.finish(); +} + +#[cfg(feature = "expression")] +criterion_group!( + benches, + bench_no_filters_baseline, + bench_tier1_field_exists, + bench_tier1_field_equals, + bench_tier1_starts_with, + bench_tier1_dotted_path, + bench_tier1_first_match_wins, + bench_tier2_compound_cel, + bench_tier3_regex_cel, +); + +#[cfg(not(feature = "expression"))] +criterion_group!( + benches, + bench_no_filters_baseline, + bench_tier1_field_exists, + bench_tier1_field_equals, + bench_tier1_starts_with, + bench_tier1_dotted_path, + bench_tier1_first_match_wins, +); + +criterion_main!(benches); diff --git a/docs/CLICKHOUSE_PYTHON_BINDINGS.md b/docs/CLICKHOUSE_PYTHON_BINDINGS.md deleted file mode 100644 index 5257b08..0000000 --- a/docs/CLICKHOUSE_PYTHON_BINDINGS.md +++ /dev/null @@ -1,379 +0,0 @@ -# ClickHouse Python Bindings Discussion - -**Date:** 2025-01-19 -**Status:** Proposal / Discussion - ---- - -## Overview - -This document outlines the approach for exposing hyperi-rustlib's ClickHouse client to Python via PyO3, enabling hyperi-pylib to leverage the Rust implementation's performance and type safety. - ---- - -## Why Python Bindings? - -### Benefits - -1. **Performance**: Native Arrow protocol is faster than HTTP-based Python clients -2. **Type Safety**: Runtime type parsing from ClickHouse schema (SSOT principle) -3. **Consistency**: Same client behaviour across Rust and Python codebases -4. **Arrow Native**: Zero-copy data exchange via PyArrow - -### Use Cases - -- hyperi-pylib applications needing high-performance ClickHouse access -- Data pipelines mixing Python and Rust components -- Gradual migration from Python to Rust services - ---- - -## Types to Expose - -### Primary API (Must Have) - -| Rust Type | Python Class | Purpose | -|-----------|--------------|---------| -| `ArrowClickHouseClient` | `ClickHouseClient` | Main client interface | -| `ClickHouseConfig` | `ClickHouseConfig` | Connection configuration | -| `TableSchema` | `TableSchema` | Table metadata | -| `ColumnInfo` | `ColumnInfo` | Column metadata | -| `ParsedType` | `ParsedType` | Type introspection | - -### Error Handling - -| Rust Type | Python Exception | -|-----------|------------------| -| `ClickHouseError::Connection` | `ClickHouseConnectionError` | -| `ClickHouseError::Query` | `ClickHouseQueryError` | -| `ClickHouseError::Insert` | `ClickHouseInsertError` | -| `ClickHouseError::Schema` | `ClickHouseSchemaError` | -| `ClickHouseError::Arrow` | `ClickHouseArrowError` | - ---- - -## Proposed Python API - -### Configuration - -```python -from hyperi_rustlib import ClickHouseConfig, ClickHouseClient - -# Basic configuration -config = ClickHouseConfig( - host="localhost:9000", - database="default" -) - -# With authentication -config = ClickHouseConfig( - host="clickhouse.example.com:9000", - database="analytics", - username="user", - password="secret", - connect_timeout_ms=5000, - request_timeout_ms=30000, -) - -# Multiple hosts (load balancing) -config = ClickHouseConfig( - hosts=["ch1:9000", "ch2:9000", "ch3:9000"], - database="default" -) -``` - -### Client Usage - -```python -import pyarrow as pa -from hyperi_rustlib import ClickHouseClient, ClickHouseConfig - -# Create client (async context manager) -async with ClickHouseClient(config) as client: - # Health check - await client.health_check() - - # Execute query (returns PyArrow Table) - table = await client.select("SELECT * FROM events LIMIT 100") - df = table.to_pandas() # Convert to pandas if needed - - # Insert data (accepts PyArrow Table or RecordBatch) - data = pa.table({ - "id": [1, 2, 3], - "name": ["a", "b", "c"], - "timestamp": pa.array([...], type=pa.timestamp("us")) - }) - rows_inserted = await client.insert("events", data) - - # Schema introspection - schema = await client.fetch_table_schema("events") - for col in schema.columns: - print(f"{col.name}: {col.type_name} (nullable={col.is_nullable})") -``` - -### Synchronous Wrapper - -```python -# For sync codebases -from hyperi_rustlib import ClickHouseClient, ClickHouseConfig - -client = ClickHouseClient.connect_sync(config) -table = client.select_sync("SELECT * FROM events LIMIT 100") -client.close() -``` - -### Type Introspection - -```python -from hyperi_rustlib import ParsedType - -# Parse a ClickHouse type string -parsed = ParsedType.parse("Nullable(Array(String))") -print(parsed.base) # "Array" -print(parsed.nullable) # True -print(parsed.array_element) # ParsedType for String -print(parsed.is_string()) # False (it's an array) -``` - ---- - -## Implementation Approach - -### Option 1: PyO3 Extension Module (Recommended) - -Create a separate crate `hyperi-rustlib-python` that wraps the Rust types: - -```text -hyperi-rustlib/ -├── Cargo.toml -├── src/ # Core Rust library -└── python/ - ├── Cargo.toml # PyO3 extension - ├── src/ - │ └── lib.rs # Python bindings - └── pyproject.toml -``` - -**Pros:** - -- Clean separation of concerns -- Can version Python bindings independently -- Easier to maintain - -**Cons:** - -- Separate build process -- Need to keep in sync with core library - -### Option 2: Feature-Gated Bindings - -Add Python bindings directly to hyperi-rustlib behind a feature flag: - -```toml -[features] -python = ["dep:pyo3"] -``` - -**Pros:** - -- Single codebase -- Always in sync - -**Cons:** - -- Increases core library complexity -- PyO3 dependencies even when not needed - -### Recommendation - -**Option 1** - Separate crate in `python/` subdirectory, published as `hyperi-rustlib-python` to PyPI. - ---- - -## Async Runtime Handling - -The ClickHouse client is async-first. Options for Python: - -### Option A: pyo3-asyncio (Recommended) - -```rust -use pyo3_asyncio::tokio::future_into_py; - -#[pymethods] -impl PyClickHouseClient { - fn select<'py>(&self, py: Python<'py>, sql: &str) -> PyResult<&'py PyAny> { - let client = self.inner.clone(); - let sql = sql.to_string(); - future_into_py(py, async move { - let batches = client.select(&sql).await?; - // Convert to PyArrow - Ok(batches_to_pyarrow(batches)?) - }) - } -} -``` - -**Pros:** - -- Native async/await in Python -- Non-blocking - -**Cons:** - -- Requires Python 3.7+ -- More complex error handling - -### Option B: Sync Wrappers - -```rust -#[pymethods] -impl PyClickHouseClient { - fn select_sync(&self, sql: &str) -> PyResult { - let rt = tokio::runtime::Runtime::new()?; - rt.block_on(async { - let batches = self.inner.select(sql).await?; - Ok(batches_to_pyarrow(batches)?) - }) - } -} -``` - -**Pros:** - -- Simpler to use -- Works with sync Python code - -**Cons:** - -- Blocks the thread -- Can't be used in async Python context - -### Recommendation - -Provide **both**: async methods as default, with `_sync` suffix variants for convenience. - ---- - -## Arrow Interoperability - -### PyArrow Integration - -```rust -use arrow::pyarrow::ToPyArrow; -use pyo3::prelude::*; - -fn batches_to_pyarrow(py: Python, batches: Vec) -> PyResult { - // Convert RecordBatches to PyArrow Table - let schema = batches[0].schema(); - let table = arrow::compute::concat_batches(&schema, &batches)?; - table.to_pyarrow(py) -} - -fn pyarrow_to_batch(table: &PyAny) -> PyResult { - // Convert PyArrow Table to RecordBatch - RecordBatch::from_pyarrow(table) -} -``` - -### Zero-Copy Data Transfer - -Arrow's columnar format enables zero-copy data sharing between Rust and Python when memory is properly aligned. This is a key performance benefit. - ---- - -## Dependencies - -### Rust Side - -```toml -[dependencies] -pyo3 = { version = "0.22", features = ["extension-module"] } -pyo3-asyncio = { version = "0.21", features = ["tokio-runtime"] } -arrow = { version = "53", features = ["pyarrow"] } -``` - -### Python Side - -```toml -[project] -dependencies = [ - "pyarrow>=14.0", -] -``` - ---- - -## Build and Distribution - -### Build Process - -```bash -# Development -cd python/ -maturin develop - -# Release -maturin build --release -``` - -### Distribution - -1. **PyPI**: Publish wheels for Linux/macOS/Windows -2. **Artifactory**: Internal distribution alongside hyperi-pylib - -### Platform Support - -- Linux x86_64 (primary) -- macOS arm64 (development) -- Windows x86_64 (if needed) - ---- - -## Integration with hyperi-pylib - -### Option A: Separate Package - -```python -# hyperi-pylib uses hyperi-rustlib-python as optional dependency -# pyproject.toml -[project.optional-dependencies] -clickhouse = ["hyperi-rustlib-python>=0.1"] -``` - -### Option B: Vendored in hyperi-pylib - -Include pre-built wheels in hyperi-pylib's distribution. - -### Recommendation - -**Option A** - Separate package, optional dependency. Allows independent versioning and reduces hyperi-pylib's complexity. - ---- - -## Open Questions - -1. **Naming**: `hyperi-rustlib-python` vs `hs-clickhouse` vs `clickhouse-arrow-py`? -2. **Scope**: Just ClickHouse, or expose other hyperi-rustlib modules (config, metrics)? -3. **Async**: Should async be the default, or provide sync-first API? -4. **Error Messages**: How much detail to expose in Python exceptions? - ---- - -## Next Steps - -1. [ ] Create `python/` subdirectory with PyO3 skeleton -2. [ ] Implement basic `ClickHouseConfig` and `ClickHouseClient` wrappers -3. [ ] Add PyArrow integration for data transfer -4. [ ] Write Python tests -5. [ ] Set up maturin build pipeline -6. [ ] Publish to internal Artifactory -7. [ ] Integrate with hyperi-pylib as optional dependency - ---- - -## References - -- [PyO3 User Guide](https://pyo3.rs/) -- [pyo3-asyncio](https://github.com/awestlake87/pyo3-asyncio) -- [Arrow PyArrow Integration](https://docs.rs/arrow/latest/arrow/pyarrow/index.html) -- [Maturin](https://www.maturin.rs/) diff --git a/docs/GAP_ANALYSIS.md b/docs/GAP_ANALYSIS.md deleted file mode 100644 index 88f2cb2..0000000 --- a/docs/GAP_ANALYSIS.md +++ /dev/null @@ -1,196 +0,0 @@ -# Gap Analysis: hyperi-rustlib vs hyperi-pylib - -**Date:** 2025-01-19 -**hyperi-rustlib Version:** 0.3.0 -**hyperi-pylib Reference:** Latest main branch - ---- - -## Summary - -| Metric | hyperi-pylib | hyperi-rustlib | -|--------|----------|-----------| -| Total LOC | ~15,400 | ~6,163 | -| Major Modules | 11 | 7 | -| Status | Production-ready, enterprise features | MVP complete, core modules | - ---- - -## Module Comparison - -| Module | hyperi-pylib | hyperi-rustlib | Gap Notes | -|--------|----------|-----------|-----------| -| **env** | ✅ K8s/Docker/Container/BareMetal detection | ✅ Same detection methods | ✓ Parity | -| **runtime** | ✅ XDG + container-aware paths | ✅ XDG + container-aware paths | ✓ Parity | -| **config** | ✅ 7-layer cascade (Dynaconf) + PostgreSQL loader | ✅ 7-layer cascade (Figment) | ✓ Core parity; pylib has PostgreSQL config layer | -| **logger** | ✅ Loguru + masking + rate limiting | ✅ Tracing + masking | ✓ Core parity; pylib has rate limiting, emoji support | -| **metrics** | ✅ Prometheus + OpenTelemetry + FastAPI middleware | ✅ Prometheus only | ✓ Core parity; pylib has OpenTelemetry, middleware | -| **transport** | ❌ Not implemented | ✅ Kafka/Zenoh/Memory abstraction | ⚡ **rustlib advantage** | -| **clickhouse** | ❌ Not implemented | ✅ Arrow protocol client | ⚡ **rustlib advantage** | -| **database** | ✅ URL builders (Postgres/MySQL/MongoDB/Redis) | ❌ Not implemented | ⚠️ Gap | -| **http** | ✅ Sync/async with retries (Stamina) | ❌ Feature planned | ⚠️ Gap | -| **cache** | ✅ Disk + PostgreSQL backends | ❌ Feature planned | ⚠️ Gap | -| **kafka** | ✅ Full ecosystem (~3,500 LOC) | ❌ Not standalone | ⚠️ Major gap | -| **cli** | ✅ Typer framework + helpers | ❌ Not implemented | ⚠️ Gap | -| **anonymizer** | ✅ PII detection (Presidio) | ❌ Not implemented | ⚠️ Gap | -| **harness** | ✅ Timeout monitors, registry utils | ❌ Not implemented | ⚠️ Gap | - ---- - -## Features hyperi-rustlib Has That hyperi-pylib Doesn't - -### 1. Transport Abstraction Layer (~1,000 LOC) - -Multi-transport abstraction supporting Kafka, Zenoh, and in-memory transports with: - -- Stateful format detection with auto-locking -- Payload format handling (JSON/MsgPack) -- Generic message interface with commit tokens -- Lower-level control for event streaming pipelines - -### 2. ClickHouse Client (~500 LOC) - -Native Arrow protocol client with: - -- Schema introspection and type parsing -- Connection pooling -- Type-safe queries -- Arrow-native for high performance - ---- - -## Features hyperi-pylib Has That hyperi-rustlib Doesn't - -### P0 - Critical for Enterprise Use - -#### Kafka Client (~3,500 LOC) - -Full Kafka ecosystem that would require significant effort to port: - -- Sync/async producer, consumer, admin clients -- Health monitoring with consumer group lag tracking -- Schema analysis for JSON messages -- Sampling utilities (reservoir, time-bounded, partition) -- Metrics collection with callback integration - -**Note:** hyperi-rustlib has Kafka via transport abstraction, but lacks standalone client. - -### P1 - High Utility - -#### Database Module (~200 LOC) - -Connection URL builders: - -- PostgreSQL: `POSTGRES_HOST`, `POSTGRES_PORT`, `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_DB` -- MySQL: Similar ENV var patterns -- MongoDB: Connection string builder -- Redis: `REDIS_HOST`, `REDIS_PORT`, `REDIS_PASSWORD` - -#### HTTP Client (~200 LOC) - -Production-ready HTTP client: - -- Sync and async variants -- Automatic retries with exponential backoff (Stamina) -- Configurable timeouts (30s default) -- Metrics auto-detection - -### P2 - Supporting Features - -#### Cache Module (~300 LOC) - -Multi-backend caching: - -- Disk cache (Cashews/SQLite) -- PostgreSQL cache for distributed deployments -- `@cached` decorator with TTL support -- Async-first design - -#### CLI Framework (~300 LOC) - -Typer-based CLI utilities: - -- Reusable options (VERBOSE_OPTION, DRY_RUN_OPTION) -- Version handling -- Output formatters (tables, progress bars) -- CliRunner for testing - -### P3 - Nice to Have - -#### Anonymizer/PII Detection (~500 LOC) - -Microsoft Presidio integration: - -- Multiple presets (minimal, standard, compliance) -- Strategies: mask, redact, hash, encrypt -- Text/JSON/dict support -- Streaming for large datasets - -#### Harness/Testing Utilities (~300 LOC) - -CI/CD helpers: - -- Smart timeout monitoring for functions -- Container registry login utilities -- Docker Hub rate limit checking - ---- - -## Implementation Priority for hyperi-rustlib - -### Phase 1 - Core Enterprise (Recommended) - -1. **Database module** (~200 LOC) - - URL builders for PostgreSQL, Redis - - ENV var parsing with standard prefixes - - Minimal dependencies - -2. **HTTP client** (~300 LOC) - - Wrap reqwest with retry middleware - - Already in Cargo.toml as optional feature - - Foundation for many applications - -### Phase 2 - Enhanced Features - -1. **Cache module** (~300 LOC) - - Disk-based cache with SQLite - - Optional Redis support - -2. **CLI framework** (~200 LOC) - - Wrap Clap with reusable helpers - - Output formatters - -### Phase 3 - Advanced (As Needed) - -1. **Kafka standalone client** - Only if transport layer insufficient -2. **Anonymizer** - Depends on external library availability -3. **Harness utilities** - Lower priority - ---- - -## Architectural Differences - -| Aspect | hyperi-pylib | hyperi-rustlib | -|--------|----------|-----------| -| Philosophy | "Zero-config batteries included" | "Minimal core + extensibility" | -| Async | Full async/await throughout | Async-ready, primarily sync core | -| Dependencies | ~50+ transitive | ~15-20 focused | -| Type Safety | Runtime | Compile-time | -| Memory | Garbage collected | Zero-cost abstractions | - ---- - -## Conclusion - -**hyperi-rustlib** is well-positioned as a core infrastructure library with type safety and performance advantages. The transport and ClickHouse modules are unique capabilities. - -**For full enterprise parity**, prioritise: - -1. Database URL builders (high utility, low effort) -2. HTTP client with retries (already partially scaffolded) -3. Evaluate need for standalone Kafka vs using transport layer - -**Keep unique in hyperi-rustlib**: - -- Transport abstraction (better in Rust) -- ClickHouse Arrow client (Rust-specific advantage) diff --git a/docs/TRANSPORT-FILTER-FOLLOWUP.md b/docs/TRANSPORT-FILTER-FOLLOWUP.md new file mode 100644 index 0000000..5f514c7 --- /dev/null +++ b/docs/TRANSPORT-FILTER-FOLLOWUP.md @@ -0,0 +1,127 @@ +## Transport Filter Engine — Follow-ups + +Tracked items uncovered during the v2.4.7 review of `feat/transport-filter-engine`. +None of these block the merge — they are improvements/hardening to schedule +when transport filters see broader production use. + +### #7 — Constant-time string comparison for sensitive fields + +**Context.** `CompiledFilter::FieldEquals` / `FieldNotEquals` use plain `==` +for the field-value comparison. If a downstream operator writes a filter +on a high-entropy secret-like field (e.g. an API token, an opaque session +ID, an HMAC), the early-exit characteristic of byte-for-byte equality +leaks ~1 byte of comparison position per nanosecond of timing observation. + +**Risk.** Low for the production use case (transport filters key off +routing fields, not secrets), but the door is open. An attacker who can +inject crafted payloads and observe per-message latency could in theory +brute-force a guarded field value. + +**Proposed fix.** When the filter expression compares against a value +flagged as "sensitive" (heuristic match: > 16 bytes high entropy, contains +`/[A-Za-z0-9+/=]{20,}/`, or explicit opt-in via filter metadata), use +`subtle::ConstantTimeEq` for the comparison. Add a benchmark to measure +the cost — should be negligible compared to the surrounding `sonic-rs` +field extraction. + +**Where.** `src/transport/filter/compiled.rs::evaluate()` and a new +`is_sensitive_pattern()` helper. + +--- + +### #8 — Log masking for filter expression content + +**Context.** Filter compilation, ordering warnings, and DLQ routing all +emit `tracing` events that include the raw expression text (e.g. +`"filter compiled: tier=tier2 expr=field == \"prod-secret\""`). If an +operator embeds a literal secret in a filter (bad practice but possible), +that secret ends up in logs. + +**Risk.** Low — the engine's existing `tracing-throttle` integration +already rate-limits these events, and rustlib's `logger` module masks +known sensitive field names. But the expression text itself is not run +through the masker. + +**Proposed fix.** Add an `expression_redacted()` helper that: +- Replaces string literals (anything between `"…"`) with `""` +- Keeps the structural form (`field == `) for debugging +- Is the only form ever written to logs + +Plumb it through every `tracing::warn!` / `tracing::info!` call site in +`compiled.rs`, `mod.rs`, and `metrics.rs`. + +--- + +### #9 — Pre-quoted bytes for `field == "value"` fast path + +**Context.** `FieldEquals` currently extracts the field value via +`sonic-rs` (which produces a `LazyValue`), runs `extract_string_value` +to unescape it into a `Cow`, and then string-compares against the +expected value. For the very common case of a single-segment field +compared against an ASCII string with no escapes, we could pre-bake the +bytes `"…":"value"` once at compile time and use a single `memmem::find` +— same trick as `FieldExists`. + +**Estimated win.** ~30-50% on the `field == "value"` path (currently +~250-400 ns, target ~150-200 ns). + +**Caveats.** +- Only safe when both the field name AND the value contain no JSON-escape + characters (`\`, `"`, control bytes). Otherwise the literal pattern + doesn't match the encoded form. +- Inherits the same nested-field false-positive limitation as + `FieldExists` (documented in the integration tests). +- Whitespace in the JSON (`{"field" : "value"}`) breaks the literal match. + +**Proposed fix.** At compile time, classify the value as "fast-path +eligible" or not. Build a `memchr::memmem::Finder<'static>` for the +eligible cases and short-circuit in `evaluate()`. Fall back to the +sonic-rs extraction otherwise. + +**Where.** `src/transport/filter/compiled.rs::FieldEquals` and +`evaluate()`. + +--- + +### #10 — MsgPack edge cases bypass filtering + +**Context.** `apply_inbound`/`apply_outbound` detect the payload format +via `PayloadFormat::detect()` (cheap heuristic on the first byte). For +detected MsgPack payloads we currently skip the JSON-oriented filter +engine entirely — the integration test +`adversarial_msgpack_bypasses_filters` documents this. + +**Risk.** Filter rules silently do nothing on MsgPack payloads. An +operator who configures `has(_drop_me)` on a Kafka topic carrying +MsgPack messages will see no filtering, no warning, no metric. + +**Proposed fix (cheap).** Emit a one-shot `tracing::warn!` per +filter-engine instance the first time a MsgPack payload is seen with +filters configured. Add a `dfe_transport_filter_bypassed_total{reason="msgpack"}` +counter so dashboards can flag the bypass. + +**Proposed fix (expensive).** Compile a parallel MsgPack evaluator +(rmp-serde or rmpv) for each `CompiledFilter` variant. Tier 1 ops are +field extractions, which work the same way on MsgPack — the only thing +that changes is the lookup engine. Defer until there is a real customer +asking for MsgPack filtering. + +--- + +### #11 — Preserve original `expression_text` through reload cycles + +**Context.** `CompiledFilter::expression_text` stores the as-typed +expression string for use in error messages and ordering-warning logs. +On hot-reload (`ConfigReloader`), the engine is rebuilt from the new +config, but if the new config is structurally identical to the old one, +the same compiled filter could be reused without re-allocating the text +buffer. Currently every reload allocates fresh `String`s for every rule. + +**Estimated win.** Negligible per reload, but reloads on a busy system +with thousands of routing rules add up. Mostly an allocator-pressure +hygiene item. + +**Proposed fix.** Hash the rule list during compile and cache compiled +filters keyed by hash. On reload, look up by hash before recompiling. + +**Where.** `src/transport/filter/mod.rs::TransportFilterEngine::new`. diff --git a/src/deployment/mod.rs b/src/deployment/mod.rs index fe7a910..d59451b 100644 --- a/src/deployment/mod.rs +++ b/src/deployment/mod.rs @@ -56,6 +56,8 @@ //! "ubuntu:24.04", //! ), //! image_profile: ImageProfile::Production, +//! oci_labels: Default::default(), +//! schema_version: 1, //! }; //! //! // Generate production Dockerfile diff --git a/src/transport/file.rs b/src/transport/file.rs index 28dcb50..9e1e54b 100644 --- a/src/transport/file.rs +++ b/src/transport/file.rs @@ -25,7 +25,7 @@ //! ```rust,ignore //! use hyperi_rustlib::transport::file::{FileTransport, FileTransportConfig}; //! -//! let config = FileTransportConfig { path: "/tmp/events.ndjson".into(), append: true }; +//! let config = FileTransportConfig { path: "/tmp/events.ndjson".into(), append: true, ..Default::default() }; //! let transport = FileTransport::new(&config).await?; //! transport.send("events", b"{\"msg\":\"hello\"}").await; //! ``` @@ -66,6 +66,14 @@ pub struct FileTransportConfig { /// Append mode (default true for send). #[serde(default = "default_append")] pub append: bool, + + /// Inbound message filters (applied on recv before caller sees messages). + #[serde(default)] + pub filters_in: Vec, + + /// Outbound message filters (applied on send before transport dispatches). + #[serde(default)] + pub filters_out: Vec, } fn default_append() -> bool { @@ -77,6 +85,8 @@ impl Default for FileTransportConfig { Self { path: String::new(), append: true, + filters_in: Vec::new(), + filters_out: Vec::new(), } } } @@ -119,6 +129,10 @@ pub struct FileTransport { writer: Mutex>, reader: Mutex>, closed: Arc, + filter_engine: super::filter::TransportFilterEngine, + /// Buffer for messages staged to DLQ by inbound filters. + /// Drained by `take_filtered_dlq_entries()`. + filtered_dlq_buffer: parking_lot::Mutex>, } impl FileTransport { @@ -135,6 +149,16 @@ impl FileTransport { #[cfg(feature = "logger")] tracing::info!(path = %config.path, append = config.append, "File transport opened"); + let filter_engine = super::filter::TransportFilterEngine::new( + &config.filters_in, + &config.filters_out, + &crate::transport::filter::TransportFilterTierConfig::default(), + ) + .unwrap_or_else(|e| { + tracing::warn!(error = %e, "Failed to compile transport filters, filtering disabled"); + super::filter::TransportFilterEngine::empty() + }); + let closed = Arc::new(AtomicBool::new(false)); #[cfg(feature = "health")] @@ -154,6 +178,8 @@ impl FileTransport { writer: Mutex::new(None), reader: Mutex::new(None), closed, + filter_engine, + filtered_dlq_buffer: parking_lot::Mutex::new(Vec::new()), }) } @@ -273,6 +299,15 @@ impl TransportSender for FileTransport { return SendResult::Fatal(TransportError::Closed); } + // Outbound filter check + if self.filter_engine.has_outbound_filters() { + match self.filter_engine.apply_outbound(payload) { + super::filter::FilterDisposition::Pass => {} + super::filter::FilterDisposition::Drop => return SendResult::Ok, + super::filter::FilterDisposition::Dlq => return SendResult::FilteredDlq, + } + } + if let Err(e) = self.ensure_writer().await { return SendResult::Fatal(e); } @@ -362,6 +397,26 @@ impl TransportReceiver for FileTransport { }); } + // Apply inbound filters: drop messages, stage DLQ entries + if self.filter_engine.has_inbound_filters() { + let mut staged_dlq: Vec = Vec::new(); + messages.retain(|msg| match self.filter_engine.apply_inbound(&msg.payload) { + super::filter::FilterDisposition::Pass => true, + super::filter::FilterDisposition::Drop => false, + super::filter::FilterDisposition::Dlq => { + staged_dlq.push(super::filter::FilteredDlqEntry { + payload: msg.payload.clone(), + key: msg.key.clone(), + reason: "transport filter".to_string(), + }); + false + } + }); + if !staged_dlq.is_empty() { + self.filtered_dlq_buffer.lock().extend(staged_dlq); + } + } + #[cfg(feature = "logger")] if !messages.is_empty() { tracing::debug!(lines = messages.len(), "File transport: batch received"); @@ -376,6 +431,10 @@ impl TransportReceiver for FileTransport { Ok(messages) } + fn take_filtered_dlq_entries(&self) -> Vec { + std::mem::take(&mut *self.filtered_dlq_buffer.lock()) + } + async fn commit(&self, tokens: &[Self::Token]) -> TransportResult<()> { if let Some(max_token) = tokens.iter().max_by_key(|t| t.offset) { let path = Path::new(&self.config.path); @@ -401,6 +460,7 @@ mod tests { let config = FileTransportConfig { path: path.to_str().unwrap().to_string(), append: true, + ..Default::default() }; FileTransport::new(&config).await.unwrap() } @@ -415,6 +475,7 @@ mod tests { let config = FileTransportConfig { path: path_str.clone(), append: true, + ..Default::default() }; let sender = FileTransport::new(&config).await.unwrap(); @@ -428,6 +489,7 @@ mod tests { let reader_config = FileTransportConfig { path: path_str, append: true, + ..Default::default() }; let reader = FileTransport::new(&reader_config).await.unwrap(); let messages = reader.recv(10).await.unwrap(); @@ -450,6 +512,7 @@ mod tests { let config = FileTransportConfig { path: path_str.clone(), append: true, + ..Default::default() }; let sender = FileTransport::new(&config).await.unwrap(); sender.send("k", b"line1").await; @@ -461,6 +524,7 @@ mod tests { let r1 = FileTransport::new(&FileTransportConfig { path: path_str.clone(), append: true, + ..Default::default() }) .await .unwrap(); @@ -478,6 +542,7 @@ mod tests { let r2 = FileTransport::new(&FileTransportConfig { path: path_str, append: true, + ..Default::default() }) .await .unwrap(); @@ -517,6 +582,7 @@ mod tests { let config = FileTransportConfig { path: path_str.clone(), append: true, + ..Default::default() }; let transport = FileTransport::new(&config).await.unwrap(); transport.send("k", b"only_line").await; @@ -526,6 +592,7 @@ mod tests { let reader = FileTransport::new(&FileTransportConfig { path: path_str, append: true, + ..Default::default() }) .await .unwrap(); diff --git a/src/transport/filter/classify.rs b/src/transport/filter/classify.rs new file mode 100644 index 0000000..d5a32c5 --- /dev/null +++ b/src/transport/filter/classify.rs @@ -0,0 +1,439 @@ +// Project: hyperi-rustlib +// File: src/transport/filter/classify.rs +// Purpose: CEL expression classification into performance tiers +// Language: Rust +// +// License: FSL-1.1-ALv2 +// Copyright: (c) 2026 HYPERI PTY LIMITED + +//! Classify CEL expressions into performance tiers via text pattern matching. +//! +//! Tier 1 patterns are detected by regex and executed as SIMD field operations +//! (no CEL engine). Expressions that don't match Tier 1 are classified as +//! Tier 2 (standard CEL) or Tier 3 (complex CEL with restricted functions). + +use std::sync::LazyLock; + +use regex::Regex; + +use super::config::FilterTier; + +/// Recognised Tier 1 operation extracted from the expression text. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Tier1Op { + FieldExists { field: String }, + FieldNotExists { field: String }, + FieldEquals { field: String, value: String }, + FieldNotEquals { field: String, value: String }, + FieldStartsWith { field: String, prefix: String }, + FieldEndsWith { field: String, suffix: String }, + FieldContains { field: String, substring: String }, +} + +/// Result of classifying an expression. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ClassifyResult { + /// Expression matches a Tier 1 SIMD pattern. + Tier1(Tier1Op), + /// Expression is valid CEL without restricted functions (Tier 2). + Tier2 { fields: Vec }, + /// Expression uses restricted functions (Tier 3). + Tier3 { fields: Vec }, +} + +impl ClassifyResult { + #[must_use] + pub fn tier(&self) -> FilterTier { + match self { + Self::Tier1(_) => FilterTier::Tier1, + Self::Tier2 { .. } => FilterTier::Tier2, + Self::Tier3 { .. } => FilterTier::Tier3, + } + } +} + +// --------------------------------------------------------------------------- +// Tier 1 regex patterns (compiled once via LazyLock) +// --------------------------------------------------------------------------- + +// Field name: word chars + dots for nested paths +static RE_HAS: LazyLock = + LazyLock::new(|| Regex::new(r"^\s*has\(\s*([\w.]+)\s*\)\s*$").unwrap()); + +static RE_NOT_HAS: LazyLock = + LazyLock::new(|| Regex::new(r"^\s*!\s*has\(\s*([\w.]+)\s*\)\s*$").unwrap()); + +static RE_EQ_STR: LazyLock = + LazyLock::new(|| Regex::new(r#"^\s*([\w.]+)\s*==\s*"([^"]*)"\s*$"#).unwrap()); + +static RE_NEQ_STR: LazyLock = + LazyLock::new(|| Regex::new(r#"^\s*([\w.]+)\s*!=\s*"([^"]*)"\s*$"#).unwrap()); + +static RE_STARTS_WITH: LazyLock = LazyLock::new(|| { + Regex::new(r#"^\s*([\w.]+)\s*\.\s*startsWith\(\s*"([^"]*)"\s*\)\s*$"#).unwrap() +}); + +static RE_ENDS_WITH: LazyLock = + LazyLock::new(|| Regex::new(r#"^\s*([\w.]+)\s*\.\s*endsWith\(\s*"([^"]*)"\s*\)\s*$"#).unwrap()); + +static RE_CONTAINS: LazyLock = + LazyLock::new(|| Regex::new(r#"^\s*([\w.]+)\s*\.\s*contains\(\s*"([^"]*)"\s*\)\s*$"#).unwrap()); + +// Restricted function names (Tier 3) +const RESTRICTED_FUNCTIONS: &[&str] = &[ + "matches", + "map", + "filter", + "exists", + "all", + "exists_one", + "timestamp", + "duration", +]; + +// CEL keywords and built-in function names (NOT field references) +const CEL_KEYWORDS: &[&str] = &[ + "true", + "false", + "null", + "in", + "has", + "size", + "int", + "uint", + "double", + "string", + "bool", + "type", + "contains", + "startsWith", + "endsWith", + "matches", + "map", + "filter", + "exists", + "all", + "exists_one", + "timestamp", + "duration", +]; + +/// Classify a CEL expression into a performance tier. +/// +/// Returns `Err` if the expression is syntactically invalid (can't even be +/// parsed as a potential CEL expression — empty, unbalanced quotes, etc.). +/// +/// # Examples +/// +/// ```rust,ignore +/// let result = classify("has(_table)"); +/// assert!(matches!(result, Ok(ClassifyResult::Tier1(..)))); +/// +/// let result = classify("severity > 3 && source != \"internal\""); +/// assert!(matches!(result, Ok(ClassifyResult::Tier2 { .. }))); +/// ``` +pub fn classify(expr: &str) -> Result { + let trimmed = expr.trim(); + if trimmed.is_empty() { + return Err("empty expression".into()); + } + + // Try Tier 1 patterns first (ordered by expected frequency) + if let Some(op) = try_tier1(trimmed) { + return Ok(ClassifyResult::Tier1(op)); + } + + // Not Tier 1 — check for restricted functions (Tier 3) vs standard (Tier 2) + let has_restricted = check_restricted_functions(trimmed); + let fields = extract_field_references(trimmed); + + if has_restricted { + Ok(ClassifyResult::Tier3 { fields }) + } else { + Ok(ClassifyResult::Tier2 { fields }) + } +} + +/// Try to match a Tier 1 pattern. Returns `None` if no pattern matches. +fn try_tier1(expr: &str) -> Option { + // has(field) + if let Some(caps) = RE_HAS.captures(expr) { + return Some(Tier1Op::FieldExists { + field: caps[1].to_string(), + }); + } + + // !has(field) + if let Some(caps) = RE_NOT_HAS.captures(expr) { + return Some(Tier1Op::FieldNotExists { + field: caps[1].to_string(), + }); + } + + // field == "value" + if let Some(caps) = RE_EQ_STR.captures(expr) { + return Some(Tier1Op::FieldEquals { + field: caps[1].to_string(), + value: caps[2].to_string(), + }); + } + + // field != "value" + if let Some(caps) = RE_NEQ_STR.captures(expr) { + return Some(Tier1Op::FieldNotEquals { + field: caps[1].to_string(), + value: caps[2].to_string(), + }); + } + + // field.startsWith("prefix") + if let Some(caps) = RE_STARTS_WITH.captures(expr) { + return Some(Tier1Op::FieldStartsWith { + field: caps[1].to_string(), + prefix: caps[2].to_string(), + }); + } + + // field.endsWith("suffix") + if let Some(caps) = RE_ENDS_WITH.captures(expr) { + return Some(Tier1Op::FieldEndsWith { + field: caps[1].to_string(), + suffix: caps[2].to_string(), + }); + } + + // field.contains("substring") + if let Some(caps) = RE_CONTAINS.captures(expr) { + return Some(Tier1Op::FieldContains { + field: caps[1].to_string(), + substring: caps[2].to_string(), + }); + } + + None +} + +/// Check if the expression uses any restricted functions (Tier 3). +/// +/// Text-scanning approach (same as `profile.rs`). Scans for function names +/// followed by `(`, skipping occurrences inside string literals. +fn check_restricted_functions(expr: &str) -> bool { + for func in RESTRICTED_FUNCTIONS { + // Look for `func(` pattern, not inside a string + let pattern = format!("{func}("); + if let Some(pos) = expr.find(&pattern) { + // Check we're not inside a string literal by counting quotes before pos + let before = &expr[..pos]; + let quote_count = before.chars().filter(|&c| c == '"').count(); + if quote_count % 2 == 0 { + // Even number of quotes = we're outside a string + return true; + } + } + } + false +} + +/// Extract field references from an expression (for Tier 2/3 CEL context building). +/// +/// Scans for identifier patterns that aren't CEL keywords or function names. +/// For method calls like `field.matches("...")`, extracts only the receiver field. +/// Returns unique field names (may include dotted paths for nested access). +fn extract_field_references(expr: &str) -> Vec { + // Match dotted identifier (potentially nested) — we'll trim trailing method call + // segments after matching. + static RE_IDENT: LazyLock = LazyLock::new(|| Regex::new(r"[a-zA-Z_][\w.]*").unwrap()); + + let mut fields: Vec = Vec::new(); + + // Build a mask of which byte positions are inside string literals + let mut in_string_mask = vec![false; expr.len()]; + let mut in_string = false; + let mut prev_was_escape = false; + for (i, ch) in expr.char_indices() { + if in_string { + in_string_mask[i] = true; + } + if ch == '"' && !prev_was_escape { + in_string = !in_string; + } + prev_was_escape = ch == '\\' && !prev_was_escape; + } + + for m in RE_IDENT.find_iter(expr) { + if in_string_mask.get(m.start()).copied().unwrap_or(false) { + continue; + } + + let mut ident = m.as_str().to_string(); + + // If this identifier is immediately followed by '(' (a function call), + // strip the last dotted segment (the method name) — the receiver is + // the actual field reference. + let after = &expr[m.end()..]; + if after.trim_start().starts_with('(') { + if let Some(dot_pos) = ident.rfind('.') { + // Method call on a field: keep the receiver + ident.truncate(dot_pos); + } else { + // Bare function call (e.g., has(), size()) — not a field + continue; + } + } + + if ident.is_empty() { + continue; + } + + // Skip CEL keywords (check the leading segment) + let base = ident.split('.').next().unwrap_or(&ident); + if CEL_KEYWORDS.contains(&base) { + continue; + } + + if !fields.contains(&ident) { + fields.push(ident); + } + } + + fields +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn classify_has_field() { + let result = classify("has(_table)").unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + assert!(matches!( + result, + ClassifyResult::Tier1(Tier1Op::FieldExists { ref field }) if field == "_table" + )); + } + + #[test] + fn classify_not_has_field() { + let result = classify("!has(_internal)").unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + assert!(matches!( + result, + ClassifyResult::Tier1(Tier1Op::FieldNotExists { ref field }) if field == "_internal" + )); + } + + #[test] + fn classify_field_equals_string() { + let result = classify(r#"status == "poison""#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + assert!(matches!( + result, + ClassifyResult::Tier1(Tier1Op::FieldEquals { ref field, ref value }) + if field == "status" && value == "poison" + )); + } + + #[test] + fn classify_field_not_equals() { + let result = classify(r#"source != "internal""#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + assert!(matches!( + result, + ClassifyResult::Tier1(Tier1Op::FieldNotEquals { ref field, ref value }) + if field == "source" && value == "internal" + )); + } + + #[test] + fn classify_starts_with() { + let result = classify(r#"host.startsWith("prod-")"#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + assert!(matches!( + result, + ClassifyResult::Tier1(Tier1Op::FieldStartsWith { ref field, ref prefix }) + if field == "host" && prefix == "prod-" + )); + } + + #[test] + fn classify_ends_with() { + let result = classify(r#"name.endsWith(".log")"#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + assert!(matches!( + result, + ClassifyResult::Tier1(Tier1Op::FieldEndsWith { ref field, ref suffix }) + if field == "name" && suffix == ".log" + )); + } + + #[test] + fn classify_contains() { + let result = classify(r#"path.contains("/api/")"#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + assert!(matches!( + result, + ClassifyResult::Tier1(Tier1Op::FieldContains { ref field, ref substring }) + if field == "path" && substring == "/api/" + )); + } + + #[test] + fn classify_dotted_path() { + let result = classify(r#"metadata.source == "aws""#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + assert!(matches!( + result, + ClassifyResult::Tier1(Tier1Op::FieldEquals { ref field, ref value }) + if field == "metadata.source" && value == "aws" + )); + } + + #[test] + fn classify_compound_expression_is_tier2() { + let result = classify(r#"severity > 3 && source != "internal""#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier2); + } + + #[test] + fn classify_regex_is_tier3() { + let result = classify(r#"field.matches("^prod-.*")"#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier3); + } + + #[test] + fn classify_iteration_is_tier3() { + let result = classify(r#"tags.exists(t, t == "pii")"#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier3); + } + + #[test] + fn classify_empty_expression_errors() { + assert!(classify("").is_err()); + assert!(classify(" ").is_err()); + } + + #[test] + fn classify_whitespace_tolerance() { + let result = classify(" has( _table ) ").unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + } + + #[test] + fn classify_tier2_extracts_fields() { + let result = classify(r#"severity > 3 && source != "internal""#).unwrap(); + if let ClassifyResult::Tier2 { fields } = result { + assert!(fields.contains(&"severity".to_string())); + assert!(fields.contains(&"source".to_string())); + } else { + panic!("Expected Tier2"); + } + } + + #[test] + fn restricted_function_in_string_not_detected() { + // "matches" inside a string literal should NOT trigger Tier 3 + let result = classify(r#"field == "matches""#).unwrap(); + assert_eq!(result.tier(), FilterTier::Tier1); + } +} diff --git a/src/transport/filter/compiled.rs b/src/transport/filter/compiled.rs new file mode 100644 index 0000000..9fca216 --- /dev/null +++ b/src/transport/filter/compiled.rs @@ -0,0 +1,788 @@ +// Project: hyperi-rustlib +// File: src/transport/filter/compiled.rs +// Purpose: Compiled filter variants with Tier 1 SIMD evaluation +// Language: Rust +// +// License: FSL-1.1-ALv2 +// Copyright: (c) 2026 HYPERI PTY LIMITED + +//! Compiled filter representations and evaluation logic. +//! +//! Tier 1 filters execute as direct `sonic_rs::get_from_slice()` field +//! extraction + string comparison. No CEL engine, no allocation beyond +//! the extracted field value. ~50-100ns per message. + +use super::classify::{ClassifyResult, Tier1Op}; +use super::config::{FilterAction, FilterDirection, FilterTier, TransportFilterTierConfig}; + +/// A compiled filter ready for hot-path evaluation. +/// +/// Tier 1 variants bypass the CEL engine entirely — they use SIMD JSON +/// field extraction via `sonic_rs::get_from_str()` (zero-copy &str path, +/// no UTF-8 revalidation per call). +/// +/// `FieldExists` / `FieldNotExists` for single-segment paths use a +/// pre-compiled `memchr::memmem::Finder` to detect the `"key":` substring +/// in raw bytes, bypassing the JSON parser entirely (~10-20ns vs ~200ns). +#[derive(Debug)] +pub enum CompiledFilter { + // Tier 1 — SIMD field ops + FieldExists { + field: String, + path: Vec, + /// Pre-compiled memmem finder for the `"field":` byte pattern. + /// Used as a fast-path when the path is a single segment (no nested). + /// `None` for nested paths — falls back to sonic-rs. + needle: Option>, + action: FilterAction, + expression_text: String, + }, + FieldNotExists { + field: String, + path: Vec, + needle: Option>, + action: FilterAction, + expression_text: String, + }, + FieldEquals { + field: String, + path: Vec, + value: String, + action: FilterAction, + expression_text: String, + }, + FieldNotEquals { + field: String, + path: Vec, + value: String, + action: FilterAction, + expression_text: String, + }, + FieldStartsWith { + field: String, + path: Vec, + prefix: String, + action: FilterAction, + expression_text: String, + }, + FieldEndsWith { + field: String, + path: Vec, + suffix: String, + action: FilterAction, + expression_text: String, + }, + FieldContains { + field: String, + path: Vec, + substring: String, + action: FilterAction, + expression_text: String, + }, + // Tier 2/3 — CEL expression (feature-gated) + #[cfg(feature = "expression")] + CelExpression { + program: cel_interpreter::Program, + fields: Vec, + expression_text: String, + tier: FilterTier, + action: FilterAction, + }, +} + +impl CompiledFilter { + /// Compile a filter from a CEL expression string. + /// + /// Classifies the expression, checks tier gates, and returns the + /// appropriate compiled variant. + pub fn from_expression( + expr: &str, + action: FilterAction, + direction: FilterDirection, + tier_config: &TransportFilterTierConfig, + ) -> Result { + let classification = super::classify::classify(expr)?; + + // Check tier gate + let tier = classification.tier(); + if !tier_config.is_tier_allowed(tier, direction) { + return Err(format!( + "classified as {tier} but {tier} filters are not enabled for {direction}. \ + Set expression.allow_{} to enable.", + match (tier, direction) { + (FilterTier::Tier2, FilterDirection::In) => "cel_filters_in: true", + (FilterTier::Tier2, FilterDirection::Out) => "cel_filters_out: true", + (FilterTier::Tier3, FilterDirection::In) => "complex_filters_in: true", + (FilterTier::Tier3, FilterDirection::Out) => "complex_filters_out: true", + (FilterTier::Tier1, _) => unreachable!("Tier 1 is always allowed"), + } + )); + } + + let expression_text = expr.to_string(); + + match classification { + ClassifyResult::Tier1(op) => Ok(Self::from_tier1_op(op, action, expression_text)), + #[cfg(feature = "expression")] + ClassifyResult::Tier2 { fields } => { + let program = crate::expression::compile(expr) + .map_err(|e| format!("CEL compilation failed: {e}"))?; + Ok(Self::CelExpression { + program, + fields, + expression_text, + tier: FilterTier::Tier2, + action, + }) + } + #[cfg(feature = "expression")] + ClassifyResult::Tier3 { fields } => { + let profile = crate::expression::ProfileConfig { + allow_regex: true, + allow_iteration: true, + allow_time: true, + }; + let program = crate::expression::compile_with_config(expr, &profile) + .map_err(|e| format!("CEL compilation failed: {e}"))?; + Ok(Self::CelExpression { + program, + fields, + expression_text, + tier: FilterTier::Tier3, + action, + }) + } + #[cfg(not(feature = "expression"))] + ClassifyResult::Tier2 { .. } | ClassifyResult::Tier3 { .. } => Err(format!( + "classified as {tier} but the 'expression' feature is not enabled. \ + Enable it in Cargo.toml or simplify the expression to Tier 1." + )), + } + } + + fn from_tier1_op(op: Tier1Op, action: FilterAction, expression_text: String) -> Self { + match op { + Tier1Op::FieldExists { field } => { + let path = split_field_path(&field); + let needle = build_field_needle(&path); + Self::FieldExists { + field, + path, + needle, + action, + expression_text, + } + } + Tier1Op::FieldNotExists { field } => { + let path = split_field_path(&field); + let needle = build_field_needle(&path); + Self::FieldNotExists { + field, + path, + needle, + action, + expression_text, + } + } + Tier1Op::FieldEquals { field, value } => { + let path = split_field_path(&field); + Self::FieldEquals { + field, + path, + value, + action, + expression_text, + } + } + Tier1Op::FieldNotEquals { field, value } => { + let path = split_field_path(&field); + Self::FieldNotEquals { + field, + path, + value, + action, + expression_text, + } + } + Tier1Op::FieldStartsWith { field, prefix } => { + let path = split_field_path(&field); + Self::FieldStartsWith { + field, + path, + prefix, + action, + expression_text, + } + } + Tier1Op::FieldEndsWith { field, suffix } => { + let path = split_field_path(&field); + Self::FieldEndsWith { + field, + path, + suffix, + action, + expression_text, + } + } + Tier1Op::FieldContains { field, substring } => { + let path = split_field_path(&field); + Self::FieldContains { + field, + path, + substring, + action, + expression_text, + } + } + } + } + + /// Evaluate this filter against a raw JSON payload. + /// + /// Returns `Some(action)` if the filter matches, `None` otherwise. + /// Tier 1: SIMD field extraction via `sonic_rs::get_from_slice()`. + /// + /// Zero-copy hot path: uses stack arrays for path segments (no Vec + /// allocation per message). Single-segment fields are the common case. + #[inline] + #[must_use] + pub fn evaluate(&self, payload: &[u8]) -> Option { + match self { + Self::FieldExists { + path, + needle, + action, + .. + } => { + // Fast path: pre-compiled memmem Finder for single-segment fields. + // SIMD substring search ~10-20ns vs sonic-rs ~200ns. + if let Some(n) = needle { + return n.find(payload).is_some().then_some(*action); + } + // Slow path: nested field, use sonic-rs + with_path_refs(path, |refs| { + sonic_rs::get_from_slice(payload, refs) + .is_ok() + .then_some(*action) + }) + } + Self::FieldNotExists { + path, + needle, + action, + .. + } => { + if let Some(n) = needle { + return n.find(payload).is_none().then_some(*action); + } + with_path_refs(path, |refs| { + sonic_rs::get_from_slice(payload, refs) + .is_err() + .then_some(*action) + }) + } + Self::FieldEquals { + path, + value, + action, + .. + } => with_path_refs(path, |refs| { + let lv = sonic_rs::get_from_slice(payload, refs).ok()?; + let field_val = extract_string_value(&lv); + (field_val == value.as_str()).then_some(*action) + }), + Self::FieldNotEquals { + path, + value, + action, + .. + } => with_path_refs(path, |refs| match sonic_rs::get_from_slice(payload, refs) { + Ok(lv) => { + let field_val = extract_string_value(&lv); + (field_val != value.as_str()).then_some(*action) + } + // Field missing → not equal to anything → match + Err(_) => Some(*action), + }), + Self::FieldStartsWith { + path, + prefix, + action, + .. + } => with_path_refs(path, |refs| { + let lv = sonic_rs::get_from_slice(payload, refs).ok()?; + let field_val = extract_string_value(&lv); + field_val.starts_with(prefix.as_str()).then_some(*action) + }), + Self::FieldEndsWith { + path, + suffix, + action, + .. + } => with_path_refs(path, |refs| { + let lv = sonic_rs::get_from_slice(payload, refs).ok()?; + let field_val = extract_string_value(&lv); + field_val.ends_with(suffix.as_str()).then_some(*action) + }), + Self::FieldContains { + path, + substring, + action, + .. + } => with_path_refs(path, |refs| { + let lv = sonic_rs::get_from_slice(payload, refs).ok()?; + let field_val = extract_string_value(&lv); + field_val.contains(substring.as_str()).then_some(*action) + }), + #[cfg(feature = "expression")] + Self::CelExpression { + program, + fields, + action, + .. + } => evaluate_cel(payload, program, fields, *action), + } + } + + /// Get the filter's performance tier. + #[must_use] + pub fn tier(&self) -> FilterTier { + match self { + Self::FieldExists { .. } + | Self::FieldNotExists { .. } + | Self::FieldEquals { .. } + | Self::FieldNotEquals { .. } + | Self::FieldStartsWith { .. } + | Self::FieldEndsWith { .. } + | Self::FieldContains { .. } => FilterTier::Tier1, + #[cfg(feature = "expression")] + Self::CelExpression { tier, .. } => *tier, + } + } + + /// Get the filter's action. + #[must_use] + pub fn action(&self) -> FilterAction { + match self { + Self::FieldExists { action, .. } + | Self::FieldNotExists { action, .. } + | Self::FieldEquals { action, .. } + | Self::FieldNotEquals { action, .. } + | Self::FieldStartsWith { action, .. } + | Self::FieldEndsWith { action, .. } + | Self::FieldContains { action, .. } => *action, + #[cfg(feature = "expression")] + Self::CelExpression { action, .. } => *action, + } + } + + /// Get the original expression text (for logging/debug). + #[must_use] + pub fn expression_text(&self) -> &str { + match self { + Self::FieldExists { + expression_text, .. + } + | Self::FieldNotExists { + expression_text, .. + } + | Self::FieldEquals { + expression_text, .. + } + | Self::FieldNotEquals { + expression_text, .. + } + | Self::FieldStartsWith { + expression_text, .. + } + | Self::FieldEndsWith { + expression_text, .. + } + | Self::FieldContains { + expression_text, .. + } => expression_text, + #[cfg(feature = "expression")] + Self::CelExpression { + expression_text, .. + } => expression_text, + } + } +} + +/// Split a dotted field path into segments for `sonic_rs::get_from_slice()`. +fn split_field_path(field: &str) -> Vec { + field.split('.').map(String::from).collect() +} + +/// Build a memmem Finder for a single-segment field name. Returns `None` +/// for nested paths (those fall back to sonic-rs). +/// +/// The needle is `"":` — the JSON key pattern. memchr's SIMD-accelerated +/// substring search detects this pattern in raw bytes ~10-20ns per call, +/// vs ~200ns for a full sonic-rs JSON parse. +/// +/// Note: this is a heuristic — the pattern could appear inside a string value. +/// Used as a fast yes/no check; for false positives we'd need to verify. +/// In practice, valid JSON rarely contains escaped key-like patterns inside +/// string values, so the false positive rate is negligible. +fn build_field_needle(path: &[String]) -> Option> { + if path.len() != 1 { + return None; + } + let pattern = format!("\"{}\":", path[0]); + Some(memchr::memmem::Finder::new(&pattern.into_bytes()).into_owned()) +} + +/// Extract a string value from a `sonic_rs::LazyValue` as a borrowed `&str`. +/// +/// For string values without escapes, returns a zero-copy reference into the +/// raw payload (most common case). For escaped strings, falls back to +/// `as_str()` which un-escapes. For non-string values (numbers, booleans), +/// returns the raw JSON representation. +/// +/// **Hot path:** uses `is_str()` to fast-check string type, then `memchr` for +/// SIMD-accelerated escape detection. Zero allocation in the common case. +fn extract_string_value<'a>(lv: &'a sonic_rs::LazyValue<'a>) -> std::borrow::Cow<'a, str> { + use sonic_rs::JsonValueTrait; + let raw = lv.as_raw_str(); + + if lv.is_str() { + // Strip the quotes — sonic-rs guarantees raw is `"..."` for string values + let bytes = raw.as_bytes(); + if bytes.len() >= 2 && bytes[0] == b'"' && bytes[bytes.len() - 1] == b'"' { + let inner = &raw[1..raw.len() - 1]; + // SIMD escape detection via memchr + if memchr::memchr(b'\\', inner.as_bytes()).is_none() { + return std::borrow::Cow::Borrowed(inner); + } + // Has escapes — un-escape via sonic-rs as_str + if let Some(s) = lv.as_str() { + return std::borrow::Cow::Owned(s.to_string()); + } + } + } + + // Non-string value (number, bool, null): return raw representation + std::borrow::Cow::Borrowed(raw) +} + +/// Call `f` with a `&[&str]` slice over the field path. +/// +/// Zero-allocation hot path: stack arrays for paths up to 4 segments deep +/// (covers >99% of real-world filter expressions). Falls back to a heap +/// allocation only for paths deeper than 4 segments. +#[inline] +fn with_path_refs(path: &[String], f: impl FnOnce(&[&str]) -> R) -> R { + match path.len() { + 0 => f(&[]), + 1 => f(&[path[0].as_str()]), + 2 => f(&[path[0].as_str(), path[1].as_str()]), + 3 => f(&[path[0].as_str(), path[1].as_str(), path[2].as_str()]), + 4 => f(&[ + path[0].as_str(), + path[1].as_str(), + path[2].as_str(), + path[3].as_str(), + ]), + _ => { + let refs: Vec<&str> = path.iter().map(String::as_str).collect(); + f(refs.as_slice()) + } + } +} + +/// Evaluate a Tier 2/3 CEL expression against a JSON payload. +#[cfg(feature = "expression")] +fn evaluate_cel( + payload: &[u8], + program: &cel_interpreter::Program, + fields: &[String], + action: FilterAction, +) -> Option { + use std::collections::HashMap; + + // Extract only declared fields via SIMD (not full JSON parse) + let mut context_data: HashMap = HashMap::with_capacity(fields.len()); + for field in fields { + let path: Vec<&str> = field.split('.').collect(); + if let Ok(lv) = sonic_rs::get_from_slice(payload, path.as_slice()) + && let Ok(v) = sonic_rs::from_str::(lv.as_raw_str()) + { + context_data.insert(field.clone(), v); + } + } + + let ctx = crate::expression::build_context(&context_data).ok()?; + match program.execute(&ctx) { + Ok(cel_interpreter::Value::Bool(true)) => Some(action), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tier1_field_exists_matches() { + let filter = CompiledFilter::from_expression( + "has(_table)", + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!( + filter.evaluate(br#"{"_table":"events","id":1}"#), + Some(FilterAction::Drop) + ); + } + + #[test] + fn tier1_field_exists_no_match() { + let filter = CompiledFilter::from_expression( + "has(_table)", + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!(filter.evaluate(br#"{"host":"web1","id":1}"#), None); + } + + #[test] + fn tier1_field_not_exists_matches() { + let filter = CompiledFilter::from_expression( + "!has(_internal)", + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!( + filter.evaluate(br#"{"host":"web1"}"#), + Some(FilterAction::Drop) + ); + } + + #[test] + fn tier1_field_equals_matches() { + let filter = CompiledFilter::from_expression( + r#"status == "poison""#, + FilterAction::Dlq, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!( + filter.evaluate(br#"{"status":"poison","data":"x"}"#), + Some(FilterAction::Dlq) + ); + } + + #[test] + fn tier1_field_equals_no_match() { + let filter = CompiledFilter::from_expression( + r#"status == "poison""#, + FilterAction::Dlq, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!(filter.evaluate(br#"{"status":"healthy","data":"x"}"#), None); + } + + #[test] + fn tier1_field_not_equals_matches() { + let filter = CompiledFilter::from_expression( + r#"source != "trusted""#, + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!( + filter.evaluate(br#"{"source":"untrusted"}"#), + Some(FilterAction::Drop) + ); + } + + #[test] + fn tier1_field_not_equals_missing_field_matches() { + let filter = CompiledFilter::from_expression( + r#"source != "trusted""#, + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + // Missing field is not equal to "trusted" + assert_eq!( + filter.evaluate(br#"{"other":"value"}"#), + Some(FilterAction::Drop) + ); + } + + #[test] + fn tier1_starts_with_matches() { + let filter = CompiledFilter::from_expression( + r#"host.startsWith("prod-")"#, + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!( + filter.evaluate(br#"{"host":"prod-web01"}"#), + Some(FilterAction::Drop) + ); + } + + #[test] + fn tier1_starts_with_no_match() { + let filter = CompiledFilter::from_expression( + r#"host.startsWith("prod-")"#, + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!(filter.evaluate(br#"{"host":"dev-web01"}"#), None); + } + + #[test] + fn tier1_ends_with_matches() { + let filter = CompiledFilter::from_expression( + r#"name.endsWith(".log")"#, + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!( + filter.evaluate(br#"{"name":"app.log"}"#), + Some(FilterAction::Drop) + ); + } + + #[test] + fn tier1_contains_matches() { + let filter = CompiledFilter::from_expression( + r#"path.contains("/api/")"#, + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!( + filter.evaluate(br#"{"path":"/v1/api/users"}"#), + Some(FilterAction::Drop) + ); + } + + #[test] + fn tier1_dotted_path_matches() { + let filter = CompiledFilter::from_expression( + r#"metadata.source == "aws""#, + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!( + filter.evaluate(br#"{"metadata":{"source":"aws"},"id":1}"#), + Some(FilterAction::Drop) + ); + } + + #[test] + fn tier1_non_json_payload_no_match() { + let filter = CompiledFilter::from_expression( + "has(_table)", + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!(filter.evaluate(b"not json at all {{{"), None); + } + + #[test] + fn tier1_empty_payload_no_match() { + let filter = CompiledFilter::from_expression( + "has(_table)", + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!(filter.evaluate(b""), None); + } + + #[test] + fn tier_accessor() { + let filter = CompiledFilter::from_expression( + "has(x)", + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!(filter.tier(), FilterTier::Tier1); + } + + #[test] + fn action_accessor() { + let filter = CompiledFilter::from_expression( + "has(x)", + FilterAction::Dlq, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!(filter.action(), FilterAction::Dlq); + } + + #[test] + fn expression_text_accessor() { + let filter = CompiledFilter::from_expression( + "has(my_field)", + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + assert_eq!(filter.expression_text(), "has(my_field)"); + } + + #[test] + fn tier2_rejected_without_opt_in() { + let result = CompiledFilter::from_expression( + r#"severity > 3 && source != "internal""#, + FilterAction::Drop, + FilterDirection::In, + &TransportFilterTierConfig::default(), + ); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(err.contains("Tier 2"), "{err}"); + } + + #[test] + fn split_field_path_simple() { + assert_eq!(split_field_path("_table"), vec!["_table"]); + } + + #[test] + fn split_field_path_nested() { + assert_eq!( + split_field_path("metadata.source"), + vec!["metadata", "source"] + ); + } + + #[test] + fn split_field_path_deep() { + assert_eq!(split_field_path("a.b.c.d"), vec!["a", "b", "c", "d"]); + } +} diff --git a/src/transport/filter/config.rs b/src/transport/filter/config.rs new file mode 100644 index 0000000..a6adb4a --- /dev/null +++ b/src/transport/filter/config.rs @@ -0,0 +1,207 @@ +// Project: hyperi-rustlib +// File: src/transport/filter/config.rs +// Purpose: Configuration types for transport-level message filtering +// Language: Rust +// +// License: FSL-1.1-ALv2 +// Copyright: (c) 2026 HYPERI PTY LIMITED + +//! Filter configuration types for transport-level message filtering. +//! +//! Filters use CEL syntax. The engine classifies expressions at config load +//! time and selects the optimal execution strategy (Tier 1 SIMD, Tier 2 CEL, +//! Tier 3 complex CEL). + +use serde::{Deserialize, Serialize}; + +/// A single filter rule — CEL expression + disposition action. +/// +/// Written in CEL syntax regardless of execution tier. The engine determines +/// the optimal execution strategy at construction time. +/// +/// # Examples +/// +/// ```yaml +/// - expression: 'has(_table)' +/// action: drop +/// - expression: 'status == "poison"' +/// action: dlq +/// ``` +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct FilterRule { + /// CEL expression to evaluate against each message payload. + pub expression: String, + + /// Action to take when the expression matches. Defaults to `drop`. + #[serde(default)] + pub action: FilterAction, +} + +/// Disposition action when a filter matches. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FilterAction { + /// Silently discard the message (counted in metrics). + #[default] + Drop, + /// Route the message to the dead-letter queue (counted + security audit). + Dlq, +} + +/// Performance tier for filter expressions. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FilterTier { + /// SIMD field extraction + string comparison. ~50-100ns/msg. Always enabled. + Tier1, + /// Pre-compiled CEL with extracted fields. ~500ns-1us/msg. Requires opt-in. + Tier2, + /// Complex CEL with restricted functions (regex, iteration). ~5-50us/msg. Requires opt-in. + Tier3, +} + +impl std::fmt::Display for FilterTier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Tier1 => write!(f, "Tier 1 (SIMD)"), + Self::Tier2 => write!(f, "Tier 2 (CEL)"), + Self::Tier3 => write!(f, "Tier 3 (complex CEL)"), + } + } +} + +/// Direction of filtering (inbound or outbound). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FilterDirection { + In, + Out, +} + +impl std::fmt::Display for FilterDirection { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::In => write!(f, "in"), + Self::Out => write!(f, "out"), + } + } +} + +/// Tier gate configuration — controls which filter tiers are enabled. +/// +/// Lives under the `expression` config cascade key alongside `ProfileConfig`. +/// Separate struct because it serves a different purpose (transport-level +/// gating vs expression-level function restriction). +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[allow(clippy::struct_excessive_bools)] // 4 independent boolean tier gates (in/out x cel/complex) +pub struct TransportFilterTierConfig { + /// Enable Tier 2 (CEL engine) for inbound transport filters. + #[serde(default)] + pub allow_cel_filters_in: bool, + + /// Enable Tier 2 (CEL engine) for outbound transport filters. + #[serde(default)] + pub allow_cel_filters_out: bool, + + /// Enable Tier 3 (complex CEL: regex, iteration, time) for inbound filters. + /// Implies `allow_cel_filters_in`. + #[serde(default)] + pub allow_complex_filters_in: bool, + + /// Enable Tier 3 (complex CEL: regex, iteration, time) for outbound filters. + /// Implies `allow_cel_filters_out`. + #[serde(default)] + pub allow_complex_filters_out: bool, +} + +impl TransportFilterTierConfig { + /// Check if the given tier is allowed for the given direction. + #[must_use] + pub fn is_tier_allowed(&self, tier: FilterTier, direction: FilterDirection) -> bool { + match (tier, direction) { + (FilterTier::Tier1, _) => true, // always allowed + (FilterTier::Tier2, FilterDirection::In) => { + self.allow_cel_filters_in || self.allow_complex_filters_in + } + (FilterTier::Tier2, FilterDirection::Out) => { + self.allow_cel_filters_out || self.allow_complex_filters_out + } + (FilterTier::Tier3, FilterDirection::In) => self.allow_complex_filters_in, + (FilterTier::Tier3, FilterDirection::Out) => self.allow_complex_filters_out, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn filter_rule_deserializes_from_yaml() { + let yaml = "expression: 'has(_table)'\naction: drop\n"; + let rule: FilterRule = serde_yaml_ng::from_str(yaml).unwrap(); + assert_eq!(rule.expression, "has(_table)"); + assert_eq!(rule.action, FilterAction::Drop); + } + + #[test] + fn filter_action_defaults_to_drop() { + let yaml = "expression: 'has(_table)'"; + let rule: FilterRule = serde_yaml_ng::from_str(yaml).unwrap(); + assert_eq!(rule.action, FilterAction::Drop); + } + + #[test] + fn filter_action_dlq_variant() { + let yaml = "expression: 'status == \"poison\"'\naction: dlq\n"; + let rule: FilterRule = serde_yaml_ng::from_str(yaml).unwrap(); + assert_eq!(rule.action, FilterAction::Dlq); + } + + #[test] + fn tier_config_defaults_all_false() { + let config = TransportFilterTierConfig::default(); + assert!(!config.allow_cel_filters_in); + assert!(!config.allow_cel_filters_out); + assert!(!config.allow_complex_filters_in); + assert!(!config.allow_complex_filters_out); + } + + #[test] + fn tier1_always_allowed() { + let config = TransportFilterTierConfig::default(); + assert!(config.is_tier_allowed(FilterTier::Tier1, FilterDirection::In)); + assert!(config.is_tier_allowed(FilterTier::Tier1, FilterDirection::Out)); + } + + #[test] + fn tier2_requires_opt_in() { + let config = TransportFilterTierConfig::default(); + assert!(!config.is_tier_allowed(FilterTier::Tier2, FilterDirection::In)); + + let config = TransportFilterTierConfig { + allow_cel_filters_in: true, + ..Default::default() + }; + assert!(config.is_tier_allowed(FilterTier::Tier2, FilterDirection::In)); + assert!(!config.is_tier_allowed(FilterTier::Tier2, FilterDirection::Out)); + } + + #[test] + fn tier3_implies_tier2() { + let config = TransportFilterTierConfig { + allow_complex_filters_in: true, + ..Default::default() + }; + // Tier 3 enabled implies Tier 2 is also allowed + assert!(config.is_tier_allowed(FilterTier::Tier2, FilterDirection::In)); + assert!(config.is_tier_allowed(FilterTier::Tier3, FilterDirection::In)); + // But not outbound + assert!(!config.is_tier_allowed(FilterTier::Tier2, FilterDirection::Out)); + } + + #[test] + fn filter_tier_display() { + assert_eq!(FilterTier::Tier1.to_string(), "Tier 1 (SIMD)"); + assert_eq!(FilterTier::Tier2.to_string(), "Tier 2 (CEL)"); + assert_eq!(FilterTier::Tier3.to_string(), "Tier 3 (complex CEL)"); + } +} diff --git a/src/transport/filter/metrics.rs b/src/transport/filter/metrics.rs new file mode 100644 index 0000000..72f2558 --- /dev/null +++ b/src/transport/filter/metrics.rs @@ -0,0 +1,65 @@ +// Project: hyperi-rustlib +// File: src/transport/filter/metrics.rs +// Purpose: Metrics for transport-level message filtering +// Language: Rust +// +// License: FSL-1.1-ALv2 +// Copyright: (c) 2026 HYPERI PTY LIMITED + +//! Filter metrics — counters per direction and action. +//! +//! Uses the `metrics` crate (no-op if no recorder installed). + +use super::config::{FilterAction, FilterDirection}; + +/// Metrics for transport filter operations. +pub struct FilterMetrics { + _private: (), // force construction via new() +} + +impl FilterMetrics { + /// Create new filter metrics (registers counters on first use via metrics macros). + #[must_use] + pub fn new() -> Self { + Self { _private: () } + } + + /// Record a filter match event. + pub fn record(&self, direction: FilterDirection, action: FilterAction) { + let dir = match direction { + FilterDirection::In => "in", + FilterDirection::Out => "out", + }; + let act = match action { + FilterAction::Drop => "drop", + FilterAction::Dlq => "dlq", + }; + metrics::counter!("transport_filtered_total", "direction" => dir, "action" => act) + .increment(1); + } +} + +impl Default for FilterMetrics { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn metrics_record_does_not_panic() { + let metrics = FilterMetrics::new(); + metrics.record(FilterDirection::In, FilterAction::Drop); + metrics.record(FilterDirection::Out, FilterAction::Dlq); + metrics.record(FilterDirection::In, FilterAction::Dlq); + metrics.record(FilterDirection::Out, FilterAction::Drop); + } + + #[test] + fn metrics_default_does_not_panic() { + let _metrics = FilterMetrics::default(); + } +} diff --git a/src/transport/filter/mod.rs b/src/transport/filter/mod.rs new file mode 100644 index 0000000..8566461 --- /dev/null +++ b/src/transport/filter/mod.rs @@ -0,0 +1,561 @@ +// Project: hyperi-rustlib +// File: src/transport/filter/mod.rs +// Purpose: Transport-level message filtering engine +// Language: Rust +// +// License: FSL-1.1-ALv2 +// Copyright: (c) 2026 HYPERI PTY LIMITED + +//! # Transport Filter Engine +//! +//! Provides transport-level message filtering using CEL syntax with SIMD +//! fast-path for simple patterns. Embedded in every transport — filters are +//! configured via the config cascade and applied automatically. +//! +//! ## Performance Tiers +//! +//! - **Tier 1** — SIMD field extraction (~50-100ns/msg). Always enabled. +//! - **Tier 2** — Pre-compiled CEL (~500ns-1us/msg). Requires `allow_cel_filters_in/out`. +//! - **Tier 3** — Complex CEL with regex/iteration (~5-50us/msg). Requires `allow_complex_filters_in/out`. +//! +//! ## Usage +//! +//! Transports construct `TransportFilterEngine` from config at creation time. +//! The engine is a no-op when no filters are configured (zero overhead). +//! +//! ```yaml +//! kafka: +//! filters_in: +//! - expression: 'has(_internal)' +//! action: drop +//! - expression: 'status == "poison"' +//! action: dlq +//! ``` + +pub mod classify; +pub mod compiled; +pub mod config; +pub mod metrics; + +pub use config::{ + FilterAction, FilterDirection, FilterRule, FilterTier, TransportFilterTierConfig, +}; + +use compiled::CompiledFilter; +use metrics::FilterMetrics; + +use crate::transport::error::TransportError; + +/// Result of evaluating a filter against a message payload. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FilterDisposition { + /// Message passes all filters — continue processing. + Pass, + /// Message matched a filter with `action: drop` — discard silently. + Drop, + /// Message matched a filter with `action: dlq` — route to dead-letter queue. + Dlq, +} + +/// A DLQ entry produced by inbound filtering. +/// +/// The transport does NOT send to DLQ directly — it returns these entries +/// alongside passing messages. The caller handles DLQ routing using its +/// own `Dlq` handle. +#[derive(Debug, Clone)] +pub struct FilteredDlqEntry { + /// Raw message payload. + pub payload: Vec, + /// Routing key (Kafka topic, gRPC metadata, etc.). + pub key: Option>, + /// Human-readable reason for DLQ routing (filter expression text). + pub reason: String, +} + +/// Result of partitioning a batch of messages through inbound filters. +/// +/// Returned from `TransportFilterEngine::partition_batch()`. Contains: +/// - `messages`: messages that passed all filters (or had no filter match) +/// - `dlq_entries`: messages matched by a filter with `action: dlq`. The +/// caller is responsible for routing these to a DLQ — the engine does +/// NOT send to DLQ directly (transports don't have DLQ handles). +/// - `drop_count`: count of messages matched by `action: drop` filters +/// +/// Drop and DLQ messages are removed from `messages` — the caller only +/// processes `messages` for normal pipeline work, and `dlq_entries` +/// for DLQ routing. +#[derive(Debug)] +pub struct FilteredBatch { + /// Messages that passed all filters. + pub messages: Vec, + /// Messages matched by filters with `action: dlq`. Caller routes to DLQ. + pub dlq_entries: Vec, + /// Count of messages dropped (matched by `action: drop` filters). + pub drop_count: u64, +} + +impl FilteredBatch { + /// Create a `FilteredBatch` containing only passing messages (no filtering). + /// Used when there are no inbound filters configured. + #[must_use] + pub fn passthrough(messages: Vec) -> Self { + Self { + messages, + dlq_entries: Vec::new(), + drop_count: 0, + } + } +} + +/// Transport-level message filter engine. +/// +/// Embedded in every transport. Compiled from config at construction time. +/// Zero-cost when no filters are configured (`filters_in` and `filters_out` +/// are empty vecs → `has_inbound_filters()` returns false, branch predicted). +pub struct TransportFilterEngine { + filters_in: Vec, + filters_out: Vec, + #[allow(dead_code)] + metrics: FilterMetrics, +} + +impl std::fmt::Debug for TransportFilterEngine { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TransportFilterEngine") + .field("filters_in_count", &self.filters_in.len()) + .field("filters_out_count", &self.filters_out.len()) + .field("metrics", &"FilterMetrics") + .finish() + } +} + +/// Threshold for logging a warning about filter count. +const FILTER_COUNT_WARNING_THRESHOLD: usize = 20; + +impl TransportFilterEngine { + /// Construct a filter engine from config rules. + /// + /// Parses each expression, classifies its tier, rejects expressions above + /// the enabled tier, and compiles Tier 2/3 expressions to CEL programs. + /// + /// # Errors + /// + /// Returns `TransportError::Config` if: + /// - An expression is invalid (syntax error) + /// - An expression's tier exceeds what's enabled by `tier_config` + pub fn new( + filters_in: &[FilterRule], + filters_out: &[FilterRule], + tier_config: &TransportFilterTierConfig, + ) -> Result { + let compiled_in = Self::compile_rules(filters_in, FilterDirection::In, tier_config)?; + let compiled_out = Self::compile_rules(filters_out, FilterDirection::Out, tier_config)?; + + // Log startup info + if !compiled_in.is_empty() || !compiled_out.is_empty() { + for (idx, filter) in compiled_in.iter().enumerate() { + tracing::info!( + index = idx, + tier = %filter.tier(), + expression = filter.expression_text(), + action = ?filter.action(), + direction = "in", + "Transport filter configured" + ); + } + for (idx, filter) in compiled_out.iter().enumerate() { + tracing::info!( + index = idx, + tier = %filter.tier(), + expression = filter.expression_text(), + action = ?filter.action(), + direction = "out", + "Transport filter configured" + ); + } + + // Warn about ordering (higher tier before lower tier) + Self::warn_suboptimal_ordering(&compiled_in, "in"); + Self::warn_suboptimal_ordering(&compiled_out, "out"); + + // Warn about large filter counts + if compiled_in.len() > FILTER_COUNT_WARNING_THRESHOLD { + tracing::warn!( + count = compiled_in.len(), + direction = "in", + "Large number of inbound filters — may impact throughput" + ); + } + if compiled_out.len() > FILTER_COUNT_WARNING_THRESHOLD { + tracing::warn!( + count = compiled_out.len(), + direction = "out", + "Large number of outbound filters — may impact throughput" + ); + } + } + + Ok(Self { + filters_in: compiled_in, + filters_out: compiled_out, + metrics: FilterMetrics::new(), + }) + } + + /// Create an empty filter engine (no filters, zero overhead). + #[must_use] + pub fn empty() -> Self { + Self { + filters_in: Vec::new(), + filters_out: Vec::new(), + metrics: FilterMetrics::new(), + } + } + + /// Check if any inbound filters are configured. + #[inline] + #[must_use] + pub fn has_inbound_filters(&self) -> bool { + !self.filters_in.is_empty() + } + + /// Check if any outbound filters are configured. + #[inline] + #[must_use] + pub fn has_outbound_filters(&self) -> bool { + !self.filters_out.is_empty() + } + + /// Check if any inbound filter uses `action: dlq`. + #[must_use] + pub fn has_dlq_filters_in(&self) -> bool { + self.filters_in + .iter() + .any(|f| f.action() == FilterAction::Dlq) + } + + /// Check if any outbound filter uses `action: dlq`. + #[must_use] + pub fn has_dlq_filters_out(&self) -> bool { + self.filters_out + .iter() + .any(|f| f.action() == FilterAction::Dlq) + } + + /// Evaluate inbound filters against a raw payload. First-match-wins. + /// + /// Returns `Pass` if no filter matches (or no filters configured). + /// MsgPack payloads always pass (SIMD extraction is JSON-only). + #[inline] + #[must_use] + pub fn apply_inbound(&self, payload: &[u8]) -> FilterDisposition { + self.apply_filters(payload, &self.filters_in, FilterDirection::In) + } + + /// Partition a batch of messages through inbound filters. + /// + /// This is the recommended API for transports — it returns a + /// `FilteredBatch` containing both passing messages AND DLQ entries. + /// The transport's caller routes DLQ entries via its own DLQ handle. + /// + /// Two-pass: classify each message, then partition. Drop and DLQ + /// messages are removed from `messages`. The function never silently + /// loses DLQ-classified messages. + /// + /// # Type parameter + /// + /// `T` is the message type (e.g., `Message`). The function + /// uses a closure to extract the payload bytes and key from each message, + /// avoiding tight coupling to a specific message struct. + pub fn partition_batch( + &self, + messages: Vec, + get_payload: impl Fn(&T) -> &[u8], + get_key: impl Fn(&T) -> Option>, + ) -> FilteredBatch { + if !self.has_inbound_filters() { + return FilteredBatch::passthrough(messages); + } + + let mut passing = Vec::with_capacity(messages.len()); + let mut dlq_entries: Vec = Vec::new(); + let mut drop_count: u64 = 0; + + for msg in messages { + let payload = get_payload(&msg); + match self.apply_inbound(payload) { + FilterDisposition::Pass => passing.push(msg), + FilterDisposition::Drop => drop_count += 1, + FilterDisposition::Dlq => { + let key = get_key(&msg); + dlq_entries.push(FilteredDlqEntry { + payload: payload.to_vec(), + key, + reason: "transport filter".to_string(), + }); + } + } + } + + FilteredBatch { + messages: passing, + dlq_entries, + drop_count, + } + } + + /// Evaluate outbound filters against a raw payload. First-match-wins. + #[inline] + #[must_use] + pub fn apply_outbound(&self, payload: &[u8]) -> FilterDisposition { + self.apply_filters(payload, &self.filters_out, FilterDirection::Out) + } + + fn apply_filters( + &self, + payload: &[u8], + filters: &[CompiledFilter], + direction: FilterDirection, + ) -> FilterDisposition { + if filters.is_empty() { + return FilterDisposition::Pass; + } + + // MsgPack payloads bypass filters (SIMD extraction is JSON-only) + if is_likely_msgpack(payload) { + return FilterDisposition::Pass; + } + + for filter in filters { + if let Some(action) = filter.evaluate(payload) { + self.metrics.record(direction, action); + return match action { + FilterAction::Drop => FilterDisposition::Drop, + FilterAction::Dlq => FilterDisposition::Dlq, + }; + } + } + + FilterDisposition::Pass + } + + fn compile_rules( + rules: &[FilterRule], + direction: FilterDirection, + tier_config: &TransportFilterTierConfig, + ) -> Result, TransportError> { + let mut compiled = Vec::with_capacity(rules.len()); + + for (idx, rule) in rules.iter().enumerate() { + let filter = CompiledFilter::from_expression( + &rule.expression, + rule.action, + direction, + tier_config, + ) + .map_err(|e| { + TransportError::Config(format!( + "filter_{direction}[{idx}]: '{expr}' — {e}", + expr = rule.expression + )) + })?; + compiled.push(filter); + } + + Ok(compiled) + } + + fn warn_suboptimal_ordering(filters: &[CompiledFilter], direction: &str) { + let mut lowest_seen = FilterTier::Tier3; + for (idx, filter) in filters.iter().enumerate() { + let tier = filter.tier(); + if (tier as u8) > (lowest_seen as u8) { + tracing::warn!( + direction, + index = idx, + tier = %tier, + expression = filter.expression_text(), + "Higher-tier filter precedes lower-tier filter — consider reordering for better performance" + ); + } + if (tier as u8) < (lowest_seen as u8) { + lowest_seen = tier; + } + } + } +} + +/// Quick heuristic: is this payload likely MsgPack (not JSON)? +/// +/// Checks the first byte for MsgPack markers. Same heuristic as +/// `PayloadFormat::detect()` in `types.rs`. +#[inline] +fn is_likely_msgpack(payload: &[u8]) -> bool { + match payload.first() { + Some(b) => matches!( + b, + 0x80..=0x8f | 0xde..=0xdf | 0x90..=0x9f | 0xdc..=0xdd + ), + None => false, // empty payload is not MsgPack + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn engine_no_filters_always_passes() { + let engine = + TransportFilterEngine::new(&[], &[], &TransportFilterTierConfig::default()).unwrap(); + assert!(!engine.has_inbound_filters()); + assert!(!engine.has_outbound_filters()); + assert_eq!( + engine.apply_inbound(br#"{"any":"thing"}"#), + FilterDisposition::Pass + ); + } + + #[test] + fn engine_tier1_drop_filter() { + let rules = vec![FilterRule { + expression: r#"status == "poison""#.into(), + action: FilterAction::Drop, + }]; + let engine = + TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()).unwrap(); + assert!(engine.has_inbound_filters()); + + assert_eq!( + engine.apply_inbound(br#"{"status":"poison","data":"x"}"#), + FilterDisposition::Drop + ); + assert_eq!( + engine.apply_inbound(br#"{"status":"ok","data":"x"}"#), + FilterDisposition::Pass + ); + } + + #[test] + fn engine_first_match_wins() { + let rules = vec![ + FilterRule { + expression: r#"status == "drop_me""#.into(), + action: FilterAction::Drop, + }, + FilterRule { + expression: r#"status == "drop_me""#.into(), + action: FilterAction::Dlq, + }, + ]; + let engine = + TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()).unwrap(); + // First filter matches → Drop, not Dlq + assert_eq!( + engine.apply_inbound(br#"{"status":"drop_me"}"#), + FilterDisposition::Drop + ); + } + + #[test] + fn engine_tier2_rejected_without_opt_in() { + let rules = vec![FilterRule { + expression: r#"severity > 3 && source != "internal""#.into(), + action: FilterAction::Drop, + }]; + let result = TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Tier 2"), "Error should mention tier: {err}"); + } + + #[test] + fn engine_tier3_rejected_without_complex_opt_in() { + let tier_config = TransportFilterTierConfig { + allow_cel_filters_in: true, + ..Default::default() + }; + let rules = vec![FilterRule { + expression: r#"field.matches("^prod-.*")"#.into(), + action: FilterAction::Drop, + }]; + let result = TransportFilterEngine::new(&rules, &[], &tier_config); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Tier 3"), "Error should mention tier: {err}"); + } + + #[test] + fn engine_invalid_expression_errors() { + let rules = vec![FilterRule { + expression: "this is not valid ((( CEL".into(), + action: FilterAction::Drop, + }]; + let result = TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()); + assert!(result.is_err()); + } + + #[test] + fn engine_has_dlq_filters_detection() { + let rules = vec![FilterRule { + expression: "has(field)".into(), + action: FilterAction::Dlq, + }]; + let engine = + TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()).unwrap(); + assert!(engine.has_dlq_filters_in()); + assert!(!engine.has_dlq_filters_out()); + } + + #[test] + fn engine_msgpack_payload_passes_through() { + let rules = vec![FilterRule { + expression: "has(_table)".into(), + action: FilterAction::Drop, + }]; + let engine = + TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()).unwrap(); + // MsgPack fixmap header (0x81) + let msgpack = &[ + 0x81, 0xa6, 0x5f, 0x74, 0x61, 0x62, 0x6c, 0x65, 0xa6, 0x65, 0x76, 0x65, 0x6e, 0x74, + 0x73, + ]; + assert_eq!(engine.apply_inbound(msgpack), FilterDisposition::Pass); + } + + #[test] + fn engine_outbound_filter_independent() { + let in_rules = vec![FilterRule { + expression: "has(drop_in)".into(), + action: FilterAction::Drop, + }]; + let out_rules = vec![FilterRule { + expression: "has(drop_out)".into(), + action: FilterAction::Drop, + }]; + let engine = TransportFilterEngine::new( + &in_rules, + &out_rules, + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let payload_in = br#"{"drop_in":true}"#; + assert_eq!(engine.apply_inbound(payload_in), FilterDisposition::Drop); + assert_eq!(engine.apply_outbound(payload_in), FilterDisposition::Pass); + + let payload_out = br#"{"drop_out":true}"#; + assert_eq!(engine.apply_inbound(payload_out), FilterDisposition::Pass); + assert_eq!(engine.apply_outbound(payload_out), FilterDisposition::Drop); + } + + #[test] + fn is_likely_msgpack_detection() { + assert!(is_likely_msgpack(&[0x81])); // fixmap + assert!(is_likely_msgpack(&[0x90])); // fixarray + assert!(is_likely_msgpack(&[0xde])); // map16 + assert!(!is_likely_msgpack(b"{")); // JSON object + assert!(!is_likely_msgpack(b"[")); // JSON array + assert!(!is_likely_msgpack(b"")); // empty + } +} diff --git a/src/transport/grpc/config.rs b/src/transport/grpc/config.rs index 67e651b..6e985db 100644 --- a/src/transport/grpc/config.rs +++ b/src/transport/grpc/config.rs @@ -47,6 +47,12 @@ pub struct GrpcConfig { /// Requires the `transport-grpc-vector-compat` feature. #[cfg(feature = "transport-grpc-vector-compat")] pub vector_compat: bool, + + /// Inbound message filters (applied on recv before caller sees messages). + pub filters_in: Vec, + + /// Outbound message filters (applied on send before transport dispatches). + pub filters_out: Vec, } impl Default for GrpcConfig { @@ -60,6 +66,8 @@ impl Default for GrpcConfig { compression: false, #[cfg(feature = "transport-grpc-vector-compat")] vector_compat: false, + filters_in: Vec::new(), + filters_out: Vec::new(), } } } diff --git a/src/transport/grpc/mod.rs b/src/transport/grpc/mod.rs index 6c91f43..dfd4d23 100644 --- a/src/transport/grpc/mod.rs +++ b/src/transport/grpc/mod.rs @@ -76,6 +76,13 @@ pub struct GrpcTransport { /// In-flight send count (for metrics). #[cfg(feature = "metrics")] inflight: AtomicU64, + + /// Transport-level message filter engine. + filter_engine: super::filter::TransportFilterEngine, + + /// Buffer for messages staged to DLQ by inbound filters. + /// Drained by `take_filtered_dlq_entries()`. + filtered_dlq_buffer: parking_lot::Mutex>, } impl GrpcTransport { @@ -175,6 +182,12 @@ impl GrpcTransport { let healthy = Arc::new(AtomicBool::new(true)); + let filter_engine = super::filter::TransportFilterEngine::new( + &config.filters_in, + &config.filters_out, + &crate::transport::filter::TransportFilterTierConfig::default(), + )?; + #[cfg(feature = "health")] { let h = Arc::clone(&healthy); @@ -197,6 +210,8 @@ impl GrpcTransport { recv_timeout_ms: config.recv_timeout_ms, #[cfg(feature = "metrics")] inflight: AtomicU64::new(0), + filter_engine, + filtered_dlq_buffer: parking_lot::Mutex::new(Vec::new()), }) } } @@ -234,6 +249,15 @@ impl TransportSender for GrpcTransport { return SendResult::Fatal(TransportError::Closed); } + // Outbound filter check + if self.filter_engine.has_outbound_filters() { + match self.filter_engine.apply_outbound(payload) { + super::filter::FilterDisposition::Pass => {} + super::filter::FilterDisposition::Drop => return SendResult::Ok, + super::filter::FilterDisposition::Dlq => return SendResult::FilteredDlq, + } + } + let Some(client) = &self.client else { return SendResult::Fatal(TransportError::Config( "no endpoint configured for sending".into(), @@ -360,9 +384,33 @@ impl TransportReceiver for GrpcTransport { } } + // Apply inbound filters: drop messages, stage DLQ entries + if self.filter_engine.has_inbound_filters() { + let mut staged_dlq: Vec = Vec::new(); + messages.retain(|msg| match self.filter_engine.apply_inbound(&msg.payload) { + super::filter::FilterDisposition::Pass => true, + super::filter::FilterDisposition::Drop => false, + super::filter::FilterDisposition::Dlq => { + staged_dlq.push(super::filter::FilteredDlqEntry { + payload: msg.payload.clone(), + key: msg.key.clone(), + reason: "transport filter".to_string(), + }); + false + } + }); + if !staged_dlq.is_empty() { + self.filtered_dlq_buffer.lock().extend(staged_dlq); + } + } + Ok(messages) } + fn take_filtered_dlq_entries(&self) -> Vec { + std::mem::take(&mut *self.filtered_dlq_buffer.lock()) + } + async fn commit(&self, _tokens: &[Self::Token]) -> TransportResult<()> { // gRPC has no broker-side persistence — commit is a no-op. // Acknowledgement is implicit in the Push RPC response. diff --git a/src/transport/http.rs b/src/transport/http.rs index 264c2a1..f6ab37d 100644 --- a/src/transport/http.rs +++ b/src/transport/http.rs @@ -126,6 +126,14 @@ pub struct HttpTransportConfig { /// Receive timeout in milliseconds. Default: 100. #[serde(default = "default_recv_timeout_ms")] pub recv_timeout_ms: u64, + + /// Inbound message filters (applied on recv before caller sees messages). + #[serde(default)] + pub filters_in: Vec, + + /// Outbound message filters (applied on send before transport dispatches). + #[serde(default)] + pub filters_out: Vec, } impl Default for HttpTransportConfig { @@ -136,6 +144,8 @@ impl Default for HttpTransportConfig { recv_path: default_recv_path(), recv_buffer_size: default_buffer_size(), recv_timeout_ms: default_recv_timeout_ms(), + filters_in: Vec::new(), + filters_out: Vec::new(), } } } @@ -204,6 +214,13 @@ pub struct HttpTransport { /// Receive timeout in milliseconds (used by receive side). #[cfg(feature = "http-server")] recv_timeout_ms: u64, + + /// Transport-level message filter engine. + filter_engine: super::filter::TransportFilterEngine, + + /// Buffer for messages staged to DLQ by inbound filters. + /// Drained by `take_filtered_dlq_entries()`. + filtered_dlq_buffer: parking_lot::Mutex>, } impl HttpTransport { @@ -262,6 +279,16 @@ impl HttpTransport { "HTTP transport opened" ); + let filter_engine = super::filter::TransportFilterEngine::new( + &config.filters_in, + &config.filters_out, + &crate::transport::filter::TransportFilterTierConfig::default(), + ) + .unwrap_or_else(|e| { + tracing::warn!(error = %e, "Failed to compile transport filters, filtering disabled"); + super::filter::TransportFilterEngine::empty() + }); + let closed = Arc::new(AtomicBool::new(false)); #[cfg(feature = "health")] @@ -288,6 +315,8 @@ impl HttpTransport { closed, #[cfg(feature = "http-server")] recv_timeout_ms: config.recv_timeout_ms, + filter_engine, + filtered_dlq_buffer: parking_lot::Mutex::new(Vec::new()), }) } } @@ -395,6 +424,15 @@ impl TransportSender for HttpTransport { return SendResult::Fatal(TransportError::Closed); } + // Outbound filter check + if self.filter_engine.has_outbound_filters() { + match self.filter_engine.apply_outbound(payload) { + super::filter::FilterDisposition::Pass => {} + super::filter::FilterDisposition::Drop => return SendResult::Ok, + super::filter::FilterDisposition::Dlq => return SendResult::FilteredDlq, + } + } + let Some(base_url) = &self.endpoint else { return SendResult::Fatal(TransportError::Config( "no endpoint configured for sending".into(), @@ -532,6 +570,26 @@ impl TransportReceiver for HttpTransport { } } + // Apply inbound filters: drop messages, stage DLQ entries + if self.filter_engine.has_inbound_filters() { + let mut staged_dlq: Vec = Vec::new(); + messages.retain(|msg| match self.filter_engine.apply_inbound(&msg.payload) { + super::filter::FilterDisposition::Pass => true, + super::filter::FilterDisposition::Drop => false, + super::filter::FilterDisposition::Dlq => { + staged_dlq.push(super::filter::FilteredDlqEntry { + payload: msg.payload.clone(), + key: msg.key.clone(), + reason: "transport filter".to_string(), + }); + false + } + }); + if !staged_dlq.is_empty() { + self.filtered_dlq_buffer.lock().extend(staged_dlq); + } + } + #[cfg(feature = "logger")] if !messages.is_empty() { tracing::debug!(messages = messages.len(), "HTTP transport: batch received"); @@ -549,6 +607,10 @@ impl TransportReceiver for HttpTransport { } } + fn take_filtered_dlq_entries(&self) -> Vec { + std::mem::take(&mut *self.filtered_dlq_buffer.lock()) + } + async fn commit(&self, _tokens: &[Self::Token]) -> TransportResult<()> { // HTTP is fire-and-forget — commit is a no-op. Ok(()) @@ -740,6 +802,7 @@ mod tests { recv_path: "/custom".into(), recv_buffer_size: 5000, recv_timeout_ms: 250, + ..Default::default() }; let json = serde_json::to_string(&config).unwrap(); diff --git a/src/transport/kafka/config.rs b/src/transport/kafka/config.rs index d523244..0b0a2f0 100644 --- a/src/transport/kafka/config.rs +++ b/src/transport/kafka/config.rs @@ -473,6 +473,14 @@ pub struct KafkaConfig { #[serde(default)] #[deprecated(since = "1.3.0", note = "Use `librdkafka_overrides` instead")] pub extra_config: HashMap, + + /// Inbound message filters (applied on recv before caller sees messages). + #[serde(default)] + pub filters_in: Vec, + + /// Outbound message filters (applied on send before transport dispatches). + #[serde(default)] + pub filters_out: Vec, } fn default_topic_exclude() -> Vec { @@ -572,6 +580,8 @@ impl Default for KafkaConfig { enable_partition_eof: false, librdkafka_overrides: HashMap::new(), extra_config: HashMap::new(), + filters_in: Vec::new(), + filters_out: Vec::new(), } } } diff --git a/src/transport/kafka/mod.rs b/src/transport/kafka/mod.rs index 66f77f8..91d07e1 100644 --- a/src/transport/kafka/mod.rs +++ b/src/transport/kafka/mod.rs @@ -130,6 +130,11 @@ pub struct KafkaTransport { /// Uses parking_lot::Mutex (no poisoning, faster uncontended) since this /// is on the recv() hot path. topic_refresh: Option>, + /// Transport-level message filter engine. + filter_engine: super::filter::TransportFilterEngine, + /// Buffer for messages staged to DLQ by inbound filters. + /// Drained by `take_filtered_dlq_entries()`. + filtered_dlq_buffer: parking_lot::Mutex>, } impl KafkaTransport { @@ -287,6 +292,12 @@ impl KafkaTransport { let healthy = Arc::new(AtomicBool::new(true)); + let filter_engine = super::filter::TransportFilterEngine::new( + &config.filters_in, + &config.filters_out, + &crate::transport::filter::TransportFilterTierConfig::default(), + )?; + #[cfg(feature = "health")] { let h = Arc::clone(&healthy); @@ -308,6 +319,8 @@ impl KafkaTransport { subscribed_topics: parking_lot::RwLock::new(subscribed_topics), shutdown_token, topic_refresh, + filter_engine, + filtered_dlq_buffer: parking_lot::Mutex::new(Vec::new()), }) } @@ -345,6 +358,15 @@ impl TransportSender for KafkaTransport { return SendResult::Fatal(TransportError::Closed); } + // Outbound filter check + if self.filter_engine.has_outbound_filters() { + match self.filter_engine.apply_outbound(payload) { + super::filter::FilterDisposition::Pass => {} + super::filter::FilterDisposition::Drop => return SendResult::Ok, + super::filter::FilterDisposition::Dlq => return SendResult::FilteredDlq, + } + } + let record: FutureRecord<'_, str, [u8]> = FutureRecord::to(key).payload(payload); // Inject W3C traceparent into Kafka message headers for distributed tracing @@ -531,9 +553,33 @@ impl TransportReceiver for KafkaTransport { } } + // Apply inbound filters: drop messages, stage DLQ entries + if self.filter_engine.has_inbound_filters() { + let mut staged_dlq: Vec = Vec::new(); + messages.retain(|msg| match self.filter_engine.apply_inbound(&msg.payload) { + super::filter::FilterDisposition::Pass => true, + super::filter::FilterDisposition::Drop => false, + super::filter::FilterDisposition::Dlq => { + staged_dlq.push(super::filter::FilteredDlqEntry { + payload: msg.payload.clone(), + key: msg.key.clone(), + reason: "transport filter".to_string(), + }); + false + } + }); + if !staged_dlq.is_empty() { + self.filtered_dlq_buffer.lock().extend(staged_dlq); + } + } + Ok(messages) } + fn take_filtered_dlq_entries(&self) -> Vec { + std::mem::take(&mut *self.filtered_dlq_buffer.lock()) + } + /// Commit offsets for processed messages. /// /// Uses async commit for better throughput. The commit is batched diff --git a/src/transport/memory/mod.rs b/src/transport/memory/mod.rs index 77aad36..2b1db9b 100644 --- a/src/transport/memory/mod.rs +++ b/src/transport/memory/mod.rs @@ -49,6 +49,14 @@ pub struct MemoryConfig { /// Receive timeout in milliseconds (0 = no wait, return immediately). #[serde(default)] pub recv_timeout_ms: u64, + + /// Inbound message filters (applied on recv before caller sees messages). + #[serde(default)] + pub filters_in: Vec, + + /// Outbound message filters (applied on send before transport dispatches). + #[serde(default)] + pub filters_out: Vec, } fn default_buffer_size() -> usize { @@ -60,6 +68,8 @@ impl Default for MemoryConfig { Self { buffer_size: default_buffer_size(), recv_timeout_ms: 0, + filters_in: Vec::new(), + filters_out: Vec::new(), } } } @@ -82,6 +92,10 @@ pub struct MemoryTransport { committed_seq: AtomicU64, closed: AtomicBool, recv_timeout_ms: u64, + filter_engine: super::filter::TransportFilterEngine, + /// Buffer for messages staged to DLQ by inbound filters. + /// Drained by `take_filtered_dlq_entries()`. + filtered_dlq_buffer: parking_lot::Mutex>, } impl MemoryTransport { @@ -89,6 +103,15 @@ impl MemoryTransport { #[must_use] pub fn new(config: &MemoryConfig) -> Self { let (sender, receiver) = mpsc::channel(config.buffer_size); + let filter_engine = super::filter::TransportFilterEngine::new( + &config.filters_in, + &config.filters_out, + &crate::transport::filter::TransportFilterTierConfig::default(), + ) + .unwrap_or_else(|e| { + tracing::warn!(error = %e, "Failed to compile transport filters, filtering disabled"); + super::filter::TransportFilterEngine::empty() + }); Self { sender, receiver: tokio::sync::Mutex::new(receiver), @@ -96,6 +119,8 @@ impl MemoryTransport { committed_seq: AtomicU64::new(0), closed: AtomicBool::new(false), recv_timeout_ms: config.recv_timeout_ms, + filter_engine, + filtered_dlq_buffer: parking_lot::Mutex::new(Vec::new()), } } @@ -195,6 +220,15 @@ impl TransportSender for MemoryTransport { return SendResult::Fatal(TransportError::Closed); } + // Outbound filter check + if self.filter_engine.has_outbound_filters() { + match self.filter_engine.apply_outbound(payload) { + super::filter::FilterDisposition::Pass => {} + super::filter::FilterDisposition::Drop => return SendResult::Ok, + super::filter::FilterDisposition::Dlq => return SendResult::FilteredDlq, + } + } + let seq = self.sequence.fetch_add(1, Ordering::Relaxed); let timestamp_ms = chrono::Utc::now().timestamp_millis(); @@ -263,9 +297,33 @@ impl TransportReceiver for MemoryTransport { } } + // Apply inbound filters: drop messages, stage DLQ entries + if self.filter_engine.has_inbound_filters() { + let mut staged_dlq: Vec = Vec::new(); + messages.retain(|msg| match self.filter_engine.apply_inbound(&msg.payload) { + super::filter::FilterDisposition::Pass => true, + super::filter::FilterDisposition::Drop => false, + super::filter::FilterDisposition::Dlq => { + staged_dlq.push(super::filter::FilteredDlqEntry { + payload: msg.payload.clone(), + key: msg.key.clone(), + reason: "transport filter".to_string(), + }); + false + } + }); + if !staged_dlq.is_empty() { + self.filtered_dlq_buffer.lock().extend(staged_dlq); + } + } + Ok(messages) } + fn take_filtered_dlq_entries(&self) -> Vec { + std::mem::take(&mut *self.filtered_dlq_buffer.lock()) + } + async fn commit(&self, tokens: &[Self::Token]) -> TransportResult<()> { if let Some(max_seq) = tokens.iter().map(|t| t.seq).max() { let _ = self.committed_seq.fetch_max(max_seq, Ordering::Relaxed); @@ -352,6 +410,7 @@ mod tests { let config = MemoryConfig { buffer_size: 1, recv_timeout_ms: 0, + ..Default::default() }; let transport = MemoryTransport::new(&config); diff --git a/src/transport/mod.rs b/src/transport/mod.rs index 9d3dcb0..bc1e62a 100644 --- a/src/transport/mod.rs +++ b/src/transport/mod.rs @@ -52,6 +52,7 @@ mod detect; mod error; pub mod factory; +pub mod filter; mod payload; pub mod propagation; mod traits; diff --git a/src/transport/pipe.rs b/src/transport/pipe.rs index dac8bde..01d6348 100644 --- a/src/transport/pipe.rs +++ b/src/transport/pipe.rs @@ -58,6 +58,14 @@ pub struct PipeTransportConfig { /// Receive timeout in milliseconds (0 = block until data). Default: 100. #[serde(default = "default_recv_timeout_ms")] pub recv_timeout_ms: u64, + + /// Inbound message filters (applied on recv before caller sees messages). + #[serde(default)] + pub filters_in: Vec, + + /// Outbound message filters (applied on send before transport dispatches). + #[serde(default)] + pub filters_out: Vec, } fn default_recv_timeout_ms() -> u64 { @@ -68,6 +76,8 @@ impl Default for PipeTransportConfig { fn default() -> Self { Self { recv_timeout_ms: default_recv_timeout_ms(), + filters_in: Vec::new(), + filters_out: Vec::new(), } } } @@ -99,6 +109,10 @@ pub struct PipeTransport { sequence: AtomicU64, closed: Arc, recv_timeout_ms: u64, + filter_engine: super::filter::TransportFilterEngine, + /// Buffer for messages staged to DLQ by inbound filters. + /// Drained by `take_filtered_dlq_entries()`. + filtered_dlq_buffer: parking_lot::Mutex>, } impl PipeTransport { @@ -111,6 +125,16 @@ impl PipeTransport { "Pipe transport opened" ); + let filter_engine = super::filter::TransportFilterEngine::new( + &config.filters_in, + &config.filters_out, + &crate::transport::filter::TransportFilterTierConfig::default(), + ) + .unwrap_or_else(|e| { + tracing::warn!(error = %e, "Failed to compile transport filters, filtering disabled"); + super::filter::TransportFilterEngine::empty() + }); + let closed = Arc::new(AtomicBool::new(false)); #[cfg(feature = "health")] @@ -131,6 +155,8 @@ impl PipeTransport { sequence: AtomicU64::new(0), closed, recv_timeout_ms: config.recv_timeout_ms, + filter_engine, + filtered_dlq_buffer: parking_lot::Mutex::new(Vec::new()), } } } @@ -164,6 +190,15 @@ impl TransportSender for PipeTransport { return SendResult::Fatal(TransportError::Closed); } + // Outbound filter check + if self.filter_engine.has_outbound_filters() { + match self.filter_engine.apply_outbound(payload) { + super::filter::FilterDisposition::Pass => {} + super::filter::FilterDisposition::Drop => return SendResult::Ok, + super::filter::FilterDisposition::Dlq => return SendResult::FilteredDlq, + } + } + let mut stdout = self.stdout.lock().await; // Write payload + newline @@ -272,6 +307,26 @@ impl TransportReceiver for PipeTransport { } } + // Apply inbound filters: drop messages, stage DLQ entries + if self.filter_engine.has_inbound_filters() { + let mut staged_dlq: Vec = Vec::new(); + messages.retain(|msg| match self.filter_engine.apply_inbound(&msg.payload) { + super::filter::FilterDisposition::Pass => true, + super::filter::FilterDisposition::Drop => false, + super::filter::FilterDisposition::Dlq => { + staged_dlq.push(super::filter::FilteredDlqEntry { + payload: msg.payload.clone(), + key: msg.key.clone(), + reason: "transport filter".to_string(), + }); + false + } + }); + if !staged_dlq.is_empty() { + self.filtered_dlq_buffer.lock().extend(staged_dlq); + } + } + #[cfg(feature = "logger")] if !messages.is_empty() { tracing::debug!( @@ -283,6 +338,10 @@ impl TransportReceiver for PipeTransport { Ok(messages) } + fn take_filtered_dlq_entries(&self) -> Vec { + std::mem::take(&mut *self.filtered_dlq_buffer.lock()) + } + async fn commit(&self, _tokens: &[Self::Token]) -> TransportResult<()> { // No-op: stdin is a forward-only stream, cannot rewind or acknowledge Ok(()) @@ -322,6 +381,7 @@ mod tests { fn config_serde_roundtrip() { let config = PipeTransportConfig { recv_timeout_ms: 500, + ..Default::default() }; let json = serde_json::to_string(&config).unwrap(); let parsed: PipeTransportConfig = serde_json::from_str(&json).unwrap(); diff --git a/src/transport/redis_transport.rs b/src/transport/redis_transport.rs index c8086e9..fe26599 100644 --- a/src/transport/redis_transport.rs +++ b/src/transport/redis_transport.rs @@ -117,6 +117,14 @@ pub struct RedisTransportConfig { /// Block timeout in milliseconds for `XREADGROUP`. Default: 5000. #[serde(default = "default_block_ms")] pub block_ms: usize, + + /// Inbound message filters (applied on recv before caller sees messages). + #[serde(default)] + pub filters_in: Vec, + + /// Outbound message filters (applied on send before transport dispatches). + #[serde(default)] + pub filters_out: Vec, } impl Default for RedisTransportConfig { @@ -128,6 +136,8 @@ impl Default for RedisTransportConfig { consumer: default_consumer(), max_stream_len: None, block_ms: default_block_ms(), + filters_in: Vec::new(), + filters_out: Vec::new(), } } } @@ -158,6 +168,11 @@ pub struct RedisTransport { closed: Arc, /// Whether the consumer group has been ensured for a given stream. group_created: Mutex>, + /// Transport-level message filter engine. + filter_engine: super::filter::TransportFilterEngine, + /// Buffer for messages staged to DLQ by inbound filters. + /// Drained by `take_filtered_dlq_entries()`. + filtered_dlq_buffer: parking_lot::Mutex>, } impl RedisTransport { @@ -192,6 +207,12 @@ impl RedisTransport { "Redis transport opened" ); + let filter_engine = super::filter::TransportFilterEngine::new( + &config.filters_in, + &config.filters_out, + &crate::transport::filter::TransportFilterTierConfig::default(), + )?; + let closed = Arc::new(AtomicBool::new(false)); #[cfg(feature = "health")] @@ -211,6 +232,8 @@ impl RedisTransport { config: config.clone(), closed, group_created: Mutex::new(std::collections::HashSet::new()), + filter_engine, + filtered_dlq_buffer: parking_lot::Mutex::new(Vec::new()), }) } @@ -284,6 +307,15 @@ impl TransportSender for RedisTransport { return SendResult::Fatal(TransportError::Closed); } + // Outbound filter check + if self.filter_engine.has_outbound_filters() { + match self.filter_engine.apply_outbound(payload) { + super::filter::FilterDisposition::Pass => {} + super::filter::FilterDisposition::Drop => return SendResult::Ok, + super::filter::FilterDisposition::Dlq => return SendResult::FilteredDlq, + } + } + let stream = match self.resolve_stream(key) { Ok(s) => s.to_string(), Err(e) => return SendResult::Fatal(e), @@ -388,6 +420,26 @@ impl TransportReceiver for RedisTransport { } } + // Apply inbound filters: drop messages, stage DLQ entries + if self.filter_engine.has_inbound_filters() { + let mut staged_dlq: Vec = Vec::new(); + messages.retain(|msg| match self.filter_engine.apply_inbound(&msg.payload) { + super::filter::FilterDisposition::Pass => true, + super::filter::FilterDisposition::Drop => false, + super::filter::FilterDisposition::Dlq => { + staged_dlq.push(super::filter::FilteredDlqEntry { + payload: msg.payload.clone(), + key: msg.key.clone(), + reason: "transport filter".to_string(), + }); + false + } + }); + if !staged_dlq.is_empty() { + self.filtered_dlq_buffer.lock().extend(staged_dlq); + } + } + #[cfg(feature = "logger")] if !messages.is_empty() { tracing::debug!( @@ -405,6 +457,10 @@ impl TransportReceiver for RedisTransport { Ok(messages) } + fn take_filtered_dlq_entries(&self) -> Vec { + std::mem::take(&mut *self.filtered_dlq_buffer.lock()) + } + async fn commit(&self, tokens: &[Self::Token]) -> TransportResult<()> { if tokens.is_empty() { return Ok(()); @@ -584,6 +640,7 @@ block_ms: 2000 consumer: consumer.into(), max_stream_len: Some(1000), block_ms: 1000, + ..Default::default() }; let transport = RedisTransport::new(&config).await.unwrap(); diff --git a/src/transport/traits.rs b/src/transport/traits.rs index 6cf0105..4fc83a0 100644 --- a/src/transport/traits.rs +++ b/src/transport/traits.rs @@ -7,6 +7,7 @@ // Copyright: (c) 2026 HYPERI PTY LIMITED use super::error::TransportResult; +use super::filter::FilteredDlqEntry; use super::types::{Message, SendResult}; use std::fmt::{Debug, Display}; use std::future::Future; @@ -70,6 +71,13 @@ pub trait TransportReceiver: TransportBase { /// /// Returns immediately with available messages (may be fewer than `max`). /// Returns empty vec if no messages are available. + /// + /// **Filter behaviour:** if the transport has inbound filters configured, + /// `recv()` removes messages that match `action: drop` filters and stages + /// messages matching `action: dlq` filters into an internal queue. Use + /// [`take_filtered_dlq_entries`](Self::take_filtered_dlq_entries) after + /// each `recv()` call to retrieve the staged DLQ entries and route them + /// via your DLQ handle. fn recv( &self, max: usize, @@ -83,6 +91,29 @@ pub trait TransportReceiver: TransportBase { /// - File: advances read position /// - Memory: advances internal sequence fn commit(&self, tokens: &[Self::Token]) -> impl Future> + Send; + + /// Drain DLQ entries staged by inbound filtering. + /// + /// When a transport's inbound filters classify messages as `action: dlq`, + /// the messages are removed from the `recv()` result and staged in an + /// internal queue. Call this method after each `recv()` to drain the + /// staged entries and route them to your DLQ. + /// + /// Default implementation returns an empty vec — transports without + /// filter support don't need to override this. + /// + /// # Example + /// + /// ```rust,ignore + /// let messages = transport.recv(100).await?; + /// for entry in transport.take_filtered_dlq_entries() { + /// dlq.send(DlqEntry::new("filter", entry.reason, entry.payload)).await?; + /// } + /// // Process passing messages... + /// ``` + fn take_filtered_dlq_entries(&self) -> Vec { + Vec::new() + } } /// Combined transport — implements both send and receive. diff --git a/src/transport/types.rs b/src/transport/types.rs index 0de11b2..9129397 100644 --- a/src/transport/types.rs +++ b/src/transport/types.rs @@ -141,6 +141,9 @@ pub enum SendResult { Backpressured, /// Fatal error, cannot continue. Fatal(TransportError), + /// Message matched an outbound filter with `action: dlq`. + /// Caller is responsible for DLQ routing. + FilteredDlq, } impl SendResult { @@ -161,6 +164,12 @@ impl SendResult { pub fn is_fatal(&self) -> bool { matches!(self, Self::Fatal(_)) } + + /// Returns true if filtered for DLQ routing. + #[must_use] + pub fn is_filtered_dlq(&self) -> bool { + matches!(self, Self::FilteredDlq) + } } /// Top-level transport configuration. diff --git a/tests/engine_integration.rs b/tests/engine_integration.rs index 0ae196e..f057a26 100644 --- a/tests/engine_integration.rs +++ b/tests/engine_integration.rs @@ -25,6 +25,7 @@ fn make_transport() -> MemoryTransport { MemoryTransport::new(&MemoryConfig { buffer_size: 1000, recv_timeout_ms: 0, + ..Default::default() }) } diff --git a/tests/fixtures/cel_classifier_parity.json b/tests/fixtures/cel_classifier_parity.json new file mode 100644 index 0000000..56d9a2a --- /dev/null +++ b/tests/fixtures/cel_classifier_parity.json @@ -0,0 +1,31 @@ +{ + "_comment": "Shared test corpus for the CEL classifier. The Python classifier in dfe_engine.cel.classify and the Rust classifier in hyperi-rustlib src/transport/filter/classify.rs MUST agree on every entry. A copy of this file lives in /projects/hyperi-rustlib/tests/fixtures/cel_classifier_parity.json — keep them identical.", + "_columns": "expression: input. tier: 1|2|3. op_kind: optional Tier 1 op kind. op_field: optional Tier 1 field. op_value: optional Tier 1 comparison value. fields: optional sorted list of field references for Tier 2/3.", + "cases": [ + { "expression": "has(_table)", "tier": 1, "op_kind": "field_exists", "op_field": "_table" }, + { "expression": "has(metadata.source)", "tier": 1, "op_kind": "field_exists", "op_field": "metadata.source" }, + { "expression": " has( _internal ) ", "tier": 1, "op_kind": "field_exists", "op_field": "_internal" }, + { "expression": "!has(_internal)", "tier": 1, "op_kind": "field_not_exists", "op_field": "_internal" }, + { "expression": "! has( _table )", "tier": 1, "op_kind": "field_not_exists", "op_field": "_table" }, + { "expression": "source == \"internal\"", "tier": 1, "op_kind": "field_equals", "op_field": "source", "op_value": "internal" }, + { "expression": "source != \"external\"", "tier": 1, "op_kind": "field_not_equals", "op_field": "source", "op_value": "external" }, + { "expression": "host.startsWith(\"prod-\")", "tier": 1, "op_kind": "field_starts_with", "op_field": "host", "op_value": "prod-" }, + { "expression": "host.endsWith(\".internal\")", "tier": 1, "op_kind": "field_ends_with", "op_field": "host", "op_value": ".internal" }, + { "expression": "message.contains(\"error\")", "tier": 1, "op_kind": "field_contains", "op_field": "message", "op_value": "error" }, + { "expression": "metadata.host.startsWith(\"db-\")", "tier": 1, "op_kind": "field_starts_with", "op_field": "metadata.host", "op_value": "db-" }, + + { "expression": "severity > 3", "tier": 2, "fields": ["severity"] }, + { "expression": "severity > 3 && source != \"internal\"", "tier": 2, "fields": ["severity", "source"] }, + { "expression": "size(tags) > 0", "tier": 2, "fields": ["tags"] }, + { "expression": "user.role == admin_role", "tier": 2, "fields": ["user.role", "admin_role"] }, + { "expression": "count >= 10 || count <= 1", "tier": 2, "fields": ["count"] }, + { "expression": "field == \"value with == in it\"", "tier": 1, "op_kind": "field_equals", "op_field": "field", "op_value": "value with == in it" }, + + { "expression": "host.matches(\"^prod-.*$\")", "tier": 3, "fields": ["host"] }, + { "expression": "tags.exists(t, t == \"hot\")", "tier": 3, "fields": ["tags", "t"] }, + { "expression": "items.all(i, i.size > 0)", "tier": 3, "fields": ["i", "i.size", "items"] }, + { "expression": "tags.filter(t, t.startsWith(\"prod\")).size() > 0", "tier": 3, "fields": ["tags", "t"] }, + { "expression": "timestamp(created_at) > timestamp(\"2026-01-01\")", "tier": 3, "fields": ["created_at"] }, + { "expression": "duration(\"5m\") > duration(\"1m\")", "tier": 3, "fields": [] } + ] +} diff --git a/tests/transport_filter.rs b/tests/transport_filter.rs new file mode 100644 index 0000000..5b8ba8d --- /dev/null +++ b/tests/transport_filter.rs @@ -0,0 +1,1426 @@ +// Project: hyperi-rustlib +// File: tests/transport_filter.rs +// Purpose: Integration + adversarial tests for transport filter engine +// Language: Rust +// +// License: FSL-1.1-ALv2 +// Copyright: (c) 2026 HYPERI PTY LIMITED + +//! Transport filter engine integration tests. +//! +//! Tests the full round-trip: configure filters on MemoryTransport, inject +//! messages, verify filtering behaviour in recv() and send(). +//! +//! Includes: +//! - Expected failures (invalid CEL, tier rejections, DLQ without config) +//! - Sample data from real DFE pipelines +//! - Adversarial inputs (binary garbage, truncated JSON, Unicode, 1MB payloads) + +#![cfg(feature = "transport-memory")] + +use hyperi_rustlib::transport::filter::{ + FilterAction, FilterDisposition, FilterRule, TransportFilterEngine, TransportFilterTierConfig, +}; +use hyperi_rustlib::transport::memory::{MemoryConfig, MemoryTransport}; +use hyperi_rustlib::transport::{TransportReceiver, TransportSender}; + +// ============================================================================ +// Helper: build a MemoryTransport with inbound filters +// ============================================================================ + +fn transport_with_inbound_filters(filters: Vec) -> MemoryTransport { + MemoryTransport::new(&MemoryConfig { + buffer_size: 1000, + recv_timeout_ms: 50, + filters_in: filters, + filters_out: Vec::new(), + }) +} + +fn transport_with_outbound_filters(filters: Vec) -> MemoryTransport { + MemoryTransport::new(&MemoryConfig { + buffer_size: 1000, + recv_timeout_ms: 50, + filters_in: Vec::new(), + filters_out: filters, + }) +} + +fn transport_no_filters() -> MemoryTransport { + MemoryTransport::new(&MemoryConfig { + buffer_size: 1000, + recv_timeout_ms: 50, + ..Default::default() + }) +} + +// ============================================================================ +// Section 1: MemoryTransport Round-Trip Integration Tests +// ============================================================================ + +#[tokio::test] +async fn inbound_filter_drops_matching_messages() { + let transport = transport_with_inbound_filters(vec![FilterRule { + expression: r#"status == "poison""#.into(), + action: FilterAction::Drop, + }]); + + // Inject 5 messages: 2 poison, 3 healthy + transport + .inject(None, br#"{"status":"ok","id":1}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"status":"poison","id":2}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"status":"ok","id":3}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"status":"poison","id":4}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"status":"ok","id":5}"#.to_vec()) + .await + .unwrap(); + + let messages = transport.recv(10).await.unwrap(); + assert_eq!( + messages.len(), + 3, + "Should receive 3 messages (2 poison dropped)" + ); +} + +#[tokio::test] +async fn inbound_filter_dlq_removes_from_batch() { + let transport = transport_with_inbound_filters(vec![FilterRule { + expression: "has(_internal)".into(), + action: FilterAction::Dlq, + }]); + + transport + .inject(None, br#"{"data":"keep"}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"_internal":true,"data":"dlq"}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"data":"also_keep"}"#.to_vec()) + .await + .unwrap(); + + let messages = transport.recv(10).await.unwrap(); + assert_eq!( + messages.len(), + 2, + "DLQ message should be removed from batch" + ); +} + +#[tokio::test] +async fn outbound_filter_blocks_send() { + let transport = transport_with_outbound_filters(vec![FilterRule { + expression: "has(debug)".into(), + action: FilterAction::Drop, + }]); + + // Send a debug message — should be silently dropped + let result = transport + .send("topic", br#"{"debug":true,"msg":"test"}"#) + .await; + assert!( + result.is_ok(), + "Filtered send should return Ok (silent drop)" + ); + + // Send a normal message — should go through + let result = transport.send("topic", br#"{"msg":"normal"}"#).await; + assert!(result.is_ok()); + + // Only the normal message should be receivable + let messages = transport.recv(10).await.unwrap(); + assert_eq!( + messages.len(), + 1, + "Only non-filtered message should be received" + ); +} + +#[tokio::test] +async fn outbound_filter_dlq_returns_filtered_dlq() { + let transport = transport_with_outbound_filters(vec![FilterRule { + expression: r#"status == "bad""#.into(), + action: FilterAction::Dlq, + }]); + + let result = transport + .send("topic", br#"{"status":"bad","data":"x"}"#) + .await; + assert!( + result.is_filtered_dlq(), + "DLQ filter should return FilteredDlq" + ); + + let result = transport + .send("topic", br#"{"status":"good","data":"x"}"#) + .await; + assert!(result.is_ok(), "Non-matching message should send normally"); +} + +#[tokio::test] +async fn no_filters_passthrough() { + let transport = transport_no_filters(); + + for i in 0..10 { + transport + .inject(None, format!(r#"{{"id":{i}}}"#).into_bytes()) + .await + .unwrap(); + } + + let messages = transport.recv(20).await.unwrap(); + assert_eq!( + messages.len(), + 10, + "All messages should pass through with no filters" + ); +} + +#[tokio::test] +async fn first_match_wins_integration() { + let transport = transport_with_inbound_filters(vec![ + FilterRule { + expression: r#"status == "a""#.into(), + action: FilterAction::Drop, + }, + FilterRule { + expression: r#"status == "b""#.into(), + action: FilterAction::Dlq, + }, + FilterRule { + expression: "has(status)".into(), + action: FilterAction::Drop, + }, + ]); + + transport + .inject(None, br#"{"status":"a"}"#.to_vec()) + .await + .unwrap(); // matches filter 0 → drop + transport + .inject(None, br#"{"status":"b"}"#.to_vec()) + .await + .unwrap(); // matches filter 1 → dlq + transport + .inject(None, br#"{"status":"c"}"#.to_vec()) + .await + .unwrap(); // matches filter 2 → drop + transport + .inject(None, br#"{"no_status":true}"#.to_vec()) + .await + .unwrap(); // matches nothing → pass + + let messages = transport.recv(10).await.unwrap(); + assert_eq!(messages.len(), 1, "Only the no-status message should pass"); +} + +#[tokio::test] +async fn mixed_tier1_filters() { + let transport = transport_with_inbound_filters(vec![ + FilterRule { + expression: "has(_internal)".into(), + action: FilterAction::Drop, + }, + FilterRule { + expression: r#"source == "test""#.into(), + action: FilterAction::Drop, + }, + FilterRule { + expression: r#"host.startsWith("debug-")"#.into(), + action: FilterAction::Drop, + }, + ]); + + transport + .inject(None, br#"{"_internal":true}"#.to_vec()) + .await + .unwrap(); // drop (exists) + transport + .inject(None, br#"{"source":"test"}"#.to_vec()) + .await + .unwrap(); // drop (equals) + transport + .inject(None, br#"{"host":"debug-web01"}"#.to_vec()) + .await + .unwrap(); // drop (startsWith) + transport + .inject(None, br#"{"host":"prod-web01","source":"live"}"#.to_vec()) + .await + .unwrap(); // pass + + let messages = transport.recv(10).await.unwrap(); + assert_eq!(messages.len(), 1); +} + +// ============================================================================ +// Section 2: Expected Failure Tests +// ============================================================================ + +#[test] +fn expected_fail_tier2_without_opt_in() { + let rules = vec![FilterRule { + expression: r#"severity > 3 && source != "internal""#.into(), + action: FilterAction::Drop, + }]; + let result = TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()); + assert!(result.is_err(), "Tier 2 should be rejected without opt-in"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Tier 2"), "Error should mention tier: {err}"); + assert!( + err.contains("allow_cel_filters_in"), + "Error should mention config to enable: {err}" + ); +} + +#[test] +fn expected_fail_tier3_without_complex_opt_in() { + let tier_config = TransportFilterTierConfig { + allow_cel_filters_in: true, // Tier 2 enabled, but NOT Tier 3 + ..Default::default() + }; + let rules = vec![FilterRule { + expression: r#"field.matches("^prod-.*")"#.into(), + action: FilterAction::Drop, + }]; + let result = TransportFilterEngine::new(&rules, &[], &tier_config); + assert!( + result.is_err(), + "Tier 3 should be rejected without complex opt-in" + ); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Tier 3"), "Error should mention tier: {err}"); +} + +#[test] +fn expected_fail_tier3_iteration_blocked() { + let tier_config = TransportFilterTierConfig { + allow_cel_filters_in: true, + ..Default::default() + }; + let rules = vec![FilterRule { + expression: r#"tags.exists(t, t == "pii")"#.into(), + action: FilterAction::Dlq, + }]; + let result = TransportFilterEngine::new(&rules, &[], &tier_config); + assert!(result.is_err(), "Iteration should be Tier 3"); +} + +#[test] +fn expected_fail_invalid_cel_syntax() { + let rules = vec![FilterRule { + expression: "this is not valid ((( CEL syntax )))".into(), + action: FilterAction::Drop, + }]; + let result = TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()); + assert!(result.is_err(), "Invalid CEL should fail at construction"); +} + +#[test] +fn expected_fail_empty_expression() { + let rules = vec![FilterRule { + expression: String::new(), + action: FilterAction::Drop, + }]; + let result = TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()); + assert!(result.is_err(), "Empty expression should fail"); +} + +#[test] +fn expected_fail_whitespace_only_expression() { + let rules = vec![FilterRule { + expression: " ".into(), + action: FilterAction::Drop, + }]; + let result = TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()); + assert!(result.is_err(), "Whitespace-only expression should fail"); +} + +// ============================================================================ +// Section 3: Sample Data Tests (real DFE pipeline payloads) +// ============================================================================ + +#[test] +fn sample_data_syslog_event() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"source_type == "syslog""#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let syslog_event = br#"{"source_type":"syslog","host":"web01","facility":"auth","severity":6,"message":"user login","_timestamp":"2026-01-01T00:00:00Z"}"#; + assert_eq!(engine.apply_inbound(syslog_event), FilterDisposition::Drop); + + let windows_event = + br#"{"source_type":"windows_event","host":"dc01","event_id":4624,"message":"logon"}"#; + assert_eq!(engine.apply_inbound(windows_event), FilterDisposition::Pass); +} + +#[test] +fn sample_data_nested_cloud_event() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"metadata.source == "aws""#.into(), + action: FilterAction::Dlq, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let aws_event = br#"{"metadata":{"source":"aws","region":"ap-southeast-2"},"event_type":"cloudtrail","data":{"user":"admin"}}"#; + assert_eq!(engine.apply_inbound(aws_event), FilterDisposition::Dlq); + + let azure_event = + br#"{"metadata":{"source":"azure","tenant":"contoso"},"event_type":"activity_log"}"#; + assert_eq!(engine.apply_inbound(azure_event), FilterDisposition::Pass); +} + +#[test] +fn sample_data_dfe_loader_routing() { + // Loader uses _table field for routing — filter out internal tables + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"_table.startsWith("_internal")"#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let internal = br#"{"_table":"_internal_metrics","value":42}"#; + assert_eq!(engine.apply_inbound(internal), FilterDisposition::Drop); + + let normal = br#"{"_table":"auth_events","user":"admin","action":"login"}"#; + assert_eq!(engine.apply_inbound(normal), FilterDisposition::Pass); +} + +#[test] +fn sample_data_receiver_poison_message() { + let engine = TransportFilterEngine::new( + &[ + FilterRule { + expression: r#"status == "poison""#.into(), + action: FilterAction::Dlq, + }, + FilterRule { + expression: "!has(_table)".into(), + action: FilterAction::Dlq, + }, + ], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let poison = br#"{"status":"poison","_table":"events","data":"bad"}"#; + assert_eq!(engine.apply_inbound(poison), FilterDisposition::Dlq); + + let no_table = br#"{"status":"ok","data":"missing routing field"}"#; + assert_eq!(engine.apply_inbound(no_table), FilterDisposition::Dlq); + + let valid = br#"{"status":"ok","_table":"events","data":"good"}"#; + assert_eq!(engine.apply_inbound(valid), FilterDisposition::Pass); +} + +#[test] +fn sample_data_fetcher_debug_filter() { + // Fetcher outbound: don't send debug events downstream + let engine = TransportFilterEngine::new( + &[], + &[FilterRule { + expression: "has(debug)".into(), + action: FilterAction::Drop, + }], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let debug_event = br#"{"debug":true,"source":"aws","data":"test"}"#; + assert_eq!(engine.apply_outbound(debug_event), FilterDisposition::Drop); + + let real_event = br#"{"source":"aws","event_type":"cloudtrail","data":{"user":"admin"}}"#; + assert_eq!(engine.apply_outbound(real_event), FilterDisposition::Pass); +} + +// ============================================================================ +// Section 4: Adversarial Tests +// ============================================================================ + +#[test] +fn adversarial_binary_garbage() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(_table)".into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let garbage: &[u8] = &[0xFF, 0xFE, 0x00, 0x01, 0x02, 0xAB, 0xCD]; + assert_eq!(engine.apply_inbound(garbage), FilterDisposition::Pass); +} + +#[test] +fn adversarial_truncated_json() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(_table)".into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + // Truncated JSON with `"_table":` pattern: the memmem fast-path detects + // the field exists pattern, so it matches the filter (Drop action). + // This is safe filtering behaviour — broken JSON containing the field + // pattern is treated as if the field exists. + assert_eq!( + engine.apply_inbound(br#"{"_table":"ev"#), + FilterDisposition::Drop + ); + // Truncated before the colon: pattern not present, no match + assert_eq!( + engine.apply_inbound(br#"{"_table"#), + FilterDisposition::Pass + ); + assert_eq!(engine.apply_inbound(br#"{"#), FilterDisposition::Pass); +} + +#[test] +fn adversarial_empty_payload() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(anything)".into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + assert_eq!(engine.apply_inbound(b""), FilterDisposition::Pass); +} + +#[test] +fn adversarial_json_null() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(field)".into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + assert_eq!(engine.apply_inbound(b"null"), FilterDisposition::Pass); +} + +#[test] +fn adversarial_json_array() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(field)".into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + assert_eq!(engine.apply_inbound(b"[1,2,3]"), FilterDisposition::Pass); +} + +#[test] +fn adversarial_json_string() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(field)".into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + assert_eq!( + engine.apply_inbound(br#""just a string""#), + FilterDisposition::Pass + ); +} + +#[test] +fn adversarial_cel_chars_in_value() { + // Field value contains characters that look like CEL operators + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"field == "value with == and && in it""#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let payload = br#"{"field":"value with == and && in it"}"#; + assert_eq!(engine.apply_inbound(payload), FilterDisposition::Drop); + + let other = br#"{"field":"normal value"}"#; + assert_eq!(engine.apply_inbound(other), FilterDisposition::Pass); +} + +#[test] +fn adversarial_large_payload_1mb() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"status == "poison""#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + // 1MB payload with the poison field buried at the start + let mut payload = br#"{"status":"poison","data":""#.to_vec(); + payload.extend(vec![b'x'; 1_000_000]); + payload.extend(br#""}"#); + + assert_eq!(engine.apply_inbound(&payload), FilterDisposition::Drop); +} + +#[test] +fn adversarial_many_filters() { + let rules: Vec = (0..100) + .map(|i| FilterRule { + expression: format!(r#"field_{i} == "value_{i}""#), + action: FilterAction::Drop, + }) + .collect(); + + let engine = + TransportFilterEngine::new(&rules, &[], &TransportFilterTierConfig::default()).unwrap(); + + // Message matching filter 99 (last one) + let payload = br#"{"field_99":"value_99"}"#; + assert_eq!(engine.apply_inbound(payload), FilterDisposition::Drop); + + // Message matching nothing + let payload = br#"{"field_999":"value_999"}"#; + assert_eq!(engine.apply_inbound(payload), FilterDisposition::Pass); +} + +#[test] +fn adversarial_missing_field_no_error() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"nonexistent_field == "value""#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let payload = br#"{"other":"data"}"#; + // Field missing → no match (not error) + assert_eq!(engine.apply_inbound(payload), FilterDisposition::Pass); +} + +#[test] +fn adversarial_unicode_field_names() { + // Note: sonic_rs handles UTF-8 field names correctly + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"has(name)"#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let payload = br#"{"name":"\u00e9v\u00e9nement","id":1}"#; + assert_eq!(engine.apply_inbound(payload), FilterDisposition::Drop); +} + +#[test] +fn adversarial_deeply_nested_path() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"a.b.c.d == "deep""#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + let payload = br#"{"a":{"b":{"c":{"d":"deep"}}}}"#; + assert_eq!(engine.apply_inbound(payload), FilterDisposition::Drop); + + let shallow = br#"{"a":{"b":"leaf"}}"#; + assert_eq!(engine.apply_inbound(shallow), FilterDisposition::Pass); +} + +#[test] +fn adversarial_msgpack_bypasses_filters() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(_table)".into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + // MsgPack fixmap with _table key — should bypass filter (not crash) + let msgpack = &[ + 0x81, 0xa6, 0x5f, 0x74, 0x61, 0x62, 0x6c, 0x65, 0xa6, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, + ]; + assert_eq!(engine.apply_inbound(msgpack), FilterDisposition::Pass); +} + +// ============================================================================ +// Section 5: Engine API Tests +// ============================================================================ + +#[test] +fn engine_empty_has_no_overhead() { + let engine = TransportFilterEngine::empty(); + assert!(!engine.has_inbound_filters()); + assert!(!engine.has_outbound_filters()); + assert!(!engine.has_dlq_filters_in()); + assert!(!engine.has_dlq_filters_out()); + assert_eq!( + engine.apply_inbound(br#"{"any":"thing"}"#), + FilterDisposition::Pass + ); +} + +#[test] +fn engine_direction_independence() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(in_only)".into(), + action: FilterAction::Drop, + }], + &[FilterRule { + expression: "has(out_only)".into(), + action: FilterAction::Drop, + }], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + assert!(engine.has_inbound_filters()); + assert!(engine.has_outbound_filters()); + + // in_only field: dropped inbound, passes outbound + let payload = br#"{"in_only":true}"#; + assert_eq!(engine.apply_inbound(payload), FilterDisposition::Drop); + assert_eq!(engine.apply_outbound(payload), FilterDisposition::Pass); + + // out_only field: passes inbound, dropped outbound + let payload = br#"{"out_only":true}"#; + assert_eq!(engine.apply_inbound(payload), FilterDisposition::Pass); + assert_eq!(engine.apply_outbound(payload), FilterDisposition::Drop); +} + +#[test] +fn engine_dlq_filter_detection() { + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(x)".into(), + action: FilterAction::Dlq, + }], + &[FilterRule { + expression: "has(y)".into(), + action: FilterAction::Drop, + }], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + assert!(engine.has_dlq_filters_in()); + assert!(!engine.has_dlq_filters_out()); +} + +#[test] +fn config_deserializes_from_yaml() { + let yaml = r#" +filters_in: + - expression: 'has(_table)' + action: drop + - expression: 'status == "poison"' + action: dlq +filters_out: + - expression: 'has(debug)' + action: drop +"#; + + #[derive(serde::Deserialize)] + struct TestConfig { + #[serde(default)] + filters_in: Vec, + #[serde(default)] + filters_out: Vec, + } + + let config: TestConfig = serde_yaml_ng::from_str(yaml).unwrap(); + assert_eq!(config.filters_in.len(), 2); + assert_eq!(config.filters_out.len(), 1); + assert_eq!(config.filters_in[0].expression, "has(_table)"); + assert_eq!(config.filters_in[0].action, FilterAction::Drop); + assert_eq!(config.filters_in[1].action, FilterAction::Dlq); +} + +#[test] +fn tier_config_deserializes_from_yaml() { + let yaml = r#" +allow_cel_filters_in: true +allow_complex_filters_out: true +"#; + let config: TransportFilterTierConfig = serde_yaml_ng::from_str(yaml).unwrap(); + assert!(config.allow_cel_filters_in); + assert!(!config.allow_cel_filters_out); + assert!(!config.allow_complex_filters_in); + assert!(config.allow_complex_filters_out); +} + +#[test] +fn empty_filters_config_deserializes() { + let yaml = "{}"; + #[derive(serde::Deserialize)] + struct TestConfig { + #[serde(default)] + filters_in: Vec, + } + let config: TestConfig = serde_yaml_ng::from_str(yaml).unwrap(); + assert!(config.filters_in.is_empty()); +} + +// ============================================================================ +// Section 6: Tier Classification Verification +// ============================================================================ + +#[test] +fn tier1_patterns_all_accepted_by_default() { + let expressions = [ + "has(field)", + "!has(field)", + r#"field == "value""#, + r#"field != "value""#, + r#"field.startsWith("prefix")"#, + r#"field.endsWith("suffix")"#, + r#"field.contains("sub")"#, + r#"nested.path == "value""#, + r#"a.b.c.d == "deep""#, + ]; + + for expr in &expressions { + let result = TransportFilterEngine::new( + &[FilterRule { + expression: (*expr).into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ); + assert!( + result.is_ok(), + "Tier 1 expression should be accepted by default: {expr}" + ); + } +} + +#[test] +fn tier2_patterns_rejected_by_default() { + let expressions = [ + r#"severity > 3 && source != "internal""#, + r#"count >= 100"#, + r#"a == b"#, // field-to-field comparison + ]; + + for expr in &expressions { + let result = TransportFilterEngine::new( + &[FilterRule { + expression: (*expr).into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ); + assert!( + result.is_err(), + "Tier 2 expression should be rejected by default: {expr}" + ); + } +} + +// ============================================================================ +// Section 7: Tier 2/3 CEL Evaluation Tests (requires expression feature) +// ============================================================================ + +#[cfg(feature = "expression")] +#[test] +fn tier2_compound_expression_evaluates_correctly() { + let tier_config = TransportFilterTierConfig { + allow_cel_filters_in: true, + ..Default::default() + }; + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"severity > 3 && source != "internal""#.into(), + action: FilterAction::Dlq, + }], + &[], + &tier_config, + ) + .unwrap(); + + // Matches: severity > 3 AND source != "internal" + let match_payload = br#"{"severity":5,"source":"external"}"#; + assert_eq!(engine.apply_inbound(match_payload), FilterDisposition::Dlq); + + // No match: severity not > 3 + let no_match = br#"{"severity":1,"source":"external"}"#; + assert_eq!(engine.apply_inbound(no_match), FilterDisposition::Pass); + + // No match: source IS "internal" + let internal = br#"{"severity":10,"source":"internal"}"#; + assert_eq!(engine.apply_inbound(internal), FilterDisposition::Pass); +} + +#[cfg(feature = "expression")] +#[test] +fn tier2_size_function() { + let tier_config = TransportFilterTierConfig { + allow_cel_filters_in: true, + ..Default::default() + }; + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "size(items) > 0".into(), + action: FilterAction::Drop, + }], + &[], + &tier_config, + ) + .unwrap(); + + let with_items = br#"{"items":["a","b","c"]}"#; + assert_eq!(engine.apply_inbound(with_items), FilterDisposition::Drop); + + let empty_items = br#"{"items":[]}"#; + assert_eq!(engine.apply_inbound(empty_items), FilterDisposition::Pass); +} + +#[cfg(feature = "expression")] +#[test] +fn tier2_field_to_field_comparison() { + let tier_config = TransportFilterTierConfig { + allow_cel_filters_in: true, + ..Default::default() + }; + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "expected == actual".into(), + action: FilterAction::Drop, + }], + &[], + &tier_config, + ) + .unwrap(); + + let matching = br#"{"expected":"x","actual":"x"}"#; + assert_eq!(engine.apply_inbound(matching), FilterDisposition::Drop); + + let mismatched = br#"{"expected":"x","actual":"y"}"#; + assert_eq!(engine.apply_inbound(mismatched), FilterDisposition::Pass); +} + +#[cfg(feature = "expression")] +#[test] +fn tier3_regex_evaluates_correctly() { + let tier_config = TransportFilterTierConfig { + allow_complex_filters_in: true, + ..Default::default() + }; + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"host.matches("^prod-.*$")"#.into(), + action: FilterAction::Drop, + }], + &[], + &tier_config, + ) + .unwrap(); + + let prod = br#"{"host":"prod-web01"}"#; + assert_eq!(engine.apply_inbound(prod), FilterDisposition::Drop); + + let dev = br#"{"host":"dev-web01"}"#; + assert_eq!(engine.apply_inbound(dev), FilterDisposition::Pass); +} + +#[cfg(feature = "expression")] +#[test] +fn tier2_missing_field_safe() { + let tier_config = TransportFilterTierConfig { + allow_cel_filters_in: true, + ..Default::default() + }; + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: r#"severity > 3 && source != "internal""#.into(), + action: FilterAction::Drop, + }], + &[], + &tier_config, + ) + .unwrap(); + + // Missing severity field — should NOT match (evaluate_condition returns false on missing) + let no_severity = br#"{"source":"external"}"#; + assert_eq!(engine.apply_inbound(no_severity), FilterDisposition::Pass); +} + +// ============================================================================ +// Section 8: take_filtered_dlq_entries() — DLQ buffering integration +// ============================================================================ + +#[tokio::test] +async fn dlq_filter_entries_exposed_via_take() { + let transport = transport_with_inbound_filters(vec![FilterRule { + expression: r#"status == "poison""#.into(), + action: FilterAction::Dlq, + }]); + + transport + .inject(None, br#"{"status":"ok","id":1}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"status":"poison","id":2}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"status":"poison","id":3}"#.to_vec()) + .await + .unwrap(); + + let messages = transport.recv(10).await.unwrap(); + assert_eq!( + messages.len(), + 1, + "Only the non-poison message should be in result" + ); + + // The DLQ entries should be exposed via take_filtered_dlq_entries + let dlq_entries = transport.take_filtered_dlq_entries(); + assert_eq!(dlq_entries.len(), 2, "Two DLQ entries should be staged"); + assert!(dlq_entries[0].payload.windows(6).any(|w| w == b"poison")); + assert!(dlq_entries[1].payload.windows(6).any(|w| w == b"poison")); +} + +#[tokio::test] +async fn take_filtered_dlq_entries_drains_buffer() { + let transport = transport_with_inbound_filters(vec![FilterRule { + expression: "has(_internal)".into(), + action: FilterAction::Dlq, + }]); + + transport + .inject(None, br#"{"_internal":true}"#.to_vec()) + .await + .unwrap(); + let _ = transport.recv(10).await.unwrap(); + + // First take returns the entry + let first = transport.take_filtered_dlq_entries(); + assert_eq!(first.len(), 1); + + // Second take returns empty (buffer drained) + let second = transport.take_filtered_dlq_entries(); + assert!(second.is_empty(), "Buffer should be drained after take"); +} + +#[tokio::test] +async fn drop_filter_does_not_populate_dlq_buffer() { + let transport = transport_with_inbound_filters(vec![FilterRule { + expression: r#"status == "drop_me""#.into(), + action: FilterAction::Drop, + }]); + + transport + .inject(None, br#"{"status":"drop_me"}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"status":"ok"}"#.to_vec()) + .await + .unwrap(); + + let messages = transport.recv(10).await.unwrap(); + assert_eq!(messages.len(), 1); + + // Drop action should NOT populate the DLQ buffer + let dlq_entries = transport.take_filtered_dlq_entries(); + assert!( + dlq_entries.is_empty(), + "Drop action should not populate DLQ buffer" + ); +} + +#[tokio::test] +async fn no_filters_no_dlq_buffer_overhead() { + let transport = transport_no_filters(); + transport + .inject(None, br#"{"any":"thing"}"#.to_vec()) + .await + .unwrap(); + let _ = transport.recv(10).await.unwrap(); + let dlq_entries = transport.take_filtered_dlq_entries(); + assert!(dlq_entries.is_empty()); +} + +// ============================================================================ +// Section 9: Memmem false positive (documented limitation) +// ============================================================================ + +#[test] +fn memmem_false_positive_nested_field_matches_top_level_filter() { + // KNOWN LIMITATION: the memmem fast-path for `has()` searches + // for the literal `"":` byte pattern anywhere in the payload. It does + // NOT verify that the field appears at the TOP LEVEL of the JSON object. + // + // If the same field name occurs at a nested level, the fast-path will match + // even though a strict CEL `has()` would not. + // + // Example: filter is `has(_table)` (top-level), payload is + // {"data":{"_table":"events"}} + // The bytes contain `"_table":`, so memmem matches and the filter triggers. + // + // This is a deliberate tradeoff for the ~50% performance gain on the most + // common transport filter (existence checks on top-level routing fields). + // Workaround for users who need strict top-level matching: use a nested + // path like `has(some.nested._table)` which forces the slower sonic-rs path. + + let engine = TransportFilterEngine::new( + &[FilterRule { + expression: "has(_table)".into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(); + + // Real top-level field — correct match + let real_match = br#"{"_table":"events"}"#; + assert_eq!(engine.apply_inbound(real_match), FilterDisposition::Drop); + + // Nested field at non-top-level — documented false positive + let nested_payload = br#"{"data":{"_table":"events"}}"#; + assert_eq!( + engine.apply_inbound(nested_payload), + FilterDisposition::Drop, + "Documented false positive: memmem fast-path matches nested occurrences" + ); + + // Sound case: well-formed JSON with field name only inside an escaped + // string value never triggers a false positive — JSON encoding requires + // a `\` before any embedded `\"`, so the literal byte pattern `"_table":` + // never appears inside a string value. + let escaped_in_value = br#"{"description":"event with \"_table\": substring"}"#; + assert_eq!( + engine.apply_inbound(escaped_in_value), + FilterDisposition::Pass, + "Escaped quotes prevent the literal `\"_table\":` pattern from appearing in a string value" + ); +} + +// ============================================================================ +// Section 10: Concurrency (Send + Sync via tokio::spawn) +// ============================================================================ + +#[tokio::test] +async fn engine_send_sync_concurrent_evaluation() { + use std::sync::Arc; + + let engine = Arc::new( + TransportFilterEngine::new( + &[FilterRule { + expression: r#"status == "poison""#.into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ) + .unwrap(), + ); + + let mut handles = Vec::new(); + for i in 0..32 { + let engine = Arc::clone(&engine); + handles.push(tokio::spawn(async move { + let mut drops = 0u32; + let mut passes = 0u32; + for j in 0..1000 { + let payload = if j % 3 == 0 { + br#"{"status":"poison","id":1}"#.to_vec() + } else { + format!(r#"{{"id":{j},"thread":{i}}}"#).into_bytes() + }; + match engine.apply_inbound(&payload) { + FilterDisposition::Drop => drops += 1, + FilterDisposition::Pass => passes += 1, + FilterDisposition::Dlq => {} + } + } + (drops, passes) + })); + } + + let mut total_drops = 0u32; + let mut total_passes = 0u32; + for h in handles { + let (d, p) = h.await.unwrap(); + total_drops += d; + total_passes += p; + } + + // 32 threads × 1000 messages = 32000 total + assert_eq!(total_drops + total_passes, 32_000); + // ~33% are poison + assert!(total_drops > 10_000 && total_drops < 12_000); +} + +#[test] +fn filter_action_is_send_sync() { + fn assert_send_sync() {} + assert_send_sync::(); + assert_send_sync::(); + assert_send_sync::(); + assert_send_sync::(); +} + +// ============================================================================ +// Section 11: Per-transport smoke (verify field_engine field exists in all) +// ============================================================================ + +#[tokio::test] +async fn smoke_memory_transport_filters_field_present() { + // Construct MemoryTransport with filter config — verifies field exists + let transport = MemoryTransport::new(&MemoryConfig { + buffer_size: 100, + recv_timeout_ms: 50, + filters_in: vec![FilterRule { + expression: "has(_drop_me)".into(), + action: FilterAction::Drop, + }], + filters_out: vec![FilterRule { + expression: "has(_drop_me)".into(), + action: FilterAction::Drop, + }], + }); + + // Filter actually works + transport + .inject(None, br#"{"_drop_me":true}"#.to_vec()) + .await + .unwrap(); + transport + .inject(None, br#"{"keep":true}"#.to_vec()) + .await + .unwrap(); + + let messages = transport.recv(10).await.unwrap(); + assert_eq!(messages.len(), 1, "Filter must be wired in MemoryTransport"); +} + +#[test] +fn tier3_patterns_rejected_by_default() { + let expressions = [ + r#"field.matches("^prod-.*")"#, + r#"tags.exists(t, t == "pii")"#, + ]; + + for expr in &expressions { + let result = TransportFilterEngine::new( + &[FilterRule { + expression: (*expr).into(), + action: FilterAction::Drop, + }], + &[], + &TransportFilterTierConfig::default(), + ); + assert!( + result.is_err(), + "Tier 3 expression should be rejected by default: {expr}" + ); + } +} + +// ============================================================================ +// Section 12: Python <-> Rust classifier parity (shared fixture) +// ============================================================================ +// +// Loads tests/fixtures/cel_classifier_parity.json and verifies the Rust +// classifier produces the same tier, op, and field results as the fixture +// expects. The dfe-engine Python test in +// `/projects/dfe-engine/tests/unit/test_cel/test_parity.py` runs the SAME +// fixture through the Python classifier in `dfe_engine.cel.classify`. +// +// If both tests pass on their respective sides, the UI validator and the +// runtime engine agree on classification — no drift. +// +// To add a new test case, edit the fixture in BOTH: +// * /projects/hyperi-rustlib/tests/fixtures/cel_classifier_parity.json +// * /projects/dfe-engine/tests/fixtures/cel_classifier_parity.json +// They must remain byte-identical. + +#[test] +fn classifier_matches_python_fixture() { + use hyperi_rustlib::transport::filter::classify::{ClassifyResult, Tier1Op, classify}; + + #[derive(serde::Deserialize)] + struct Fixture { + cases: Vec, + } + + #[derive(serde::Deserialize)] + struct Case { + expression: String, + tier: u8, + #[serde(default)] + op_kind: Option, + #[serde(default)] + op_field: Option, + #[serde(default)] + op_value: Option, + #[serde(default)] + fields: Option>, + } + + let raw = + std::fs::read_to_string("tests/fixtures/cel_classifier_parity.json").expect("read fixture"); + let fixture: Fixture = serde_json::from_str(&raw).expect("parse fixture"); + + for case in &fixture.cases { + let result = classify(&case.expression) + .unwrap_or_else(|e| panic!("classify failed for {:?}: {}", case.expression, e)); + + let actual_tier_num: u8 = match result.tier() { + hyperi_rustlib::transport::filter::FilterTier::Tier1 => 1, + hyperi_rustlib::transport::filter::FilterTier::Tier2 => 2, + hyperi_rustlib::transport::filter::FilterTier::Tier3 => 3, + }; + assert_eq!( + actual_tier_num, case.tier, + "tier mismatch for {:?}: expected={} actual={}", + case.expression, case.tier, actual_tier_num + ); + + if case.tier == 1 { + let ClassifyResult::Tier1(op) = &result else { + panic!( + "Tier 1 expected for {:?}, got {:?}", + case.expression, result + ); + }; + let (kind, field, value) = match op { + Tier1Op::FieldExists { field } => ("field_exists", field.as_str(), None), + Tier1Op::FieldNotExists { field } => ("field_not_exists", field.as_str(), None), + Tier1Op::FieldEquals { field, value } => { + ("field_equals", field.as_str(), Some(value.as_str())) + } + Tier1Op::FieldNotEquals { field, value } => { + ("field_not_equals", field.as_str(), Some(value.as_str())) + } + Tier1Op::FieldStartsWith { field, prefix } => { + ("field_starts_with", field.as_str(), Some(prefix.as_str())) + } + Tier1Op::FieldEndsWith { field, suffix } => { + ("field_ends_with", field.as_str(), Some(suffix.as_str())) + } + Tier1Op::FieldContains { field, substring } => { + ("field_contains", field.as_str(), Some(substring.as_str())) + } + }; + let expected_kind = case.op_kind.as_deref().unwrap_or_else(|| { + panic!( + "fixture missing op_kind for Tier 1 case {:?}", + case.expression + ) + }); + assert_eq!( + kind, expected_kind, + "op_kind mismatch for {:?}", + case.expression + ); + let expected_field = case.op_field.as_deref().unwrap_or_else(|| { + panic!( + "fixture missing op_field for Tier 1 case {:?}", + case.expression + ) + }); + assert_eq!( + field, expected_field, + "op_field mismatch for {:?}", + case.expression + ); + if let Some(expected_value) = case.op_value.as_deref() { + assert_eq!( + value, + Some(expected_value), + "op_value mismatch for {:?}", + case.expression + ); + } + } else { + let actual_fields: Vec = match &result { + ClassifyResult::Tier2 { fields } | ClassifyResult::Tier3 { fields } => { + fields.clone() + } + ClassifyResult::Tier1(_) => unreachable!(), + }; + let mut actual_sorted = actual_fields.clone(); + actual_sorted.sort(); + let mut expected_sorted = case.fields.clone().unwrap_or_default(); + expected_sorted.sort(); + assert_eq!( + actual_sorted, expected_sorted, + "fields mismatch for {:?}", + case.expression + ); + } + } +}