diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml new file mode 100644 index 000000000..8a29befa9 --- /dev/null +++ b/.github/workflows/unit-tests.yaml @@ -0,0 +1,21 @@ +name: Unit Tests + +on: + pull_request: + branches: + - add-alert-management-api-base + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Run tests + run: go test -count=1 $(go list ./... | grep -v /test/e2e) diff --git a/Makefile b/Makefile index ce54b2060..20a641653 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ lint-frontend: lint-backend: go mod tidy go fmt ./cmd/ - go fmt ./pkg/ + go fmt ./pkg/... ./internal/... .PHONY: install-backend install-backend: @@ -57,7 +57,11 @@ start-backend: .PHONY: test-backend test-backend: - go test ./pkg/... -v + go test ./pkg/... ./internal/... -v + +.PHONY: test-e2e +test-e2e: + PLUGIN_URL=http://localhost:9001 go test -v -timeout=150m -count=1 ./test/e2e .PHONY: build-image build-image: diff --git a/cmd/plugin-backend.go b/cmd/plugin-backend.go index 82e76f4b6..c7b79d6da 100644 --- a/cmd/plugin-backend.go +++ b/cmd/plugin-backend.go @@ -8,15 +8,16 @@ import ( "strconv" "strings" - server "github.com/openshift/monitoring-plugin/pkg" "github.com/sirupsen/logrus" + + server "github.com/openshift/monitoring-plugin/pkg" ) var ( portArg = flag.Int("port", 0, "server port to listen on (default: 9443)\nports 9444 and 9445 reserved for other use") certArg = flag.String("cert", "", "cert file path to enable TLS (disabled by default)") keyArg = flag.String("key", "", "private key file path to enable TLS (disabled by default)") - featuresArg = flag.String("features", "", "enabled features, comma separated.\noptions: ['acm-alerting', 'incidents', 'dev-config', 'perses-dashboards']") + featuresArg = flag.String("features", "", "enabled features, comma separated.\noptions: ['acm-alerting', 
'incidents', 'dev-config', 'perses-dashboards', 'alert-management-api']") staticPathArg = flag.String("static-path", "", "static files path to serve frontend (default: './web/dist')") configPathArg = flag.String("config-path", "", "config files path (default: './config')") pluginConfigArg = flag.String("plugin-config-path", "", "plugin yaml configuration") diff --git a/docs/alert-management.md b/docs/alert-management.md new file mode 100644 index 000000000..1ca39abf9 --- /dev/null +++ b/docs/alert-management.md @@ -0,0 +1,41 @@ +## Alert Management Notes + +This document covers alert management behavior and prerequisites for the monitoring plugin. + +### User workload monitoring prerequisites + +To include **user workload** alerts and rules in `/api/v1/alerting/alerts` and `/api/v1/alerting/rules`, the user workload monitoring stack must be enabled. Follow the OpenShift documentation for enabling and configuring UWM: + +https://docs.redhat.com/en/documentation/monitoring_stack_for_red_hat_openshift/4.20/html/configuring_user_workload_monitoring/configuring-alerts-and-notifications-uwm + +#### How the plugin reads user workload alerts/rules + +The plugin prefers **Thanos tenancy** for user workload alerts/rules (RBAC-scoped, requires a namespace parameter). When the client does not provide a `namespace` filter, the plugin discovers candidate namespaces and queries Thanos tenancy per-namespace, using the end-user bearer token. + +Routes in `openshift-user-workload-monitoring` are treated as **fallbacks** (and are also used for some health checks and pending state retrieval). + +If you want to create the user workload Prometheus route (optional), you can expose the service: + +```shell +oc -n openshift-user-workload-monitoring expose svc/prometheus-user-workload-web --name=prometheus-user-workload-web --port=web +``` + +If the route is missing/unreachable but tenancy is healthy, the plugin should still return user workload data and suppress route warnings. 
+ +#### Alert states + +- `/api/v1/alerting/alerts?state=pending`: pending alerts come from Prometheus. +- `/api/v1/alerting/alerts?state=firing`: firing alerts come from Alertmanager when available. +- `/api/v1/alerting/alerts?state=silenced`: silenced alerts come from Alertmanager (requires an Alertmanager endpoint). + +### Alertmanager routing choices + +OpenShift supports routing user workload alerts to: + +- The **platform Alertmanager** (default instance) +- A **separate Alertmanager** for user workloads +- **External Alertmanager** instances + +This is a cluster configuration choice and does not change the plugin API shape. The plugin reads alerts from Alertmanager (for firing/silenced) and Prometheus (for pending), then merges platform and user workload results when available. + +The plugin intentionally reads from only the in-cluster Alertmanager endpoints. Supporting multiple external Alertmanagers would introduce ambiguous alert state and silencing outcomes because each instance can apply different routing, inhibition, and silence configurations. diff --git a/docs/alert-rule-classification.md b/docs/alert-rule-classification.md new file mode 100644 index 000000000..8682f47ea --- /dev/null +++ b/docs/alert-rule-classification.md @@ -0,0 +1,213 @@ +# Alert Rule Classification - Design and Usage + +## Overview +The backend classifies Prometheus alerting rules into a “component” and an “impact layer”. It: +- Computes an `openshift_io_alert_rule_id` per alerting rule. +- Determines component/layer based on matcher logic and rule labels. +- Allows users to override classification via a single, fixed-name ConfigMap per namespace. +- Enriches the Alerts API response with `openshift_io_alert_rule_id`, `openshift_io_alert_component`, and `openshift_io_alert_layer`. + +This document explains how it works, how to override, and how to test it. + + +## Terminology +- openshift_io_alert_rule_id: Identifier for an alerting rule. 
Computed from a canonicalized view of the rule definition and encoded as `rid_` + base64url(nopad(sha256(payload))). Independent of `PrometheusRule` name. +- component: Logical owner of the alert (e.g., `kube-apiserver`, `etcd`, a namespace, etc.). +- layer: Impact scope. Allowed values: + - `cluster` + - `namespace` + +Notes: +- **Stability**: + - The id is **always derived from the rule spec**. If the rule definition changes (expr/for/business labels/name), the id may change. + - For **platform rules**, this API currently only supports label updates via `AlertRelabelConfig` (not editing expr/for), so the id is effectively stable unless the upstream operator changes the rule definition. + - For **user-defined rules**, the API stamps the computed id into the `PrometheusRule` rule labels. If you update the rule definition, the API returns the **new** id and migrates any existing classification override to the new id. +- Layer values are validated as `cluster|namespace` when set. To remove an override, clear the field (via API `null` or by removing the ConfigMap entry); empty/invalid values are ignored at read time. + +## Rule ID computation (openshift_io_alert_rule_id) +Location: `pkg/alert_rule/alert_rule.go` + +The backend computes a specHash-like value from: +- `kind`/`name`: `alert` + `alert:` name or `record` + `record:` name +- `expr`: trimmed with consecutive whitespace collapsed +- `for`: trimmed (duration string as written in the rule) +- `labels`: only non-system labels + - excludes labels with `openshift_io_` prefix and the `alertname` label + - drops empty values + - keeps only valid Prometheus label names (`[a-zA-Z_][a-zA-Z0-9_]*`) + - sorted by key and joined as `key=value` lines + +Annotations are intentionally ignored to reduce id churn on documentation-only changes. 
+ +## Classification Logic (How component/layer are determined) +Location: `pkg/alertcomponent/matcher.go` + +1) The code adapts `cluster-health-analyzer` matchers: + - CVO-related alerts (update/upgrade) → component/layer based on known patterns + - Compute / node-related alerts + - Core control plane components (renamed to layer `cluster`) + - Workload/namespace-level alerts (renamed to layer `namespace`) + +2) Fallback: + - If the computed component is empty or “Others”, we set: + - `component = other` + - `layer` derived from source: + - `openshift_io_alert_source=platform` → `cluster` + - `openshift_io_prometheus_rule_namespace=openshift-monitoring` → `cluster` + - `prometheus` label starting with `openshift-monitoring/` → `cluster` + - otherwise → `namespace` + +3) Result: + - Each alerting rule is assigned a `(component, layer)` tuple following the above logic. + +## Developer Overrides via Rule Labels (Recommended) +If you want explicit component/layer values and do not want to rely on the matcher, set +these labels on each rule in your `PrometheusRule`: +- `openshift_io_alert_rule_component` +- `openshift_io_alert_rule_layer` + +Both are validated the same way as API overrides: +- `component`: 1-253 chars, alphanumeric + `._-`, must start/end alphanumeric +- `layer`: `cluster` or `namespace` + +When these labels are present and valid, they override matcher-derived values. 
+
+## User Overrides (ConfigMap)
+Location: `pkg/management/update_classification.go`, `pkg/management/get_alerts.go`
+
+- The backend stores overrides in the plugin namespace, sharded by target rule namespace:
+  - Name: `alert-classification-overrides-<targetNamespace>`
+  - Namespace: the monitoring plugin's namespace
+  - Required label:
+    - `monitoring.openshift.io/type=alert-classification-overrides`
+  - Recommended label:
+    - `app.kubernetes.io/managed-by=openshift-console`
+
+- Data layout:
+  - Key: base64url(nopad(UTF-8 bytes of `<ruleId>`))
+    - This keeps ConfigMap keys opaque and avoids relying on any particular id character set.
+  - Value: JSON object with a `classification` field that holds component/layer.
+    - Optional metadata fields such as `alertName`, `prometheusRuleName`, and
+      `prometheusRuleNamespace` may be included for readability; they are ignored by
+      the backend.
+  - Dynamic overrides:
+    - `openshift_io_alert_rule_component_from`: derive component from an alert label key.
+    - `openshift_io_alert_rule_layer_from`: derive layer from an alert label key.
+
+Example:
+```json
+{
+  "alertName": "ClusterOperatorDown",
+  "prometheusRuleName": "cluster-version",
+  "prometheusRuleNamespace": "openshift-cluster-version",
+  "classification": {
+    "openshift_io_alert_rule_component_from": "name",
+    "openshift_io_alert_rule_layer": "cluster"
+  }
+}
+```
+
+Notes:
+- Overrides are only read when the required `monitoring.openshift.io/type` label is present.
+- Invalid component/layer values are ignored for that entry.
+- `*_from` values must be valid Prometheus label names (`[a-zA-Z_][a-zA-Z0-9_]*`).
+- If a `*_from` label is present but the alert does not carry that label or the derived
+  value is invalid, the backend falls back to static values (if present) or defaults.
+- If both component and layer are empty, the entry is removed.
+ + +## Alerts API Enrichment +Location: `pkg/management/get_alerts.go`, `pkg/k8s/prometheus_alerts.go` + +- Endpoint: `GET /api/v1/alerting/alerts` (prom-compatible schema) +- The backend fetches active alerts and enriches each alert with: + - `openshift_io_alert_rule_id` + - `openshift_io_alert_component` + - `openshift_io_alert_layer` + - `prometheusRuleName`: name of the PrometheusRule resource the alert originates from + - `prometheusRuleNamespace`: namespace of that PrometheusRule resource + - `alertingRuleName`: name of the AlertingRule CR that generated the PrometheusRule (empty when the PrometheusRule is not owned by an AlertingRule CR) +- Prometheus compatibility: + - Base response matches Prometheus `/api/v1/alerts`. + - Additional fields are additive and safe for clients like Perses. + +## Prometheus/Thanos Sources +Location: `pkg/k8s/prometheus_alerts.go` + +- Order of candidates: + 1) Thanos Route `thanos-querier` at `/api` + `/v1/alerts` (oauth-proxied) + 2) In-cluster Thanos service `https://thanos-querier.openshift-monitoring.svc:9091/api/v1/alerts` + 3) In-cluster Prometheus `https://prometheus-k8s.openshift-monitoring.svc:9091/api/v1/alerts` + 4) In-cluster Prometheus (plain HTTP) `http://prometheus-k8s.openshift-monitoring.svc:9090/api/v1/alerts` (fallback) + 5) Prometheus Route `prometheus-k8s` at `/api/v1/alerts` + +- TLS and Auth: + - Bearer token: service account token from in-cluster config. + - CA trust: system pool + `SSL_CERT_FILE` + `/var/run/configmaps/service-ca/service-ca.crt`. + +RBAC: +- Read routes in `openshift-monitoring`. +- Access `prometheuses/api` as needed for oauth-proxied endpoints. 
+
+## Updating Rules Classification
+APIs:
+- Single update:
+  - Method: `PATCH /api/v1/alerting/rules/{ruleId}`
+  - Request body:
+    ```json
+    {
+      "classification": {
+        "openshift_io_alert_rule_component": "team-x",
+        "openshift_io_alert_rule_layer": "namespace",
+        "openshift_io_alert_rule_component_from": "name",
+        "openshift_io_alert_rule_layer_from": "layer"
+      }
+    }
+    ```
+  - `openshift_io_alert_rule_layer`: `cluster` or `namespace`
+  - To remove a classification override, set the field to `null` (e.g. `"openshift_io_alert_rule_layer": null`).
+  - Response:
+    - 200 OK with a status payload (same format as other rule PATCH responses), where `status_code` is 204 on success.
+    - Standard error body on failure (400 validation, 404 not found, etc.)
+- Bulk update:
+  - Method: `PATCH /api/v1/alerting/rules`
+  - Request body:
+    ```json
+    {
+      "ruleIds": ["<ruleId1>", "<ruleId2>"],
+      "classification": {
+        "openshift_io_alert_rule_component": "etcd",
+        "openshift_io_alert_rule_layer": "cluster"
+      }
+    }
+    ```
+  - Response:
+    - 200 OK with per-rule results (same format as other bulk rule PATCH responses). Clients should handle partial failures.
+
+Direct K8s (supported for power users/GitOps):
+- PATCH/PUT the ConfigMap `alert-classification-overrides-<targetNamespace>` in the monitoring plugin namespace (respect `resourceVersion`).
+- Each entry is keyed by base64url(`<ruleId>`) with a JSON payload that contains a `classification` object (`openshift_io_alert_rule_component`, `openshift_io_alert_rule_layer`).
+- UI should check update permissions with SelfSubjectAccessReview before showing an editor.
+
+Notes:
+- These endpoints are intended for updating **classification only** (component/layer overrides),
+  with permissions enforced based on the rule’s ownership (platform, user workload, operator-managed,
+  GitOps-managed).
+- To update other rule fields (expr/labels/annotations/etc.), use `PATCH /api/v1/alerting/rules/{ruleId}`.
+  Clients that need to update both should issue two requests.
The combined operation is not atomic. +- In the ConfigMap override entries, classification is nested under `classification` + and validated as component/layer to keep it separate from generic label updates. + +## Security Notes +- Persist only minimal classification metadata in the fixed-name ConfigMap. + +## Testing and Ops +Unit tests: +- `pkg/management/get_alerts_test.go` + - Overrides from labeled ConfigMap, fallback behavior, label validation. + +## Future Work +- Optional CRD to formalize the schema (adds overhead; ConfigMap is sufficient today). +- Optional composite update API if we need to update rule fields and classification atomically. +- De-duplication/merge logic when aggregating alerts across sources. + diff --git a/go.mod b/go.mod index c63c87f86..9437a6af0 100644 --- a/go.mod +++ b/go.mod @@ -6,55 +6,87 @@ require ( github.com/evanphx/json-patch v4.12.0+incompatible github.com/gorilla/handlers v1.5.2 github.com/gorilla/mux v1.8.1 + github.com/onsi/ginkgo/v2 v2.22.0 + github.com/onsi/gomega v1.36.1 + github.com/openshift/api v0.0.0-20251122153900-88cca31a44c9 + github.com/openshift/client-go v0.0.0-20251123231646-4685125c2287 github.com/openshift/library-go v0.0.0-20240905123346-5bdbfe35a6f5 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.0 + github.com/prometheus-operator/prometheus-operator/pkg/client v0.87.0 + github.com/prometheus/common v0.67.4 + github.com/prometheus/prometheus v0.308.0 github.com/sirupsen/logrus v1.9.3 - github.com/stretchr/testify v1.9.0 + github.com/stretchr/testify v1.11.1 gopkg.in/yaml.v2 v2.4.0 - k8s.io/api v0.31.1 - k8s.io/apiserver v0.30.3 - k8s.io/client-go v0.31.1 + k8s.io/api v0.34.2 + k8s.io/apimachinery v0.34.2 + k8s.io/apiserver v0.34.2 + k8s.io/client-go v0.34.2 ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emicklei/go-restful/v3 
v3.12.1 // indirect + github.com/dennwc/varint v1.0.0 // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/fsnotify/fsnotify v1.7.0 // indirect - github.com/fxamacker/cbor/v2 v2.7.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect - github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.21.0 // indirect - github.com/go-openapi/swag v0.23.0 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-openapi/jsonpointer v0.22.1 // indirect + github.com/go-openapi/jsonreference v0.21.2 // indirect + github.com/go-openapi/swag v0.25.1 // indirect + github.com/go-openapi/swag/cmdutils v0.25.1 // indirect + github.com/go-openapi/swag/conv v0.25.1 // indirect + github.com/go-openapi/swag/fileutils v0.25.1 // indirect + github.com/go-openapi/swag/jsonname v0.25.1 // indirect + github.com/go-openapi/swag/jsonutils v0.25.1 // indirect + github.com/go-openapi/swag/loading v0.25.1 // indirect + github.com/go-openapi/swag/mangling v0.25.1 // indirect + github.com/go-openapi/swag/netutils v0.25.1 // indirect + github.com/go-openapi/swag/stringutils v0.25.1 // indirect + github.com/go-openapi/swag/typeutils v0.25.1 // indirect + github.com/go-openapi/swag/yamlutils v0.25.1 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect - github.com/golang/protobuf v1.5.4 // indirect - github.com/google/gnostic-models v0.6.8 // indirect - github.com/google/go-cmp v0.6.0 // indirect - github.com/google/gofuzz v1.2.0 // indirect + github.com/google/gnostic-models v0.7.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 // indirect github.com/google/uuid v1.6.0 
// indirect - github.com/josharian/intern v1.0.0 // indirect + github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect - github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/procfs v0.16.1 // indirect + github.com/spf13/pflag v1.0.6 // indirect github.com/x448/float16 v0.8.4 // indirect - golang.org/x/net v0.34.0 // indirect - golang.org/x/oauth2 v0.25.0 // indirect - golang.org/x/sys v0.29.0 // indirect - golang.org/x/term v0.28.0 // indirect - golang.org/x/text v0.21.0 // indirect - golang.org/x/time v0.9.0 // indirect - google.golang.org/protobuf v1.34.2 // indirect + go.uber.org/atomic v1.11.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/net v0.46.0 // indirect + golang.org/x/oauth2 v0.32.0 // indirect + golang.org/x/sys v0.37.0 // indirect + golang.org/x/term v0.36.0 // indirect + golang.org/x/text v0.30.0 // indirect + golang.org/x/time v0.13.0 // indirect + golang.org/x/tools v0.37.0 // indirect + google.golang.org/protobuf v1.36.10 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apimachinery v0.31.1 // indirect + k8s.io/apiextensions-apiserver v0.34.2 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20240808142205-8e686545bdb8 // indirect - k8s.io/utils 
v0.0.0-20240711033017-18e509b52bc8 // indirect - sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect - sigs.k8s.io/yaml v1.4.0 // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect + sigs.k8s.io/controller-runtime v0.22.3 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 4bc90faf2..e70962788 100644 --- a/go.sum +++ b/go.sum @@ -1,170 +1,319 @@ +cloud.google.com/go/auth v0.17.0 h1:74yCm7hCj2rUyyAocqnFzsAYXgJhrG26XCFimrc/Kz4= +cloud.google.com/go/auth v0.17.0/go.mod h1:6wv/t5/6rOPAX4fJiRjKkJCvswLwdet7G8+UGXt7nCQ= +cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= +cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= +cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= +cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.19.1 h1:5YTBM8QDVIBN3sxBil89WfdAAqDZbyJTgh688DSxX5w= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.19.1/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.12.0 h1:wL5IEG5zb7BVv1Kv0Xm92orq+5hB5Nipn3B5tn4Rqfk= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.12.0/go.mod h1:J7MUC/wtRpfGVbQ5sIItY5/FuVWmvzlY21WAOfQnq/I= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= +github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0 
h1:XkkQbfMyuH2jTSjQjSoihryI8GINRcs4xp8lNawg0FI= +github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= +github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b h1:mimo19zliBX/vSQ6PWWSL9lK8qwHozUj03+zLoEB8O0= +github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b/go.mod h1:fvzegU4vN3H1qMT+8wDmzjAcDONcgo2/SZ/TyfdUOFs= +github.com/aws/aws-sdk-go-v2 v1.39.6 h1:2JrPCVgWJm7bm83BDwY5z8ietmeJUbh3O2ACnn+Xsqk= +github.com/aws/aws-sdk-go-v2 v1.39.6/go.mod h1:c9pm7VwuW0UPxAEYGyTmyurVcNrbF6Rt/wixFqDhcjE= +github.com/aws/aws-sdk-go-v2/config v1.31.17 h1:QFl8lL6RgakNK86vusim14P2k8BFSxjvUkcWLDjgz9Y= +github.com/aws/aws-sdk-go-v2/config v1.31.17/go.mod h1:V8P7ILjp/Uef/aX8TjGk6OHZN6IKPM5YW6S78QnRD5c= +github.com/aws/aws-sdk-go-v2/credentials v1.18.21 h1:56HGpsgnmD+2/KpG0ikvvR8+3v3COCwaF4r+oWwOeNA= +github.com/aws/aws-sdk-go-v2/credentials v1.18.21/go.mod h1:3YELwedmQbw7cXNaII2Wywd+YY58AmLPwX4LzARgmmA= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 h1:T1brd5dR3/fzNFAQch/iBKeX07/ffu/cLu+q+RuzEWk= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13/go.mod h1:Peg/GBAQ6JDt+RoBf4meB1wylmAipb7Kg2ZFakZTlwk= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 h1:a+8/MLcWlIxo1lF9xaGt3J/u3yOZx+CdSveSNwjhD40= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13/go.mod h1:oGnKwIYZ4XttyU2JWxFrwvhF6YKiK/9/wmE3v3Iu9K8= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 h1:HBSI2kDkMdWz4ZM7FjwE7e/pWDEZ+nR95x8Ztet1ooY= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13/go.mod h1:YE94ZoDArI7awZqJzBAZ3PDD2zSfuP7w6P2knOzIn8M= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 h1:x2Ibm/Af8Fi+BH+Hsn9TXGdT+hKbDd5XOTZxTMxDk7o= 
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3/go.mod h1:IW1jwyrQgMdhisceG8fQLmQIydcT/jWY21rFhzgaKwo= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 h1:kDqdFvMY4AtKoACfzIGD8A0+hbT41KTKF//gq7jITfM= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13/go.mod h1:lmKuogqSU3HzQCwZ9ZtcqOc5XGMqtDK7OIc2+DxiUEg= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.1 h1:0JPwLz1J+5lEOfy/g0SURC9cxhbQ1lIMHMa+AHZSzz0= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.1/go.mod h1:fKvyjJcz63iL/ftA6RaM8sRCtN4r4zl4tjL3qw5ec7k= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.5 h1:OWs0/j2UYR5LOGi88sD5/lhN6TDLG6SfA7CqsQO9zF0= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.5/go.mod h1:klO+ejMvYsB4QATfEOIXk8WAEwN4N0aBfJpvC+5SZBo= +github.com/aws/aws-sdk-go-v2/service/sts v1.39.1 h1:mLlUgHn02ue8whiR4BmxxGJLR2gwU6s6ZzJ5wDamBUs= +github.com/aws/aws-sdk-go-v2/service/sts v1.39.1/go.mod h1:E19xDjpzPZC7LS2knI9E6BaRFDK43Eul7vd6rSq2HWk= +github.com/aws/smithy-go v1.23.2 h1:Crv0eatJUQhaManss33hS5r40CG3ZFH+21XSkqMrIUM= +github.com/aws/smithy-go v1.23.2/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= +github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3 h1:6df1vn4bBlDDo4tARvBm7l6KA9iVMnE3NWizDeWSrps= +github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3/go.mod h1:CIWtjkly68+yqLPbvwwR/fjNJA/idrtULjZWh2v1ys0= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc 
h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= -github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/dennwc/varint v1.0.0 h1:kGNFFSSw8ToIy3obO/kKr8U9GZYUAxQEVuix4zfDWzE= +github.com/dennwc/varint v1.0.0/go.mod h1:hnItb35rvZvJrbTALZtY/iQfDs48JKRG1RPpgziApxA= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84= github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= -github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= -github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= -github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= -github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= 
-github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= -github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-openapi/jsonpointer v0.22.1 h1:sHYI1He3b9NqJ4wXLoJDKmUmHkWy/L7rtEo92JUxBNk= +github.com/go-openapi/jsonpointer v0.22.1/go.mod h1:pQT9OsLkfz1yWoMgYFy4x3U5GY5nUlsOn1qSBH5MkCM= +github.com/go-openapi/jsonreference v0.21.2 h1:Wxjda4M/BBQllegefXrY/9aq1fxBA8sI5M/lFU6tSWU= +github.com/go-openapi/jsonreference v0.21.2/go.mod h1:pp3PEjIsJ9CZDGCNOyXIQxsNuroxm8FAJ/+quA0yKzQ= +github.com/go-openapi/swag v0.25.1 h1:6uwVsx+/OuvFVPqfQmOOPsqTcm5/GkBhNwLqIR916n8= +github.com/go-openapi/swag v0.25.1/go.mod h1:bzONdGlT0fkStgGPd3bhZf1MnuPkf2YAys6h+jZipOo= +github.com/go-openapi/swag/cmdutils v0.25.1 h1:nDke3nAFDArAa631aitksFGj2omusks88GF1VwdYqPY= +github.com/go-openapi/swag/cmdutils v0.25.1/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= +github.com/go-openapi/swag/conv v0.25.1 h1:+9o8YUg6QuqqBM5X6rYL/p1dpWeZRhoIt9x7CCP+he0= +github.com/go-openapi/swag/conv v0.25.1/go.mod h1:Z1mFEGPfyIKPu0806khI3zF+/EUXde+fdeksUl2NiDs= +github.com/go-openapi/swag/fileutils v0.25.1 h1:rSRXapjQequt7kqalKXdcpIegIShhTPXx7yw0kek2uU= +github.com/go-openapi/swag/fileutils v0.25.1/go.mod h1:+NXtt5xNZZqmpIpjqcujqojGFek9/w55b3ecmOdtg8M= 
+github.com/go-openapi/swag/jsonname v0.25.1 h1:Sgx+qbwa4ej6AomWC6pEfXrA6uP2RkaNjA9BR8a1RJU= +github.com/go-openapi/swag/jsonname v0.25.1/go.mod h1:71Tekow6UOLBD3wS7XhdT98g5J5GR13NOTQ9/6Q11Zo= +github.com/go-openapi/swag/jsonutils v0.25.1 h1:AihLHaD0brrkJoMqEZOBNzTLnk81Kg9cWr+SPtxtgl8= +github.com/go-openapi/swag/jsonutils v0.25.1/go.mod h1:JpEkAjxQXpiaHmRO04N1zE4qbUEg3b7Udll7AMGTNOo= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.1 h1:DSQGcdB6G0N9c/KhtpYc71PzzGEIc/fZ1no35x4/XBY= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.1/go.mod h1:kjmweouyPwRUEYMSrbAidoLMGeJ5p6zdHi9BgZiqmsg= +github.com/go-openapi/swag/loading v0.25.1 h1:6OruqzjWoJyanZOim58iG2vj934TysYVptyaoXS24kw= +github.com/go-openapi/swag/loading v0.25.1/go.mod h1:xoIe2EG32NOYYbqxvXgPzne989bWvSNoWoyQVWEZicc= +github.com/go-openapi/swag/mangling v0.25.1 h1:XzILnLzhZPZNtmxKaz/2xIGPQsBsvmCjrJOWGNz/ync= +github.com/go-openapi/swag/mangling v0.25.1/go.mod h1:CdiMQ6pnfAgyQGSOIYnZkXvqhnnwOn997uXZMAd/7mQ= +github.com/go-openapi/swag/netutils v0.25.1 h1:2wFLYahe40tDUHfKT1GRC4rfa5T1B4GWZ+msEFA4Fl4= +github.com/go-openapi/swag/netutils v0.25.1/go.mod h1:CAkkvqnUJX8NV96tNhEQvKz8SQo2KF0f7LleiJwIeRE= +github.com/go-openapi/swag/stringutils v0.25.1 h1:Xasqgjvk30eUe8VKdmyzKtjkVjeiXx1Iz0zDfMNpPbw= +github.com/go-openapi/swag/stringutils v0.25.1/go.mod h1:JLdSAq5169HaiDUbTvArA2yQxmgn4D6h4A+4HqVvAYg= +github.com/go-openapi/swag/typeutils v0.25.1 h1:rD/9HsEQieewNt6/k+JBwkxuAHktFtH3I3ysiFZqukA= +github.com/go-openapi/swag/typeutils v0.25.1/go.mod h1:9McMC/oCdS4BKwk2shEB7x17P6HmMmA6dQRtAkSnNb8= +github.com/go-openapi/swag/yamlutils v0.25.1 h1:mry5ez8joJwzvMbaTGLhw8pXUnhDK91oSJLDPF1bmGk= +github.com/go-openapi/swag/yamlutils v0.25.1/go.mod h1:cm9ywbzncy3y6uPm/97ysW8+wZ09qsks+9RS8fLWKqg= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gogo/protobuf v1.3.2 
h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= -github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= -github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= -github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= +github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= +github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= -github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20240727154555-813a5fbdbec8 
h1:FKHo8hFI3A+7w0aUQuYXQ+6EN5stWmeY/AZqtM8xk9k= -github.com/google/pprof v0.0.0-20240727154555-813a5fbdbec8/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= +github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 h1:ZI8gCoCjGzPsum4L21jHdQs8shFBIQih1TM9Rd/c+EQ= +github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= +github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= +github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU9uHLo7OnF5tL52HFAgMmyrf4= +github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= +github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo= +github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc= github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= github.com/gorilla/handlers v1.5.2/go.mod h1:dX+xVpaxdSw+q0Qek8SSsl3dfMk3jNddUkMzo0GtH0w= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= -github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= -github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 h1:cLN4IBkmkYZNnk7EAJ0BHIethd+J6LqxFNw5mSiI2bM= +github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/jpillora/backoff v1.0.0/go.mod 
h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.18.1 h1:bcSGx7UbpBqMChDtsF28Lw6v/G94LPrrbMbdC3JH2co= +github.com/klauspost/compress v1.18.1/go.mod h1:ZQFFVG+MdnR0P+l6wpXgIL4NTtwiKIdBnrBd8Nrxr+0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= -github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod 
h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.20.0 h1:PE84V2mHqoT1sglvHc8ZdQtPcwmvvt29WLEEO3xmdZw= -github.com/onsi/ginkgo/v2 v2.20.0/go.mod h1:lG9ey2Z29hR41WMVthyJBGUBcBhGOtoPF2VFMvBXFCI= -github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k= -github.com/onsi/gomega v1.34.1/go.mod h1:kU1QgUvBDLXBJq618Xvm2LUX6rSAfRaFRTcdOeDLwwY= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4= +github.com/oklog/ulid/v2 v2.1.1 h1:suPZ4ARWLOJLegGFiZZ1dFAkqzhMjL3J1TzI+5wHz8s= +github.com/oklog/ulid/v2 v2.1.1/go.mod h1:rcEKHmBBKfef9DhnvX7y1HZBYxjXb0cP5ExxNsTT1QQ= +github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= +github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= +github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw= +github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/openshift/api v0.0.0-20251122153900-88cca31a44c9 h1:RKbCmhOI6XOKMjoXLjANJ1ic7wd4dVV7nSfrn3csEuQ= +github.com/openshift/api v0.0.0-20251122153900-88cca31a44c9/go.mod h1:d5uzF0YN2nQQFA0jIEWzzOZ+edmo6wzlGLvx5Fhz4uY= +github.com/openshift/client-go v0.0.0-20251123231646-4685125c2287 h1:Spullg4rMMWUjYiBMvYMhyeZ+j36mYOrkSO7ad43xrA= +github.com/openshift/client-go v0.0.0-20251123231646-4685125c2287/go.mod h1:liCuDDdOsPSZIDP0QuTveFhF7ldXuvnPhBd/OTsJdJc= github.com/openshift/library-go v0.0.0-20240905123346-5bdbfe35a6f5 
h1:CyPTfZvr+HvwXbix9kieI55HeFn4a5DBaxJ3DNFinhg= github.com/openshift/library-go v0.0.0-20240905123346-5bdbfe35a6f5/go.mod h1:/wmao3qtqOQ484HDka9cWP7SIvOQOdzpmhyXkF2YdzE= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= -github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.0 h1:QK37j5ZUtBwbyZkF4BBAs3bQQ1gYKG8e+g1BdNZBr/M= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.0/go.mod h1:WHiLZmOWVop/MoYvRD58LfnPeyE+dcITby/jQjg83Hw= +github.com/prometheus-operator/prometheus-operator/pkg/client v0.87.0 h1:rrZriucuC8ZUOPr8Asvavb9pbzqXSsAeY79aH8xnXlc= +github.com/prometheus-operator/prometheus-operator/pkg/client v0.87.0/go.mod h1:OMvC2XJGxPeEAKf5qB1u7DudV46HA8ePxYslRjxQcbk= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_golang/exp v0.0.0-20250914183048-a974e0d45e0a h1:RF1vfKM34/3DbGNis22BGd6sDDY3XBi0eM7pYqmOEO0= +github.com/prometheus/client_golang/exp v0.0.0-20250914183048-a974e0d45e0a/go.mod 
h1:FGJuwvfcPY0V5enm+w8zF1RNS062yugQtPPQp1c4Io4= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc= +github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI= +github.com/prometheus/otlptranslator v1.0.0 h1:s0LJW/iN9dkIH+EnhiD3BlkkP5QVIUVEoIwkU+A6qos= +github.com/prometheus/otlptranslator v1.0.0/go.mod h1:vRYWnXvI6aWGpsdY/mOT/cbeVRBlPWtBNDb7kGR3uKM= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/prometheus/prometheus v0.308.0 h1:kVh/5m1n6m4cSK9HYTDEbMxzuzCWyEdPdKSxFRxXj04= +github.com/prometheus/prometheus v0.308.0/go.mod h1:xXYKzScyqyFHihpS0UsXpC2F3RA/CygOs7wb4mpdusE= +github.com/prometheus/sigv4 v0.3.0 h1:QIG7nTbu0JTnNidGI1Uwl5AGVIChWUACxn2B/BQ1kms= +github.com/prometheus/sigv4 v0.3.0/go.mod h1:fKtFYDus2M43CWKMNtGvFNHGXnAJJEGZbiYCmVp/F8I= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 
h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.uber.org/atomic v1.11.0 
h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= +go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= -golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= +golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= +golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= +golang.org/x/exp v0.0.0-20250808145144-a408d31f581a h1:Y+7uR/b1Mw2iSXZ3G//1haIiSElDQZ8KWh0h+sZPG90= +golang.org/x/exp v0.0.0-20250808145144-a408d31f581a/go.mod h1:rT6SFzZ7oxADUDx58pcaKFTcZ+inxAa9fTrYx/uVYwg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= -golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= -golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70= -golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= +golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY= +golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= +golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= -golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.28.0 h1:/Ts8HFuMR2E6IP/jlo7QVLZHggjKQbhu/7H0LJFr3Gg= -golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= +golang.org/x/sys v0.37.0 
h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q= +golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= -golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= +golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI= +golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= -golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= +golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= +golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod 
h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= -google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +google.golang.org/api v0.252.0 h1:xfKJeAJaMwb8OC9fesr369rjciQ704AjU/psjkKURSI= +google.golang.org/api v0.252.0/go.mod h1:dnHOv81x5RAmumZ7BWLShB/u7JZNeyalImxHmtTHxqw= +google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 h1:L6iMMGrtzgHsWofoFcihmDEMYeDR9KN/ThbPWGrh++g= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251002232023-7c0ddcbb5797 h1:CirRxTOwnRWVLKzDNrs0CXAaVozJoR4G9xvdRecrdpk= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251002232023-7c0ddcbb5797/go.mod h1:HSkG/KdJWusxU1F6CNrwNDjBMgisKxGnc5dAZfT0mjQ= +google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= +google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod 
h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.31.1 h1:Xe1hX/fPW3PXYYv8BlozYqw63ytA92snr96zMW9gWTU= -k8s.io/api v0.31.1/go.mod h1:sbN1g6eY6XVLeqNsZGLnI5FwVseTrZX7Fv3O26rhAaI= -k8s.io/apimachinery v0.31.1 h1:mhcUBbj7KUjaVhyXILglcVjuS4nYXiwC+KKFBgIVy7U= -k8s.io/apimachinery v0.31.1/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= -k8s.io/apiserver v0.30.3 h1:QZJndA9k2MjFqpnyYv/PH+9PE0SHhx3hBho4X0vE65g= -k8s.io/apiserver v0.30.3/go.mod h1:6Oa88y1CZqnzetd2JdepO0UXzQX4ZnOekx2/PtEjrOg= -k8s.io/client-go v0.31.1 h1:f0ugtWSbWpxHR7sjVpQwuvw9a3ZKLXX0u0itkFXufb0= -k8s.io/client-go v0.31.1/go.mod h1:sKI8871MJN2OyeqRlmA4W4KM9KBdBUpDLu/43eGemCg= +k8s.io/api v0.34.2 h1:fsSUNZhV+bnL6Aqrp6O7lMTy6o5x2C4XLjnh//8SLYY= +k8s.io/api v0.34.2/go.mod h1:MMBPaWlED2a8w4RSeanD76f7opUoypY8TFYkSM+3XHw= +k8s.io/apiextensions-apiserver v0.34.2 h1:WStKftnGeoKP4AZRz/BaAAEJvYp4mlZGN0UCv+uvsqo= +k8s.io/apiextensions-apiserver v0.34.2/go.mod h1:398CJrsgXF1wytdaanynDpJ67zG4Xq7yj91GrmYN2SE= +k8s.io/apimachinery v0.34.2 h1:zQ12Uk3eMHPxrsbUJgNF8bTauTVR2WgqJsTmwTE/NW4= +k8s.io/apimachinery v0.34.2/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= +k8s.io/apiserver v0.34.2 h1:2/yu8suwkmES7IzwlehAovo8dDE07cFRC7KMDb1+MAE= +k8s.io/apiserver v0.34.2/go.mod h1:gqJQy2yDOB50R3JUReHSFr+cwJnL8G1dzTA0YLEqAPI= +k8s.io/client-go v0.34.2 
h1:Co6XiknN+uUZqiddlfAjT68184/37PS4QAzYvQvDR8M= +k8s.io/client-go v0.34.2/go.mod h1:2VYDl1XXJsdcAxw7BenFslRQX28Dxz91U9MWKjX97fE= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20240808142205-8e686545bdb8 h1:1Wof1cGQgA5pqgo8MxKPtf+qN6Sh/0JzznmeGPm1HnE= -k8s.io/kube-openapi v0.0.0-20240808142205-8e686545bdb8/go.mod h1:Os6V6dZwLNii3vxFpxcNaTmH8LJJBkOTg1N0tOA0fvA= -k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= -k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= -sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= -sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.22.3 h1:I7mfqz/a/WdmDCEnXmSPm8/b/yRTy6JsKKENTijTq8Y= +sigs.k8s.io/controller-runtime v0.22.3/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json 
v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= +sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/internal/managementrouter/alert_rule_bulk_update.go b/internal/managementrouter/alert_rule_bulk_update.go new file mode 100644 index 000000000..845459fd0 --- /dev/null +++ b/internal/managementrouter/alert_rule_bulk_update.go @@ -0,0 +1,221 @@ +package managementrouter + +import ( + "encoding/json" + "errors" + "net/http" + "strings" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + "github.com/openshift/monitoring-plugin/pkg/management" +) + +// Note: router no longer filters provenance/identity labels here. +// Backend enforces ARC scoping and ignores/guards protected labels as needed. 
+ +type BulkUpdateAlertRulesRequest struct { + RuleIds []string `json:"ruleIds"` + // Use pointer values so we can distinguish null (delete) vs string value (set) + Labels map[string]*string `json:"labels,omitempty"` + AlertingRuleEnabled *bool `json:"AlertingRuleEnabled,omitempty"` + Classification *AlertRuleClassificationPatch `json:"classification,omitempty"` +} + +type BulkUpdateAlertRulesResponse struct { + Rules []UpdateAlertRuleResponse `json:"rules"` +} + +func (hr *httpRouter) BulkUpdateAlertRules(w http.ResponseWriter, req *http.Request) { + var payload BulkUpdateAlertRulesRequest + if err := json.NewDecoder(req.Body).Decode(&payload); err != nil { + writeError(w, http.StatusBadRequest, "invalid request body") + return + } + + if len(payload.RuleIds) == 0 { + writeError(w, http.StatusBadRequest, "ruleIds is required") + return + } + + if payload.AlertingRuleEnabled == nil && payload.Labels == nil && payload.Classification == nil { + writeError(w, http.StatusBadRequest, "AlertingRuleEnabled (toggle drop/restore) or labels (set/unset) or classification is required") + return + } + var haveToggle bool + var enabled bool + if payload.AlertingRuleEnabled != nil { + enabled = *payload.AlertingRuleEnabled + haveToggle = true + } + + results := make([]UpdateAlertRuleResponse, 0, len(payload.RuleIds)) + + for _, rawId := range payload.RuleIds { + id, err := parseParam(rawId, "ruleId") + if err != nil { + results = append(results, UpdateAlertRuleResponse{ + Id: rawId, + StatusCode: http.StatusBadRequest, + Message: err.Error(), + }) + continue + } + + // Handle enabled drop/restore first if requested + notAllowedEnabled := false + if haveToggle { + var derr error + if !enabled { + derr = hr.managementClient.DropPlatformAlertRule(req.Context(), id) + } else { + derr = hr.managementClient.RestorePlatformAlertRule(req.Context(), id) + } + if derr != nil { + // If NotAllowed (likely user-defined), we still allow label updates. 
+ var na *management.NotAllowedError + if errors.As(derr, &na) { + notAllowedEnabled = true + } else { + status, message := parseError(derr) + results = append(results, UpdateAlertRuleResponse{ + Id: id, + StatusCode: status, + Message: message, + }) + continue + } + } + } + + if payload.Classification != nil { + update := management.UpdateRuleClassificationRequest{RuleId: id} + if payload.Classification.ComponentSet { + update.Component = payload.Classification.Component + update.ComponentSet = true + } + if payload.Classification.LayerSet { + update.Layer = payload.Classification.Layer + update.LayerSet = true + } + if payload.Classification.ComponentFromSet { + update.ComponentFrom = payload.Classification.ComponentFrom + update.ComponentFromSet = true + } + if payload.Classification.LayerFromSet { + update.LayerFrom = payload.Classification.LayerFrom + update.LayerFromSet = true + } + + if update.ComponentSet || update.LayerSet || update.ComponentFromSet || update.LayerFromSet { + if err := hr.managementClient.UpdateAlertRuleClassification(req.Context(), update); err != nil { + status, message := parseError(err) + results = append(results, UpdateAlertRuleResponse{ + Id: id, + StatusCode: status, + Message: message, + }) + continue + } + } + } + + if payload.Labels != nil { + // For bulk update, merge labels and handle empty strings as drops + currentRule, err := hr.managementClient.GetRuleById(req.Context(), id) + if err != nil { + status, message := parseError(err) + results = append(results, UpdateAlertRuleResponse{ + Id: id, + StatusCode: status, + Message: message, + }) + continue + } + + mergedLabels := make(map[string]string) + intentLabels := make(map[string]string) + for k, v := range currentRule.Labels { + mergedLabels[k] = v + } + for k, pv := range payload.Labels { + if pv == nil || *pv == "" { + intentLabels[k] = "" + delete(mergedLabels, k) + continue + } + mergedLabels[k] = *pv + intentLabels[k] = *pv + } + + updatedPlatformRule := 
monitoringv1.Rule{Labels: intentLabels} + + err = hr.managementClient.UpdatePlatformAlertRule(req.Context(), id, updatedPlatformRule) + if err != nil { + var ve *management.ValidationError + var nf *management.NotFoundError + if errors.As(err, &ve) || errors.As(err, &nf) { + status, message := parseError(err) + results = append(results, UpdateAlertRuleResponse{ + Id: id, + StatusCode: status, + Message: message, + }) + continue + } + + var na *management.NotAllowedError + if errors.As(err, &na) && strings.Contains(na.Error(), "cannot update non-platform alert rule") { + // Not a platform rule, try user-defined + // For user-defined, we apply the merged labels to the PR + updatedUserRule := currentRule + updatedUserRule.Labels = mergedLabels + + newRuleId, err := hr.managementClient.UpdateUserDefinedAlertRule(req.Context(), id, updatedUserRule) + if err != nil { + status, message := parseError(err) + results = append(results, UpdateAlertRuleResponse{ + Id: id, + StatusCode: status, + Message: message, + }) + continue + } + results = append(results, UpdateAlertRuleResponse{ + Id: newRuleId, + StatusCode: http.StatusNoContent, + }) + continue + } + + status, message := parseError(err) + results = append(results, UpdateAlertRuleResponse{ + Id: id, + StatusCode: status, + Message: message, + }) + continue + } + } + + // If only enabled was requested and it was NotAllowed, return 405 for this id. 
+ if notAllowedEnabled && payload.Labels == nil && payload.Classification == nil { + results = append(results, UpdateAlertRuleResponse{ + Id: id, + StatusCode: http.StatusMethodNotAllowed, + }) + continue + } + + results = append(results, UpdateAlertRuleResponse{ + Id: id, + StatusCode: http.StatusNoContent, + }) + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(BulkUpdateAlertRulesResponse{ + Rules: results, + }) +} diff --git a/internal/managementrouter/alert_rule_bulk_update_test.go b/internal/managementrouter/alert_rule_bulk_update_test.go new file mode 100644 index 000000000..b5f675e88 --- /dev/null +++ b/internal/managementrouter/alert_rule_bulk_update_test.go @@ -0,0 +1,480 @@ +package managementrouter_test + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/apimachinery/pkg/util/intstr" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("BulkUpdateAlertRules", func() { + var ( + router http.Handler + mockK8sRules *testutils.MockPrometheusRuleInterface + mockK8s *testutils.MockClient + mockRelabeledRules *testutils.MockRelabeledRulesInterface + ) + + var ( + userRule1 = monitoringv1.Rule{ + Alert: "user-alert-1", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + }, + } + userRule1Id = alertrule.GetAlertingRuleId(&userRule1) + + userRule2 = monitoringv1.Rule{ + Alert: "user-alert-2", + Expr: intstr.FromString("cpu > 80"), + Labels: map[string]string{ + "severity": 
"info", + }, + } + userRule2Id = alertrule.GetAlertingRuleId(&userRule2) + platformRule = monitoringv1.Rule{Alert: "platform-alert", Expr: intstr.FromString("memory > 90"), Labels: map[string]string{"severity": "critical"}} + platformRuleId = alertrule.GetAlertingRuleId(&platformRule) + ) + + BeforeEach(func() { + mockK8sRules = &testutils.MockPrometheusRuleInterface{} + + userPR := monitoringv1.PrometheusRule{} + userPR.Name = "user-pr" + userPR.Namespace = "default" + userPR.Spec.Groups = []monitoringv1.RuleGroup{ + { + Name: "g1", + Rules: []monitoringv1.Rule{ + { + Alert: userRule1.Alert, + Expr: userRule1.Expr, + Labels: map[string]string{"severity": "warning", k8s.AlertRuleLabelId: userRule1Id}, + }, + { + Alert: userRule2.Alert, + Expr: userRule2.Expr, + Labels: map[string]string{"severity": "info", k8s.AlertRuleLabelId: userRule2Id}, + }, + }, + }, + } + + platformPR := monitoringv1.PrometheusRule{} + platformPR.Name = "platform-pr" + platformPR.Namespace = "platform-namespace-1" + platformPR.Spec.Groups = []monitoringv1.RuleGroup{ + { + Name: "pg1", + Rules: []monitoringv1.Rule{ + { + Alert: "platform-alert", + Expr: intstr.FromString("memory > 90"), + Labels: map[string]string{"severity": "critical"}, + }, + }, + }, + } + + mockK8sRules.SetPrometheusRules(map[string]*monitoringv1.PrometheusRule{ + "default/user-pr": &userPR, + "platform-namespace-1/platform-pr": &platformPR, + }) + + mockNamespace := &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "platform-namespace-1" || name == "platform-namespace-2" + }, + } + + mockRelabeledRules = &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRule1Id { + return monitoringv1.Rule{ + Alert: userRule1.Alert, + Expr: userRule1.Expr, + Labels: map[string]string{ + "severity": "warning", + k8s.AlertRuleLabelId: userRule1Id, + k8s.PrometheusRuleLabelNamespace: "default", + 
k8s.PrometheusRuleLabelName: "user-pr", + }, + }, true + } + if id == userRule2Id { + return monitoringv1.Rule{ + Alert: userRule2.Alert, + Expr: userRule2.Expr, + Labels: map[string]string{ + "severity": "info", + k8s.AlertRuleLabelId: userRule2Id, + k8s.PrometheusRuleLabelNamespace: "default", + k8s.PrometheusRuleLabelName: "user-pr", + }, + }, true + } + if id == platformRuleId { + return monitoringv1.Rule{ + Alert: "platform-alert", + Expr: intstr.FromString("memory > 90"), + Labels: map[string]string{ + "severity": "critical", + k8s.AlertRuleLabelId: platformRuleId, + k8s.PrometheusRuleLabelNamespace: "platform-namespace-1", + k8s.PrometheusRuleLabelName: "platform-pr", + }, + }, true + } + return monitoringv1.Rule{}, false + }, + } + + mockK8s = &testutils.MockClient{ + PrometheusRulesFunc: func() k8s.PrometheusRuleInterface { + return mockK8sRules + }, + NamespaceFunc: func() k8s.NamespaceInterface { + return mockNamespace + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return mockRelabeledRules + }, + } + + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + }) + + Context("when updating multiple user-defined rules", func() { + It("should successfully update all rules and return updated IDs", func() { + expectedNewUserRule1Id := alertrule.GetAlertingRuleId(&monitoringv1.Rule{ + Alert: userRule1.Alert, + Expr: userRule1.Expr, + Labels: map[string]string{ + "severity": "warning", + "component": "api", + "team": "backend", + }, + }) + expectedNewUserRule2Id := alertrule.GetAlertingRuleId(&monitoringv1.Rule{ + Alert: userRule2.Alert, + Expr: userRule2.Expr, + Labels: map[string]string{ + "severity": "info", + "component": "api", + "team": "backend", + }, + }) + body := map[string]interface{}{ + "ruleIds": []string{userRule1Id, userRule2Id}, + "labels": map[string]string{ + "component": "api", + "team": "backend", + }, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, 
"/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.BulkUpdateAlertRulesResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Rules).To(HaveLen(2)) + + Expect(resp.Rules[0].Id).To(Equal(expectedNewUserRule1Id)) + Expect(resp.Rules[0].StatusCode).To(Equal(http.StatusNoContent)) + Expect(resp.Rules[1].Id).To(Equal(expectedNewUserRule2Id)) + Expect(resp.Rules[1].StatusCode).To(Equal(http.StatusNoContent)) + }) + + It("should drop labels with empty string value", func() { + expectedNewUserRule1Id := alertrule.GetAlertingRuleId(&monitoringv1.Rule{ + Alert: "user-alert-1", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "critical", + }, + }) + mockRelabeledRules.GetFunc = func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRule1Id { + return monitoringv1.Rule{ + Alert: "user-alert-1", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + "team": "backend", + k8s.AlertRuleLabelId: userRule1Id, + k8s.PrometheusRuleLabelNamespace: "default", + k8s.PrometheusRuleLabelName: "user-pr", + }, + }, true + } + return monitoringv1.Rule{}, false + } + + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + + body := map[string]interface{}{ + "ruleIds": []string{userRule1Id}, + "labels": map[string]string{ + "team": "", + "severity": "critical", + }, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.BulkUpdateAlertRulesResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Rules).To(HaveLen(1)) + + 
Expect(resp.Rules[0].Id).To(Equal(expectedNewUserRule1Id)) + Expect(resp.Rules[0].StatusCode).To(Equal(http.StatusNoContent)) + }) + }) + + Context("when updating mixed platform and user-defined rules", func() { + It("should handle both types correctly - both keep their IDs", func() { + expectedNewUserRule1Id := alertrule.GetAlertingRuleId(&monitoringv1.Rule{ + Alert: userRule1.Alert, + Expr: userRule1.Expr, + Labels: map[string]string{ + "severity": "warning", + "component": "api", + }, + }) + mockARC := &testutils.MockAlertRelabelConfigInterface{} + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return mockARC + } + + body := map[string]interface{}{ + "ruleIds": []string{userRule1Id, platformRuleId}, + "labels": map[string]string{ + "component": "api", + }, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.BulkUpdateAlertRulesResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Rules).To(HaveLen(2)) + + Expect(resp.Rules[0].Id).To(Equal(expectedNewUserRule1Id)) + Expect(resp.Rules[0].StatusCode).To(Equal(http.StatusNoContent)) + + Expect(resp.Rules[1].Id).To(Equal(platformRuleId)) + Expect(resp.Rules[1].StatusCode).To(Equal(http.StatusNoContent)) + }) + }) + + Context("when request body is invalid", func() { + It("should return 400", func() { + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules", bytes.NewBufferString("{")) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("invalid request body")) + }) + }) + + Context("when ruleIds is empty", func() { + It("should return 400", func() { + body := map[string]interface{}{ + "ruleIds": []string{}, + "labels": 
map[string]string{"component": "api"}, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("ruleIds is required")) + }) + }) + + Context("when labels, AlertingRuleEnabled, and classification are missing", func() { + It("should return 400", func() { + body := map[string]interface{}{ + "ruleIds": []string{userRule1Id}, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("AlertingRuleEnabled (toggle drop/restore) or labels (set/unset) or classification is required")) + }) + }) + + Context("enabled toggle in bulk for platform/user/missing", func() { + It("should drop platform, mark user as not allowed, and missing as not found", func() { + mockARC := &testutils.MockAlertRelabelConfigInterface{} + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { return mockARC } + + body := map[string]interface{}{ + "ruleIds": []string{platformRuleId, userRule1Id, "rid_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}, + "AlertingRuleEnabled": false, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.BulkUpdateAlertRulesResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Rules).To(HaveLen(3)) + + // Order corresponds to input order + Expect(resp.Rules[0].Id).To(Equal(platformRuleId)) + Expect(resp.Rules[0].StatusCode).To(Equal(http.StatusNoContent)) + 
Expect(resp.Rules[1].Id).To(Equal(userRule1Id)) + // user-defined alerts cannot be dropped/restored via enabled + Expect(resp.Rules[1].StatusCode).To(Equal(http.StatusMethodNotAllowed)) + Expect(resp.Rules[2].Id).To(Equal("rid_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")) + Expect(resp.Rules[2].StatusCode).To(Equal(http.StatusNotFound)) + }) + }) + + Context("when some rules are not found", func() { + It("should return mixed results", func() { + expectedNewUserRule1Id := alertrule.GetAlertingRuleId(&monitoringv1.Rule{ + Alert: userRule1.Alert, + Expr: userRule1.Expr, + Labels: map[string]string{ + "severity": "warning", + "component": "api", + }, + }) + mockRelabeledRules.GetFunc = func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRule1Id { + return monitoringv1.Rule{ + Alert: "user-alert-1", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + k8s.AlertRuleLabelId: userRule1Id, + k8s.PrometheusRuleLabelNamespace: "default", + k8s.PrometheusRuleLabelName: "user-pr", + }, + }, true + } + return monitoringv1.Rule{}, false + } + + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + + body := map[string]interface{}{ + "ruleIds": []string{userRule1Id, "rid_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}, + "labels": map[string]string{"component": "api"}, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.BulkUpdateAlertRulesResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Rules).To(HaveLen(2)) + + Expect(resp.Rules[0].Id).To(Equal(expectedNewUserRule1Id)) + Expect(resp.Rules[0].StatusCode).To(Equal(http.StatusNoContent)) + Expect(resp.Rules[1].Id).To(Equal("rid_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")) + 
Expect(resp.Rules[1].StatusCode).To(Equal(http.StatusNotFound)) + }) + }) + + Context("when ruleId is invalid", func() { + It("should return 400 for invalid ruleId", func() { + expectedNewUserRule1Id := alertrule.GetAlertingRuleId(&monitoringv1.Rule{ + Alert: userRule1.Alert, + Expr: userRule1.Expr, + Labels: map[string]string{ + "severity": "warning", + "component": "api", + }, + }) + body := map[string]interface{}{ + "ruleIds": []string{userRule1Id, ""}, + "labels": map[string]string{"component": "api"}, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.BulkUpdateAlertRulesResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Rules).To(HaveLen(2)) + + Expect(resp.Rules[0].Id).To(Equal(expectedNewUserRule1Id)) + Expect(resp.Rules[0].StatusCode).To(Equal(http.StatusNoContent)) + Expect(resp.Rules[1].Id).To(Equal("")) + Expect(resp.Rules[1].StatusCode).To(Equal(http.StatusBadRequest)) + Expect(resp.Rules[1].Message).To(ContainSubstring("missing ruleId")) + }) + }) + + Context("when bulk updating classification only", func() { + It("should update classification overrides and return 204 per rule", func() { + body := map[string]any{ + "ruleIds": []string{userRule1Id, userRule2Id}, + "classification": map[string]any{ + "openshift_io_alert_rule_component": "team-x", + "openshift_io_alert_rule_layer": "namespace", + }, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.BulkUpdateAlertRulesResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Rules).To(HaveLen(2)) + 
Expect(resp.Rules[0].StatusCode).To(Equal(http.StatusNoContent)) + Expect(resp.Rules[1].StatusCode).To(Equal(http.StatusNoContent)) + }) + }) +}) diff --git a/internal/managementrouter/alert_rule_classification_patch.go b/internal/managementrouter/alert_rule_classification_patch.go new file mode 100644 index 000000000..812c73aab --- /dev/null +++ b/internal/managementrouter/alert_rule_classification_patch.go @@ -0,0 +1,66 @@ +package managementrouter + +import "encoding/json" + +// AlertRuleClassificationPatch represents a partial update ("patch") payload for +// alert rule classification labels. +// +// This type supports a three-state contract per field: +// - omitted: leave unchanged +// - null: clear the override +// - string: set the override +// +// Note: Go's encoding/json cannot represent "explicit null" vs "omitted" using **string +// (both decode to nil), so we custom-unmarshal and track key presence with *Set flags. +type AlertRuleClassificationPatch struct { + Component *string `json:"openshift_io_alert_rule_component,omitempty"` + ComponentSet bool `json:"-"` + Layer *string `json:"openshift_io_alert_rule_layer,omitempty"` + LayerSet bool `json:"-"` + ComponentFrom *string `json:"openshift_io_alert_rule_component_from,omitempty"` + ComponentFromSet bool `json:"-"` + LayerFrom *string `json:"openshift_io_alert_rule_layer_from,omitempty"` + LayerFromSet bool `json:"-"` +} + +func (p *AlertRuleClassificationPatch) UnmarshalJSON(b []byte) error { + var m map[string]json.RawMessage + if err := json.Unmarshal(b, &m); err != nil { + return err + } + + decodeNullableString := func(key string) (set bool, v *string, err error) { + raw, ok := m[key] + if !ok { + return false, nil, nil + } + set = true + if len(raw) == 0 || string(raw) == "null" { + return true, nil, nil + } + var s string + if err := json.Unmarshal(raw, &s); err != nil { + return true, nil, err + } + return true, &s, nil + } + + var err error + p.ComponentSet, p.Component, err = 
decodeNullableString("openshift_io_alert_rule_component") + if err != nil { + return err + } + p.LayerSet, p.Layer, err = decodeNullableString("openshift_io_alert_rule_layer") + if err != nil { + return err + } + p.ComponentFromSet, p.ComponentFrom, err = decodeNullableString("openshift_io_alert_rule_component_from") + if err != nil { + return err + } + p.LayerFromSet, p.LayerFrom, err = decodeNullableString("openshift_io_alert_rule_layer_from") + if err != nil { + return err + } + return nil +} diff --git a/internal/managementrouter/alert_rule_classification_patch_test.go b/internal/managementrouter/alert_rule_classification_patch_test.go new file mode 100644 index 000000000..34890b6fa --- /dev/null +++ b/internal/managementrouter/alert_rule_classification_patch_test.go @@ -0,0 +1,40 @@ +package managementrouter_test + +import ( + "encoding/json" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" +) + +var _ = Describe("AlertRuleClassificationPatch", func() { + Context("when field is omitted", func() { + It("does not mark it as set", func() { + var p managementrouter.AlertRuleClassificationPatch + Expect(json.Unmarshal([]byte(`{}`), &p)).To(Succeed()) + Expect(p.ComponentSet).To(BeFalse()) + Expect(p.Component).To(BeNil()) + }) + }) + + Context("when field is explicitly null", func() { + It("marks it as set and clears the value", func() { + var p managementrouter.AlertRuleClassificationPatch + Expect(json.Unmarshal([]byte(`{"openshift_io_alert_rule_component":null}`), &p)).To(Succeed()) + Expect(p.ComponentSet).To(BeTrue()) + Expect(p.Component).To(BeNil()) + }) + }) + + Context("when field is a string", func() { + It("marks it as set and provides the value", func() { + var p managementrouter.AlertRuleClassificationPatch + Expect(json.Unmarshal([]byte(`{"openshift_io_alert_rule_component":"team-x"}`), &p)).To(Succeed()) + Expect(p.ComponentSet).To(BeTrue()) + 
Expect(p.Component).NotTo(BeNil()) + Expect(*p.Component).To(Equal("team-x")) + }) + }) +}) diff --git a/internal/managementrouter/alert_rule_update.go b/internal/managementrouter/alert_rule_update.go new file mode 100644 index 000000000..979e973ec --- /dev/null +++ b/internal/managementrouter/alert_rule_update.go @@ -0,0 +1,175 @@ +package managementrouter + +import ( + "encoding/json" + "errors" + "net/http" + "strings" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + "github.com/openshift/monitoring-plugin/pkg/management" +) + +type UpdateAlertRuleRequest struct { + AlertingRule *monitoringv1.Rule `json:"alertingRule,omitempty"` + AlertingRuleEnabled *bool `json:"AlertingRuleEnabled,omitempty"` + Classification *AlertRuleClassificationPatch `json:"classification,omitempty"` +} + +type UpdateAlertRuleResponse struct { + Id string `json:"id"` + StatusCode int `json:"status_code"` + Message string `json:"message,omitempty"` +} + +func (hr *httpRouter) UpdateAlertRule(w http.ResponseWriter, req *http.Request) { + ruleId, err := getParam(req, "ruleId") + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + + var payload UpdateAlertRuleRequest + if err := json.NewDecoder(req.Body).Decode(&payload); err != nil { + writeError(w, http.StatusBadRequest, "invalid request body") + return + } + + if payload.AlertingRule == nil && payload.AlertingRuleEnabled == nil && payload.Classification == nil { + writeError(w, http.StatusBadRequest, "either alertingRule, AlertingRuleEnabled, or classification is required") + return + } + + // Handle drop/restore for platform alerts + if payload.AlertingRuleEnabled != nil { + var derr error + if !*payload.AlertingRuleEnabled { + derr = hr.managementClient.DropPlatformAlertRule(req.Context(), ruleId) + } else { + derr = hr.managementClient.RestorePlatformAlertRule(req.Context(), ruleId) + } + if derr != nil { + status, message := parseError(derr) + 
w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(UpdateAlertRuleResponse{ + Id: ruleId, + StatusCode: status, + Message: message, + }) + return + } + if payload.AlertingRule == nil && payload.Classification == nil { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(UpdateAlertRuleResponse{ + Id: ruleId, + StatusCode: http.StatusNoContent, + }) + return + } + } + + if payload.Classification != nil { + update := management.UpdateRuleClassificationRequest{RuleId: ruleId} + if payload.Classification.ComponentSet { + update.Component = payload.Classification.Component + update.ComponentSet = true + } + if payload.Classification.LayerSet { + update.Layer = payload.Classification.Layer + update.LayerSet = true + } + if payload.Classification.ComponentFromSet { + update.ComponentFrom = payload.Classification.ComponentFrom + update.ComponentFromSet = true + } + if payload.Classification.LayerFromSet { + update.LayerFrom = payload.Classification.LayerFrom + update.LayerFromSet = true + } + if err := hr.managementClient.UpdateAlertRuleClassification(req.Context(), update); err != nil { + status, message := parseError(err) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(UpdateAlertRuleResponse{ + Id: ruleId, + StatusCode: status, + Message: message, + }) + return + } + + // If this is a classification-only patch, return success now. 
+ if payload.AlertingRule == nil { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(UpdateAlertRuleResponse{ + Id: ruleId, + StatusCode: http.StatusNoContent, + }) + return + } + } + + alertRule := *payload.AlertingRule + + err = hr.managementClient.UpdatePlatformAlertRule(req.Context(), ruleId, alertRule) + if err != nil { + var ve *management.ValidationError + var nf *management.NotFoundError + if errors.As(err, &ve) || errors.As(err, &nf) { + status, message := parseError(err) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(UpdateAlertRuleResponse{ + Id: ruleId, + StatusCode: status, + Message: message, + }) + return + } + + var na *management.NotAllowedError + if errors.As(err, &na) && strings.Contains(na.Error(), "cannot update non-platform alert rule") { + // Not a platform rule, try user-defined update + newRuleId, err := hr.managementClient.UpdateUserDefinedAlertRule(req.Context(), ruleId, alertRule) + if err != nil { + status, message := parseError(err) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(UpdateAlertRuleResponse{ + Id: ruleId, + StatusCode: status, + Message: message, + }) + return + } + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(UpdateAlertRuleResponse{ + Id: newRuleId, + StatusCode: http.StatusNoContent, + }) + return + } + + status, message := parseError(err) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(UpdateAlertRuleResponse{ + Id: ruleId, + StatusCode: status, + Message: message, + }) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(UpdateAlertRuleResponse{ + Id: ruleId, + StatusCode: http.StatusNoContent, + }) +} diff 
--git a/internal/managementrouter/alert_rule_update_test.go b/internal/managementrouter/alert_rule_update_test.go new file mode 100644 index 000000000..e6d208e4b --- /dev/null +++ b/internal/managementrouter/alert_rule_update_test.go @@ -0,0 +1,367 @@ +package managementrouter_test + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/apimachinery/pkg/util/intstr" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("UpdateAlertRule", func() { + var ( + router http.Handler + mockK8sRules *testutils.MockPrometheusRuleInterface + mockK8s *testutils.MockClient + mockRelabeledRules *testutils.MockRelabeledRulesInterface + ) + + var ( + originalUserRule = monitoringv1.Rule{ + Alert: "user-alert", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + }, + } + userRuleId = alertrule.GetAlertingRuleId(&originalUserRule) + + platformRule = monitoringv1.Rule{Alert: "platform-alert", Expr: intstr.FromString("cpu > 80"), Labels: map[string]string{"severity": "critical"}} + platformRuleId = alertrule.GetAlertingRuleId(&platformRule) + ) + + BeforeEach(func() { + mockK8sRules = &testutils.MockPrometheusRuleInterface{} + + userPR := monitoringv1.PrometheusRule{} + userPR.Name = "user-pr" + userPR.Namespace = "default" + userPR.Spec.Groups = []monitoringv1.RuleGroup{ + { + Name: "g1", + Rules: []monitoringv1.Rule{ + { + Alert: originalUserRule.Alert, + Expr: originalUserRule.Expr, + Labels: map[string]string{"severity": "warning", k8s.AlertRuleLabelId: userRuleId}, 
+ }, + }, + }, + } + + platformPR := monitoringv1.PrometheusRule{} + platformPR.Name = "platform-pr" + platformPR.Namespace = "platform-namespace-1" + platformPR.Spec.Groups = []monitoringv1.RuleGroup{ + { + Name: "pg1", + Rules: []monitoringv1.Rule{ + { + Alert: "platform-alert", + Expr: intstr.FromString("cpu > 80"), + Labels: map[string]string{"severity": "critical"}, + }, + }, + }, + } + + mockK8sRules.SetPrometheusRules(map[string]*monitoringv1.PrometheusRule{ + "default/user-pr": &userPR, + "platform-namespace-1/platform-pr": &platformPR, + }) + + mockNamespace := &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "platform-namespace-1" || name == "platform-namespace-2" + }, + } + + mockRelabeledRules = &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRuleId { + return monitoringv1.Rule{ + Alert: "user-alert", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + k8s.AlertRuleLabelId: userRuleId, + k8s.PrometheusRuleLabelNamespace: "default", + k8s.PrometheusRuleLabelName: "user-pr", + }, + }, true + } + if id == platformRuleId { + return monitoringv1.Rule{ + Alert: "platform-alert", + Expr: intstr.FromString("cpu > 80"), + Labels: map[string]string{ + "severity": "critical", + k8s.AlertRuleLabelId: platformRuleId, + k8s.PrometheusRuleLabelNamespace: "platform-namespace-1", + k8s.PrometheusRuleLabelName: "platform-pr", + }, + }, true + } + return monitoringv1.Rule{}, false + }, + } + + mockK8s = &testutils.MockClient{ + PrometheusRulesFunc: func() k8s.PrometheusRuleInterface { + return mockK8sRules + }, + NamespaceFunc: func() k8s.NamespaceInterface { + return mockNamespace + }, + RelabeledRulesFunc: func() k8s.RelabeledRulesInterface { + return mockRelabeledRules + }, + } + + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + }) + + 
Context("when updating a user-defined alert rule", func() { + It("should successfully update the rule and return new ID", func() { + expectedNewId := alertrule.GetAlertingRuleId(&monitoringv1.Rule{ + Alert: "user-alert", + Expr: intstr.FromString("up == 1"), + Labels: map[string]string{ + "severity": "critical", + "team": "sre", + }, + }) + body := map[string]interface{}{ + "alertingRule": map[string]interface{}{ + "alert": "user-alert", + "expr": "up == 1", + "labels": map[string]string{ + "severity": "critical", + "team": "sre", + }, + }, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/"+userRuleId, bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.UpdateAlertRuleResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + + Expect(resp.Id).To(Equal(expectedNewId)) + Expect(resp.StatusCode).To(Equal(http.StatusNoContent)) + Expect(resp.Message).To(BeEmpty()) + }) + + It("should replace all labels without merging", func() { + expectedNewId := alertrule.GetAlertingRuleId(&monitoringv1.Rule{ + Alert: "user-alert", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "team": "sre", + }, + }) + body := map[string]interface{}{ + "alertingRule": map[string]interface{}{ + "alert": "user-alert", + "expr": "up == 0", + "labels": map[string]string{ + "team": "sre", + }, + }, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/"+userRuleId, bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.UpdateAlertRuleResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + + Expect(resp.Id).To(Equal(expectedNewId)) + Expect(resp.StatusCode).To(Equal(http.StatusNoContent)) + }) + }) + + Context("when updating rule classification via 
PATCH /rules/{ruleId}", func() { + It("should update classification overrides with nested classification payload", func() { + body := map[string]any{ + "classification": map[string]any{ + "openshift_io_alert_rule_component": "team-x", + "openshift_io_alert_rule_layer": "namespace", + }, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/"+userRuleId, bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.UpdateAlertRuleResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Id).To(Equal(userRuleId)) + Expect(resp.StatusCode).To(Equal(http.StatusNoContent)) + }) + }) + + Context("when updating a platform alert rule", func() { + It("should successfully update labels via AlertRelabelConfig", func() { + mockARC := &testutils.MockAlertRelabelConfigInterface{} + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return mockARC + } + + body := map[string]interface{}{ + "alertingRule": map[string]interface{}{ + "alert": "platform-alert", + "expr": "cpu > 80", + "labels": map[string]string{ + "severity": "warning", + }, + }, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/"+platformRuleId, bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.UpdateAlertRuleResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Id).To(Equal(platformRuleId)) + Expect(resp.StatusCode).To(Equal(http.StatusNoContent)) + Expect(resp.Message).To(BeEmpty()) + }) + }) + + Context("when ruleId is missing", func() { + It("should return 400", func() { + body := map[string]interface{}{ + "alertingRule": map[string]interface{}{ + "alert": "test-alert", + }, + } + buf, _ := json.Marshal(body) + req := 
httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/%20", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("missing ruleId")) + }) + }) + + Context("when request body is invalid", func() { + It("should return 400", func() { + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/user-alert", bytes.NewBufferString("{")) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("invalid request body")) + }) + }) + + Context("enabled toggle for platform alerts", func() { + It("should drop (AlertingRuleEnabled=false) and return 204 envelope", func() { + mockARC := &testutils.MockAlertRelabelConfigInterface{} + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { return mockARC } + + body := map[string]interface{}{"AlertingRuleEnabled": false} + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/"+platformRuleId, bytes.NewReader(buf)) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.UpdateAlertRuleResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Id).To(Equal(platformRuleId)) + Expect(resp.StatusCode).To(Equal(http.StatusNoContent)) + Expect(resp.Message).To(BeEmpty()) + }) + + It("should restore (AlertingRuleEnabled=true) and return 204 envelope", func() { + mockARC := &testutils.MockAlertRelabelConfigInterface{} + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { return mockARC } + + body := map[string]interface{}{"AlertingRuleEnabled": true} + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/"+platformRuleId, bytes.NewReader(buf)) + w := httptest.NewRecorder() 
+ router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.UpdateAlertRuleResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Id).To(Equal(platformRuleId)) + Expect(resp.StatusCode).To(Equal(http.StatusNoContent)) + Expect(resp.Message).To(BeEmpty()) + }) + }) + + Context("when alertingRule, AlertingRuleEnabled, and classification are missing", func() { + It("should return 400", func() { + body := map[string]interface{}{} + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/"+userRuleId, bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("either alertingRule, AlertingRuleEnabled, or classification is required")) + }) + }) + + Context("when rule is not found", func() { + It("should return JSON response with 404 status code", func() { + mockRelabeledRules.GetFunc = func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + } + + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + + body := map[string]interface{}{ + "alertingRule": map[string]interface{}{ + "alert": "missing-alert", + }, + } + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodPatch, "/api/v1/alerting/rules/rid_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", bytes.NewReader(buf)) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp managementrouter.UpdateAlertRuleResponse + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Id).To(Equal("rid_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")) + Expect(resp.StatusCode).To(Equal(http.StatusNotFound)) + Expect(resp.Message).To(ContainSubstring("not found")) + }) + }) +}) diff --git a/internal/managementrouter/alerts_get.go 
b/internal/managementrouter/alerts_get.go new file mode 100644 index 000000000..6f6d94dac --- /dev/null +++ b/internal/managementrouter/alerts_get.go @@ -0,0 +1,122 @@ +package managementrouter + +import ( + "context" + "encoding/json" + "log" + "net/http" + "strings" + + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +type GetAlertsResponse struct { + Data GetAlertsResponseData `json:"data"` + Warnings []string `json:"warnings,omitempty"` +} + +type GetAlertsResponseData struct { + Alerts []k8s.PrometheusAlert `json:"alerts"` +} + +func (hr *httpRouter) GetAlerts(w http.ResponseWriter, req *http.Request) { + state, labels, err := parseStateAndLabels(req.URL.Query()) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + ctx := k8s.WithBearerToken(req.Context(), bearerTokenFromRequest(req)) + + alerts, err := hr.managementClient.GetAlerts(ctx, k8s.GetAlertsRequest{ + Labels: labels, + State: state, + }) + if err != nil { + handleError(w, err) + return + } + + w.Header().Set("Content-Type", "application/json") + w.Header().Set("Cache-Control", "no-store") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(GetAlertsResponse{ + Data: GetAlertsResponseData{ + Alerts: alerts, + }, + Warnings: hr.alertWarnings(ctx), + }); err != nil { + log.Printf("failed to encode alerts response: %v", err) + } +} + +func bearerTokenFromRequest(req *http.Request) string { + auth := strings.TrimSpace(req.Header.Get("Authorization")) + if auth == "" { + return "" + } + const prefix = "Bearer " + if !strings.HasPrefix(auth, prefix) { + return "" + } + return strings.TrimSpace(strings.TrimPrefix(auth, prefix)) +} + +func (hr *httpRouter) alertWarnings(ctx context.Context) []string { + health, ok := hr.alertingHealth(ctx) + if !ok { + return nil + } + + warnings := []string{} + if health.UserWorkloadEnabled && health.UserWorkload != nil { + warnings = append(warnings, buildRouteWarnings(health.UserWorkload.Prometheus, 
k8s.UserWorkloadRouteName, "user workload Prometheus")...) + warnings = append(warnings, buildRouteWarnings(health.UserWorkload.Alertmanager, k8s.UserWorkloadAlertmanagerRouteName, "user workload Alertmanager")...) + } + + return warnings +} + +func (hr *httpRouter) rulesWarnings(ctx context.Context) []string { + health, ok := hr.alertingHealth(ctx) + if !ok { + return nil + } + + if health.UserWorkloadEnabled && health.UserWorkload != nil { + return buildRouteWarnings(health.UserWorkload.Prometheus, k8s.UserWorkloadRouteName, "user workload Prometheus") + } + + return nil +} + +func (hr *httpRouter) alertingHealth(ctx context.Context) (k8s.AlertingHealth, bool) { + if hr.managementClient == nil { + return k8s.AlertingHealth{}, false + } + + health, err := hr.managementClient.GetAlertingHealth(ctx) + if err != nil { + log.Printf("alerting health unavailable: %v", err) + return k8s.AlertingHealth{}, false + } + + return health, true +} + +func buildRouteWarnings(route k8s.AlertingRouteHealth, expectedName string, friendlyName string) []string { + if route.Name != "" && route.Name != expectedName { + return nil + } + if route.FallbackReachable { + return nil + } + + switch route.Status { + case k8s.RouteNotFound: + return []string{friendlyName + " route is missing"} + case k8s.RouteUnreachable: + return []string{friendlyName + " route is unreachable"} + default: + return nil + } +} diff --git a/internal/managementrouter/alerts_get_test.go b/internal/managementrouter/alerts_get_test.go new file mode 100644 index 000000000..f295cc4b4 --- /dev/null +++ b/internal/managementrouter/alerts_get_test.go @@ -0,0 +1,409 @@ +package managementrouter_test + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/prometheus/model/relabel" + "k8s.io/apimachinery/pkg/util/intstr" +) + +var _ = Describe("GetAlerts", func() { + var ( + mockK8s *testutils.MockClient + mockPrometheusAlerts *testutils.MockPrometheusAlertsInterface + mockManagement management.Client + router http.Handler + ) + + BeforeEach(func() { + By("setting up mock clients") + mockPrometheusAlerts = &testutils.MockPrometheusAlertsInterface{} + mockK8s = &testutils.MockClient{ + PrometheusAlertsFunc: func() k8s.PrometheusAlertsInterface { + return mockPrometheusAlerts + }, + } + mockManagement = management.New(context.Background(), mockK8s) + router = managementrouter.New(mockManagement) + }) + + Context("flat label parsing", func() { + It("parses flat query params into Labels map and state", func() { + var captured k8s.GetAlertsRequest + mockPrometheusAlerts.GetAlertsFunc = func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + captured = req + return []k8s.PrometheusAlert{}, nil + } + + By("making the request") + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts?namespace=ns1&severity=critical&state=firing&team=sre", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + By("verifying the response") + Expect(w.Code).To(Equal(http.StatusOK)) + Expect(captured.State).To(Equal("firing")) + Expect(captured.Labels["namespace"]).To(Equal("ns1")) + Expect(captured.Labels["severity"]).To(Equal("critical")) + 
Expect(captured.Labels["team"]).To(Equal("sre")) + }) + }) + + Context("when getting all alerts without filters", func() { + It("should return all active alerts", func() { + By("setting up test alerts") + testAlerts := []k8s.PrometheusAlert{ + { + Labels: map[string]string{ + managementlabels.AlertNameLabel: "HighCPUUsage", + "severity": "warning", + "namespace": "default", + }, + Annotations: map[string]string{ + "description": "CPU usage is high", + }, + State: "firing", + ActiveAt: time.Now(), + }, + { + Labels: map[string]string{ + managementlabels.AlertNameLabel: "LowMemory", + "severity": "critical", + "namespace": "monitoring", + }, + Annotations: map[string]string{ + "description": "Memory is running low", + }, + State: "firing", + ActiveAt: time.Now(), + }, + } + mockPrometheusAlerts.SetActiveAlerts(testAlerts) + + By("making the request") + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + By("verifying the response") + Expect(w.Code).To(Equal(http.StatusOK)) + Expect(w.Header().Get("Content-Type")).To(Equal("application/json")) + + var response managementrouter.GetAlertsResponse + err := json.NewDecoder(w.Body).Decode(&response) + Expect(err).NotTo(HaveOccurred()) + Expect(response.Data.Alerts).To(HaveLen(2)) + Expect(response.Data.Alerts[0].Labels[managementlabels.AlertNameLabel]).To(Equal("HighCPUUsage")) + Expect(response.Data.Alerts[1].Labels[managementlabels.AlertNameLabel]).To(Equal("LowMemory")) + }) + + It("returns warnings when user workload routes are missing", func() { + mockK8s.AlertingHealthFunc = func(ctx context.Context) (k8s.AlertingHealth, error) { + return k8s.AlertingHealth{ + UserWorkloadEnabled: true, + UserWorkload: &k8s.AlertingStackHealth{ + Prometheus: k8s.AlertingRouteHealth{Status: k8s.RouteNotFound}, + Alertmanager: k8s.AlertingRouteHealth{Status: k8s.RouteNotFound}, + }, + }, nil + } + + req := httptest.NewRequest(http.MethodGet, 
"/api/v1/alerting/alerts", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + var response managementrouter.GetAlertsResponse + err := json.NewDecoder(w.Body).Decode(&response) + Expect(err).NotTo(HaveOccurred()) + Expect(response.Warnings).To(ContainElements( + "user workload Prometheus route is missing", + "user workload Alertmanager route is missing", + )) + }) + + It("suppresses warnings when fallbacks are healthy", func() { + mockK8s.AlertingHealthFunc = func(ctx context.Context) (k8s.AlertingHealth, error) { + return k8s.AlertingHealth{ + UserWorkloadEnabled: true, + UserWorkload: &k8s.AlertingStackHealth{ + Prometheus: k8s.AlertingRouteHealth{ + Status: k8s.RouteUnreachable, + FallbackReachable: true, + }, + Alertmanager: k8s.AlertingRouteHealth{ + Status: k8s.RouteUnreachable, + FallbackReachable: true, + }, + }, + }, nil + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + var response managementrouter.GetAlertsResponse + err := json.NewDecoder(w.Body).Decode(&response) + Expect(err).NotTo(HaveOccurred()) + Expect(response.Warnings).To(BeEmpty()) + }) + + It("should return empty array when no alerts exist", func() { + By("setting up empty alerts") + mockPrometheusAlerts.SetActiveAlerts([]k8s.PrometheusAlert{}) + + By("making the request") + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + By("verifying the response") + Expect(w.Code).To(Equal(http.StatusOK)) + + var response managementrouter.GetAlertsResponse + err := json.NewDecoder(w.Body).Decode(&response) + Expect(err).NotTo(HaveOccurred()) + Expect(response.Data.Alerts).To(BeEmpty()) + }) + }) + + Context("when handling errors", func() { + It("should return 500 when GetAlerts fails", func() { + By("configuring mock to return error") + mockPrometheusAlerts.GetAlertsFunc = func(ctx context.Context, req 
k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return nil, fmt.Errorf("connection error") + } + + By("making the request") + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + By("verifying error response") + Expect(w.Code).To(Equal(http.StatusInternalServerError)) + Expect(w.Body.String()).To(ContainSubstring("An unexpected error occurred")) + }) + }) + + Context("bearer token forwarding", func() { + It("forwards the Authorization bearer token to the management client via context", func() { + var capturedCtx context.Context + mockPrometheusAlerts.GetAlertsFunc = func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + capturedCtx = ctx + return []k8s.PrometheusAlert{}, nil + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + req.Header.Set("Authorization", "Bearer test-token-abc123") + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + token := k8s.BearerTokenFromContext(capturedCtx) + Expect(token).To(Equal("test-token-abc123")) + }) + + It("handles missing Authorization header gracefully", func() { + var capturedCtx context.Context + mockPrometheusAlerts.GetAlertsFunc = func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + capturedCtx = ctx + return []k8s.PrometheusAlert{}, nil + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + token := k8s.BearerTokenFromContext(capturedCtx) + Expect(token).To(BeEmpty()) + }) + }) + + Context("alert enrichment from relabeled rules cache", func() { + It("enriches alerts with alertRuleId, prometheusRule metadata, and alertingRule name", func() { + baseRule := monitoringv1.Rule{ + Alert: "HighCPU", + Expr: intstr.FromString("node_cpu > 0.9"), + Labels: 
map[string]string{ + "severity": "critical", + }, + } + ruleId := alertrule.GetAlertingRuleId(&baseRule) + + relabeledRule := monitoringv1.Rule{ + Alert: "HighCPU", + Expr: intstr.FromString("node_cpu > 0.9"), + Labels: map[string]string{ + managementlabels.AlertNameLabel: "HighCPU", + "severity": "critical", + k8s.AlertRuleLabelId: ruleId, + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "cluster-cpu-rules", + managementlabels.AlertingRuleLabelName: "my-alerting-rule", + }, + } + + mockRelabeled := &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{relabeledRule} + }, + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == ruleId { + return relabeledRule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + + mockNamespace := &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "openshift-monitoring" + }, + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { return mockRelabeled } + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { return mockNamespace } + mockManagement = management.New(context.Background(), mockK8s) + router = managementrouter.New(mockManagement) + + testAlerts := []k8s.PrometheusAlert{ + { + Labels: map[string]string{ + managementlabels.AlertNameLabel: "HighCPU", + "severity": "critical", + k8s.AlertSourceLabel: k8s.AlertSourcePlatform, + k8s.AlertBackendLabel: "alertmanager", + }, + Annotations: map[string]string{"summary": "CPU is high"}, + State: "firing", + ActiveAt: time.Now(), + }, + } + mockPrometheusAlerts.SetActiveAlerts(testAlerts) + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + + var response 
managementrouter.GetAlertsResponse + Expect(json.NewDecoder(w.Body).Decode(&response)).To(Succeed()) + Expect(response.Data.Alerts).To(HaveLen(1)) + + alert := response.Data.Alerts[0] + Expect(alert.AlertRuleId).To(Equal(ruleId)) + Expect(alert.PrometheusRuleNamespace).To(Equal("openshift-monitoring")) + Expect(alert.PrometheusRuleName).To(Equal("cluster-cpu-rules")) + Expect(alert.AlertingRuleName).To(Equal("my-alerting-rule")) + Expect(alert.AlertComponent).NotTo(BeEmpty()) + Expect(alert.AlertLayer).NotTo(BeEmpty()) + }) + + It("enriches platform alert without alertingRule when PrometheusRule is not from AlertingRule CR", func() { + baseRule := monitoringv1.Rule{ + Alert: "KubePodCrashLooping", + Expr: intstr.FromString("rate(kube_pod_restart_total[5m]) > 0"), + Labels: map[string]string{ + "severity": "warning", + }, + } + ruleId := alertrule.GetAlertingRuleId(&baseRule) + + relabeledRule := monitoringv1.Rule{ + Alert: "KubePodCrashLooping", + Expr: intstr.FromString("rate(kube_pod_restart_total[5m]) > 0"), + Labels: map[string]string{ + managementlabels.AlertNameLabel: "KubePodCrashLooping", + "severity": "warning", + k8s.AlertRuleLabelId: ruleId, + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "kube-state-metrics", + }, + } + + mockRelabeled := &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{relabeledRule} + }, + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == ruleId { + return relabeledRule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + + mockNamespace := &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "openshift-monitoring" + }, + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { return mockRelabeled } + mockK8s.NamespaceFunc = 
func() k8s.NamespaceInterface { return mockNamespace } + mockManagement = management.New(context.Background(), mockK8s) + router = managementrouter.New(mockManagement) + + testAlerts := []k8s.PrometheusAlert{ + { + Labels: map[string]string{ + managementlabels.AlertNameLabel: "KubePodCrashLooping", + "severity": "warning", + k8s.AlertSourceLabel: k8s.AlertSourcePlatform, + k8s.AlertBackendLabel: "alertmanager", + }, + State: "firing", + ActiveAt: time.Now(), + }, + } + mockPrometheusAlerts.SetActiveAlerts(testAlerts) + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/alerts", nil) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + + var response managementrouter.GetAlertsResponse + Expect(json.NewDecoder(w.Body).Decode(&response)).To(Succeed()) + Expect(response.Data.Alerts).To(HaveLen(1)) + + alert := response.Data.Alerts[0] + Expect(alert.AlertRuleId).To(Equal(ruleId)) + Expect(alert.PrometheusRuleNamespace).To(Equal("openshift-monitoring")) + Expect(alert.PrometheusRuleName).To(Equal("kube-state-metrics")) + Expect(alert.AlertingRuleName).To(BeEmpty()) + }) + }) +}) diff --git a/internal/managementrouter/create_alert_rule.go b/internal/managementrouter/create_alert_rule.go new file mode 100644 index 000000000..ad282ed17 --- /dev/null +++ b/internal/managementrouter/create_alert_rule.go @@ -0,0 +1,54 @@ +package managementrouter + +import ( + "encoding/json" + "net/http" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + "github.com/openshift/monitoring-plugin/pkg/management" +) + +type CreateAlertRuleRequest struct { + AlertingRule *monitoringv1.Rule `json:"alertingRule,omitempty"` + PrometheusRule *management.PrometheusRuleOptions `json:"prometheusRule,omitempty"` +} + +type CreateAlertRuleResponse struct { + Id string `json:"id"` +} + +func (hr *httpRouter) CreateAlertRule(w http.ResponseWriter, req *http.Request) { + var payload 
CreateAlertRuleRequest + if err := json.NewDecoder(req.Body).Decode(&payload); err != nil { + writeError(w, http.StatusBadRequest, "invalid request body") + return + } + + if payload.AlertingRule == nil { + writeError(w, http.StatusBadRequest, "alertingRule is required") + return + } + + alertRule := *payload.AlertingRule + + var ( + id string + err error + ) + + if payload.PrometheusRule != nil { + id, err = hr.managementClient.CreateUserDefinedAlertRule(req.Context(), alertRule, *payload.PrometheusRule) + } else { + id, err = hr.managementClient.CreatePlatformAlertRule(req.Context(), alertRule) + } + + if err != nil { + handleError(w, err) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusCreated) + _ = json.NewEncoder(w).Encode(CreateAlertRuleResponse{Id: id}) +} diff --git a/internal/managementrouter/create_alert_rule_test.go b/internal/managementrouter/create_alert_rule_test.go new file mode 100644 index 000000000..a79217d49 --- /dev/null +++ b/internal/managementrouter/create_alert_rule_test.go @@ -0,0 +1,212 @@ +package managementrouter_test + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("CreateAlertRule", func() { + var ( + router http.Handler + mockK8sRules *testutils.MockPrometheusRuleInterface + mockARules *testutils.MockAlertingRuleInterface + mockK8s *testutils.MockClient + ) + + BeforeEach(func() { + mockK8sRules = &testutils.MockPrometheusRuleInterface{} + mockARules = &testutils.MockAlertingRuleInterface{} + mockK8s = &testutils.MockClient{ + PrometheusRulesFunc: func() k8s.PrometheusRuleInterface { + return mockK8sRules + }, + AlertingRulesFunc: func() k8s.AlertingRuleInterface { + return mockARules + }, + NamespaceFunc: func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + }, + } + }, + } + }) + + Context("create new user defined alert rule", func() { + It("creates a new rule", func() { + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + + body := map[string]interface{}{ + "alertingRule": map[string]interface{}{ + "alert": "cpuHigh", + "expr": "vector(1)", + "for": "5m", + "labels": map[string]string{"severity": "warning"}, + "annotations": map[string]string{"summary": "cpu high"}, + }, + "prometheusRule": map[string]interface{}{ + "prometheusRuleName": "user-pr", + "prometheusRuleNamespace": "default", + }, + } + buf, _ := json.Marshal(body) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/alerting/rules", bytes.NewReader(buf)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusCreated)) + var resp struct { + 
Id string `json:"id"` + } + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Id).NotTo(BeEmpty()) + + pr, found, err := mockK8sRules.Get(context.Background(), "default", "user-pr") + Expect(err).NotTo(HaveOccurred()) + Expect(found).To(BeTrue()) + allAlerts := []string{} + for _, g := range pr.Spec.Groups { + for _, r := range g.Rules { + allAlerts = append(allAlerts, r.Alert) + } + } + Expect(allAlerts).To(ContainElement("cpuHigh")) + }) + + It("creates a new rule into a non-default group when groupName is provided", func() { + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + + body := map[string]interface{}{ + "alertingRule": map[string]interface{}{ + "alert": "cpuCustomGroup", + "expr": "vector(1)", + }, + "prometheusRule": map[string]interface{}{ + "prometheusRuleName": "user-pr", + "prometheusRuleNamespace": "default", + "groupName": "custom-group", + }, + } + buf, _ := json.Marshal(body) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/alerting/rules", bytes.NewReader(buf)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusCreated)) + + pr, found, err := mockK8sRules.Get(context.Background(), "default", "user-pr") + Expect(err).NotTo(HaveOccurred()) + Expect(found).To(BeTrue()) + + var grp *monitoringv1.RuleGroup + for i := range pr.Spec.Groups { + if pr.Spec.Groups[i].Name == "custom-group" { + grp = &pr.Spec.Groups[i] + break + } + } + Expect(grp).NotTo(BeNil()) + alerts := []string{} + for _, r := range grp.Rules { + alerts = append(alerts, r.Alert) + } + Expect(alerts).To(ContainElement("cpuCustomGroup")) + }) + }) + + Context("invalid JSON body", func() { + It("fails for invalid JSON", func() { + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/alerting/rules", 
bytes.NewBufferString("{")) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("invalid request body")) + }) + }) + + Context("missing target PrometheusRule (name/namespace)", func() { + It("fails for missing target PR", func() { + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + + body := map[string]interface{}{ + "alertingRule": map[string]interface{}{ + "alert": "x", + "expr": "vector(1)", + }, + "prometheusRule": map[string]interface{}{ + // missing PR name/namespace + }, + } + buf, _ := json.Marshal(body) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/alerting/rules", bytes.NewReader(buf)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("PrometheusRule Name and Namespace must be specified")) + }) + }) + + Context("target is platform-managed PR", func() { + It("rejects with MethodNotAllowed", func() { + mockNamespace := &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "openshift-monitoring" + }, + } + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return mockNamespace + } + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + + body := map[string]interface{}{ + "alertingRule": map[string]interface{}{ + "alert": "x", + "expr": "vector(1)", + }, + "prometheusRule": map[string]interface{}{ + "prometheusRuleName": "platform-pr", + "prometheusRuleNamespace": "openshift-monitoring", + }, + } + buf, _ := json.Marshal(body) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/alerting/rules", bytes.NewReader(buf)) + req.Header.Set("Content-Type", "application/json") + w := 
httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusMethodNotAllowed)) + Expect(w.Body.String()).To(ContainSubstring("cannot add user-defined alert rule to a platform-managed PrometheusRule")) + }) + }) +}) diff --git a/internal/managementrouter/health_get.go b/internal/managementrouter/health_get.go new file mode 100644 index 000000000..49fa9625e --- /dev/null +++ b/internal/managementrouter/health_get.go @@ -0,0 +1,33 @@ +package managementrouter + +import ( + "encoding/json" + "log" + "net/http" + + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +type GetHealthResponse struct { + Alerting *k8s.AlertingHealth `json:"alerting,omitempty"` +} + +func (hr *httpRouter) GetHealth(w http.ResponseWriter, r *http.Request) { + resp := GetHealthResponse{} + + if hr.managementClient != nil { + health, err := hr.managementClient.GetAlertingHealth(r.Context()) + if err != nil { + handleError(w, err) + return + } + resp.Alerting = &health + } + + w.Header().Set("Content-Type", "application/json") + w.Header().Set("Cache-Control", "no-store") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(resp); err != nil { + log.Printf("failed to encode health response: %v", err) + } +} diff --git a/internal/managementrouter/health_get_test.go b/internal/managementrouter/health_get_test.go new file mode 100644 index 000000000..46610d01a --- /dev/null +++ b/internal/managementrouter/health_get_test.go @@ -0,0 +1,172 @@ +package managementrouter_test + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" +) + +var _ = Describe("GetHealth", func() { + var ( + router http.Handler + mockManagement *healthStubManagementClient + ) + + BeforeEach(func() { + By("setting up the HTTP router") + mockManagement = &healthStubManagementClient{ + alertingHealth: func(ctx context.Context) (k8s.AlertingHealth, error) { + return k8s.AlertingHealth{ + Platform: &k8s.AlertingStackHealth{ + Prometheus: k8s.AlertingRouteHealth{ + Name: "prometheus-k8s", + Namespace: "openshift-monitoring", + Status: k8s.RouteReachable, + }, + Alertmanager: k8s.AlertingRouteHealth{ + Name: "alertmanager-main", + Namespace: "openshift-monitoring", + Status: k8s.RouteReachable, + }, + }, + UserWorkloadEnabled: true, + UserWorkload: &k8s.AlertingStackHealth{ + Prometheus: k8s.AlertingRouteHealth{ + Name: "prometheus-user-workload", + Namespace: "openshift-user-workload-monitoring", + Status: k8s.RouteReachable, + }, + Alertmanager: k8s.AlertingRouteHealth{ + Name: "alertmanager-user-workload", + Namespace: "openshift-user-workload-monitoring", + Status: k8s.RouteReachable, + }, + }, + }, nil + }, + } + router = managementrouter.New(mockManagement) + }) + + Context("when calling the health endpoint", func() { + It("should return 200 OK status code", func() { + By("making the request") + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/health", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + By("verifying the status code") + Expect(w.Code).To(Equal(http.StatusOK)) + }) + + It("should return correct JSON structure with alerting data", func() { + By("making the request") + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/health", nil) + w := httptest.NewRecorder() + + 
router.ServeHTTP(w, req) + + By("verifying the response body") + var response managementrouter.GetHealthResponse + err := json.NewDecoder(w.Body).Decode(&response) + Expect(err).NotTo(HaveOccurred()) + Expect(response.Alerting).NotTo(BeNil()) + }) + }) + + Context("when GetAlertingHealth returns an error", func() { + BeforeEach(func() { + mockManagement.alertingHealth = func(ctx context.Context) (k8s.AlertingHealth, error) { + return k8s.AlertingHealth{}, fmt.Errorf("connection refused") + } + }) + + It("should return 500 via handleError", func() { + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/health", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusInternalServerError)) + + var errResp map[string]string + err := json.NewDecoder(w.Body).Decode(&errResp) + Expect(err).NotTo(HaveOccurred()) + Expect(errResp["error"]).To(ContainSubstring("connection refused")) + }) + }) +}) + +type healthStubManagementClient struct { + alertingHealth func(ctx context.Context) (k8s.AlertingHealth, error) +} + +func (s *healthStubManagementClient) ListRules(ctx context.Context, prOptions management.PrometheusRuleOptions, arOptions management.AlertRuleOptions) ([]monitoringv1.Rule, error) { + return nil, nil +} + +func (s *healthStubManagementClient) GetRuleById(ctx context.Context, alertRuleId string) (monitoringv1.Rule, error) { + return monitoringv1.Rule{}, nil +} + +func (s *healthStubManagementClient) CreateUserDefinedAlertRule(ctx context.Context, alertRule monitoringv1.Rule, prOptions management.PrometheusRuleOptions) (string, error) { + return "", nil +} + +func (s *healthStubManagementClient) CreatePlatformAlertRule(ctx context.Context, alertRule monitoringv1.Rule) (string, error) { + return "", nil +} + +func (s *healthStubManagementClient) UpdateUserDefinedAlertRule(ctx context.Context, alertRuleId string, alertRule monitoringv1.Rule) (string, error) { + return "", nil +} + +func (s 
*healthStubManagementClient) DeleteUserDefinedAlertRuleById(ctx context.Context, alertRuleId string) error { + return nil +} + +func (s *healthStubManagementClient) UpdatePlatformAlertRule(ctx context.Context, alertRuleId string, alertRule monitoringv1.Rule) error { + return nil +} + +func (s *healthStubManagementClient) DropPlatformAlertRule(ctx context.Context, alertRuleId string) error { + return nil +} + +func (s *healthStubManagementClient) RestorePlatformAlertRule(ctx context.Context, alertRuleId string) error { + return nil +} + +func (s *healthStubManagementClient) GetAlerts(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return nil, nil +} + +func (s *healthStubManagementClient) GetRules(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + return []k8s.PrometheusRuleGroup{}, nil +} + +func (s *healthStubManagementClient) GetAlertingHealth(ctx context.Context) (k8s.AlertingHealth, error) { + if s.alertingHealth != nil { + return s.alertingHealth(ctx) + } + return k8s.AlertingHealth{}, nil +} + +func (s *healthStubManagementClient) UpdateAlertRuleClassification(ctx context.Context, req management.UpdateRuleClassificationRequest) error { + return nil +} + +func (s *healthStubManagementClient) BulkUpdateAlertRuleClassification(ctx context.Context, items []management.UpdateRuleClassificationRequest) []error { + return nil +} diff --git a/internal/managementrouter/managementrouter_suite_test.go b/internal/managementrouter/managementrouter_suite_test.go new file mode 100644 index 000000000..3da1553b3 --- /dev/null +++ b/internal/managementrouter/managementrouter_suite_test.go @@ -0,0 +1,13 @@ +package managementrouter_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestHTTPRouter(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "HTTPRouter Suite") +} diff --git a/internal/managementrouter/query_filters.go b/internal/managementrouter/query_filters.go new file mode 100644 index 000000000..f8e3e5e9d --- /dev/null +++ b/internal/managementrouter/query_filters.go @@ -0,0 +1,35 @@ +package managementrouter + +import ( + "fmt" + "net/url" + "strings" +) + +var validStates = map[string]bool{ + "": true, + "pending": true, + "firing": true, + "silenced": true, +} + +// parseStateAndLabels returns the optional state filter and label matches. +// Any query param other than "state" is treated as a label match. +// Returns an error if the state value is not one of the known states. +func parseStateAndLabels(q url.Values) (string, map[string]string, error) { + state := strings.ToLower(strings.TrimSpace(q.Get("state"))) + if !validStates[state] { + return "", nil, fmt.Errorf("invalid state filter %q: must be one of pending, firing, silenced", q.Get("state")) + } + + labels := make(map[string]string) + for key, vals := range q { + if key == "state" { + continue + } + if len(vals) > 0 && strings.TrimSpace(vals[0]) != "" { + labels[strings.TrimSpace(key)] = strings.TrimSpace(vals[0]) + } + } + return state, labels, nil +} diff --git a/internal/managementrouter/router.go b/internal/managementrouter/router.go new file mode 100644 index 000000000..f0def407b --- /dev/null +++ b/internal/managementrouter/router.go @@ -0,0 +1,89 @@ +package managementrouter + +import ( + "encoding/json" + "errors" + "fmt" + "log" + "net/http" + "net/url" + "strings" + + "github.com/gorilla/mux" + + "github.com/openshift/monitoring-plugin/pkg/management" +) + +type httpRouter struct { + managementClient management.Client +} + +func New(managementClient management.Client) *mux.Router { + httpRouter := &httpRouter{ + managementClient: managementClient, + } + + r := mux.NewRouter() + + r.HandleFunc("/api/v1/alerting/health", 
httpRouter.GetHealth).Methods(http.MethodGet) + r.HandleFunc("/api/v1/alerting/alerts", httpRouter.GetAlerts).Methods(http.MethodGet) + r.HandleFunc("/api/v1/alerting/rules", httpRouter.GetRules).Methods(http.MethodGet) + r.HandleFunc("/api/v1/alerting/rules", httpRouter.CreateAlertRule).Methods(http.MethodPost) + r.HandleFunc("/api/v1/alerting/rules", httpRouter.BulkDeleteUserDefinedAlertRules).Methods(http.MethodDelete) + r.HandleFunc("/api/v1/alerting/rules", httpRouter.BulkUpdateAlertRules).Methods(http.MethodPatch) + r.HandleFunc("/api/v1/alerting/rules/{ruleId}", httpRouter.DeleteUserDefinedAlertRuleById).Methods(http.MethodDelete) + r.HandleFunc("/api/v1/alerting/rules/{ruleId}", httpRouter.UpdateAlertRule).Methods(http.MethodPatch) + + return r +} + +func writeError(w http.ResponseWriter, statusCode int, message string) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + resp, _ := json.Marshal(map[string]string{"error": message}) + _, _ = w.Write(resp) +} + +func handleError(w http.ResponseWriter, err error) { + status, message := parseError(err) + writeError(w, status, message) +} + +func parseError(err error) (int, string) { + var nf *management.NotFoundError + if errors.As(err, &nf) { + return http.StatusNotFound, err.Error() + } + var ve *management.ValidationError + if errors.As(err, &ve) { + return http.StatusBadRequest, err.Error() + } + var na *management.NotAllowedError + if errors.As(err, &na) { + return http.StatusMethodNotAllowed, err.Error() + } + var ce *management.ConflictError + if errors.As(err, &ce) { + return http.StatusConflict, err.Error() + } + log.Printf("An unexpected error occurred: %v", err) + return http.StatusInternalServerError, fmt.Sprintf("An unexpected error occurred: %s", err.Error()) +} + +func parseParam(raw string, name string) (string, error) { + decoded, err := url.PathUnescape(raw) + if err != nil { + return "", fmt.Errorf("invalid %s encoding", name) + } + value := 
strings.TrimSpace(decoded) + if value == "" { + return "", fmt.Errorf("missing %s", name) + } + return value, nil +} + +func getParam(r *http.Request, name string) (string, error) { + vars := mux.Vars(r) + raw := vars[name] + return parseParam(raw, name) +} diff --git a/internal/managementrouter/rules_get.go b/internal/managementrouter/rules_get.go new file mode 100644 index 000000000..15ea7aa80 --- /dev/null +++ b/internal/managementrouter/rules_get.go @@ -0,0 +1,48 @@ +package managementrouter + +import ( + "encoding/json" + "log" + "net/http" + + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +type GetRulesResponse struct { + Data GetRulesResponseData `json:"data"` + Warnings []string `json:"warnings,omitempty"` +} + +type GetRulesResponseData struct { + Groups []k8s.PrometheusRuleGroup `json:"groups"` +} + +func (hr *httpRouter) GetRules(w http.ResponseWriter, req *http.Request) { + state, labels, err := parseStateAndLabels(req.URL.Query()) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + ctx := k8s.WithBearerToken(req.Context(), bearerTokenFromRequest(req)) + + groups, err := hr.managementClient.GetRules(ctx, k8s.GetRulesRequest{ + Labels: labels, + State: state, + }) + if err != nil { + handleError(w, err) + return + } + + w.Header().Set("Content-Type", "application/json") + w.Header().Set("Cache-Control", "no-store") + w.WriteHeader(http.StatusOK) + if err := json.NewEncoder(w).Encode(GetRulesResponse{ + Data: GetRulesResponseData{ + Groups: groups, + }, + Warnings: hr.rulesWarnings(ctx), + }); err != nil { + log.Printf("failed to encode rules response: %v", err) + } +} diff --git a/internal/managementrouter/rules_get_test.go b/internal/managementrouter/rules_get_test.go new file mode 100644 index 000000000..61ec668a9 --- /dev/null +++ b/internal/managementrouter/rules_get_test.go @@ -0,0 +1,204 @@ +package managementrouter_test + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + + 
. "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" +) + +var _ = Describe("GetRules", func() { + var ( + mockManagement *stubManagementClient + router http.Handler + ) + + BeforeEach(func() { + mockManagement = &stubManagementClient{} + router = managementrouter.New(mockManagement) + }) + + Context("flat label parsing", func() { + It("parses flat query params into Labels map and state", func() { + var captured k8s.GetRulesRequest + mockManagement.getRules = func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + captured = req + return []k8s.PrometheusRuleGroup{}, nil + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/rules?namespace=ns1&severity=critical&state=firing&team=sre", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + Expect(captured.State).To(Equal("firing")) + Expect(captured.Labels["namespace"]).To(Equal("ns1")) + Expect(captured.Labels["severity"]).To(Equal("critical")) + Expect(captured.Labels["team"]).To(Equal("sre")) + }) + }) + + Context("when getting rules without filters", func() { + It("returns groups in response", func() { + mockManagement.getRules = func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + return []k8s.PrometheusRuleGroup{ + { + Name: "group-a", + }, + }, nil + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/rules", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + Expect(w.Header().Get("Content-Type")).To(Equal("application/json")) + + var response managementrouter.GetRulesResponse + err := json.NewDecoder(w.Body).Decode(&response) 
+ Expect(err).NotTo(HaveOccurred()) + Expect(response.Data.Groups).To(HaveLen(1)) + Expect(response.Data.Groups[0].Name).To(Equal("group-a")) + }) + + It("returns warnings when user workload Prometheus route is missing", func() { + mockManagement.alertingHealth = func(ctx context.Context) (k8s.AlertingHealth, error) { + return k8s.AlertingHealth{ + UserWorkloadEnabled: true, + UserWorkload: &k8s.AlertingStackHealth{ + Prometheus: k8s.AlertingRouteHealth{Status: k8s.RouteNotFound}, + }, + }, nil + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/rules", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + var response managementrouter.GetRulesResponse + err := json.NewDecoder(w.Body).Decode(&response) + Expect(err).NotTo(HaveOccurred()) + Expect(response.Warnings).To(ContainElement("user workload Prometheus route is missing")) + }) + + It("suppresses warnings when fallback is healthy", func() { + mockManagement.alertingHealth = func(ctx context.Context) (k8s.AlertingHealth, error) { + return k8s.AlertingHealth{ + UserWorkloadEnabled: true, + UserWorkload: &k8s.AlertingStackHealth{ + Prometheus: k8s.AlertingRouteHealth{ + Status: k8s.RouteUnreachable, + FallbackReachable: true, + }, + }, + }, nil + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/rules", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + var response managementrouter.GetRulesResponse + err := json.NewDecoder(w.Body).Decode(&response) + Expect(err).NotTo(HaveOccurred()) + Expect(response.Warnings).To(BeEmpty()) + }) + }) + + Context("when handling errors", func() { + It("returns 500 when GetRules fails", func() { + mockManagement.getRules = func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + return nil, fmt.Errorf("connection error") + } + + req := httptest.NewRequest(http.MethodGet, "/api/v1/alerting/rules", nil) + w := httptest.NewRecorder() + + router.ServeHTTP(w, req) + + 
Expect(w.Code).To(Equal(http.StatusInternalServerError)) + Expect(w.Body.String()).To(ContainSubstring("An unexpected error occurred")) + }) + }) +}) + +type stubManagementClient struct { + getRules func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) + alertingHealth func(ctx context.Context) (k8s.AlertingHealth, error) +} + +func (s *stubManagementClient) ListRules(ctx context.Context, prOptions management.PrometheusRuleOptions, arOptions management.AlertRuleOptions) ([]monitoringv1.Rule, error) { + return nil, nil +} + +func (s *stubManagementClient) GetRuleById(ctx context.Context, alertRuleId string) (monitoringv1.Rule, error) { + return monitoringv1.Rule{}, nil +} + +func (s *stubManagementClient) CreateUserDefinedAlertRule(ctx context.Context, alertRule monitoringv1.Rule, prOptions management.PrometheusRuleOptions) (string, error) { + return "", nil +} + +func (s *stubManagementClient) CreatePlatformAlertRule(ctx context.Context, alertRule monitoringv1.Rule) (string, error) { + return "", nil +} + +func (s *stubManagementClient) UpdateUserDefinedAlertRule(ctx context.Context, alertRuleId string, alertRule monitoringv1.Rule) (string, error) { + return "", nil +} + +func (s *stubManagementClient) DeleteUserDefinedAlertRuleById(ctx context.Context, alertRuleId string) error { + return nil +} + +func (s *stubManagementClient) UpdatePlatformAlertRule(ctx context.Context, alertRuleId string, alertRule monitoringv1.Rule) error { + return nil +} + +func (s *stubManagementClient) DropPlatformAlertRule(ctx context.Context, alertRuleId string) error { + return nil +} + +func (s *stubManagementClient) RestorePlatformAlertRule(ctx context.Context, alertRuleId string) error { + return nil +} + +func (s *stubManagementClient) GetAlerts(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return nil, nil +} + +func (s *stubManagementClient) GetRules(ctx context.Context, req k8s.GetRulesRequest) 
([]k8s.PrometheusRuleGroup, error) { + if s.getRules != nil { + return s.getRules(ctx, req) + } + return []k8s.PrometheusRuleGroup{}, nil +} + +func (s *stubManagementClient) GetAlertingHealth(ctx context.Context) (k8s.AlertingHealth, error) { + if s.alertingHealth != nil { + return s.alertingHealth(ctx) + } + return k8s.AlertingHealth{}, nil +} + +func (s *stubManagementClient) UpdateAlertRuleClassification(ctx context.Context, req management.UpdateRuleClassificationRequest) error { + return nil +} + +func (s *stubManagementClient) BulkUpdateAlertRuleClassification(ctx context.Context, items []management.UpdateRuleClassificationRequest) []error { + return nil +} diff --git a/internal/managementrouter/user_defined_alert_rule_bulk_delete.go b/internal/managementrouter/user_defined_alert_rule_bulk_delete.go new file mode 100644 index 000000000..eea8ee19c --- /dev/null +++ b/internal/managementrouter/user_defined_alert_rule_bulk_delete.go @@ -0,0 +1,60 @@ +package managementrouter + +import ( + "encoding/json" + "net/http" +) + +type BulkDeleteUserDefinedAlertRulesRequest struct { + RuleIds []string `json:"ruleIds"` +} + +type BulkDeleteUserDefinedAlertRulesResponse struct { + Rules []DeleteUserDefinedAlertRulesResponse `json:"rules"` +} + +func (hr *httpRouter) BulkDeleteUserDefinedAlertRules(w http.ResponseWriter, req *http.Request) { + var payload BulkDeleteUserDefinedAlertRulesRequest + if err := json.NewDecoder(req.Body).Decode(&payload); err != nil { + writeError(w, http.StatusBadRequest, "invalid request body") + return + } + if len(payload.RuleIds) == 0 { + writeError(w, http.StatusBadRequest, "ruleIds is required") + return + } + + results := make([]DeleteUserDefinedAlertRulesResponse, 0, len(payload.RuleIds)) + + for _, rawId := range payload.RuleIds { + id, err := parseParam(rawId, "ruleId") + if err != nil { + results = append(results, DeleteUserDefinedAlertRulesResponse{ + Id: rawId, + StatusCode: http.StatusBadRequest, + Message: err.Error(), + }) + 
continue + } + + if err := hr.managementClient.DeleteUserDefinedAlertRuleById(req.Context(), id); err != nil { + status, message := parseError(err) + results = append(results, DeleteUserDefinedAlertRulesResponse{ + Id: id, + StatusCode: status, + Message: message, + }) + continue + } + results = append(results, DeleteUserDefinedAlertRulesResponse{ + Id: id, + StatusCode: http.StatusNoContent, + }) + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(BulkDeleteUserDefinedAlertRulesResponse{ + Rules: results, + }) +} diff --git a/internal/managementrouter/user_defined_alert_rule_bulk_delete_test.go b/internal/managementrouter/user_defined_alert_rule_bulk_delete_test.go new file mode 100644 index 000000000..37824c566 --- /dev/null +++ b/internal/managementrouter/user_defined_alert_rule_bulk_delete_test.go @@ -0,0 +1,216 @@ +package managementrouter_test + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("BulkDeleteUserDefinedAlertRules", func() { + var ( + router http.Handler + mockK8s *testutils.MockClient + ) + + var ( + userRule1Name = "u1" + userRule1 = monitoringv1.Rule{Alert: userRule1Name, Labels: map[string]string{k8s.PrometheusRuleLabelNamespace: "default", k8s.PrometheusRuleLabelName: "user-pr"}} + userRule1Id = alertrule.GetAlertingRuleId(&userRule1) + + userRule2Name = "u2" + userRule2 = monitoringv1.Rule{Alert: userRule2Name, Labels: map[string]string{k8s.PrometheusRuleLabelNamespace: "default", k8s.PrometheusRuleLabelName: "user-pr"}} + userRule2Id = alertrule.GetAlertingRuleId(&userRule2) + + platformRuleName = "platform" + platformRule = monitoringv1.Rule{Alert: platformRuleName, Labels: map[string]string{k8s.PrometheusRuleLabelNamespace: "platform-namespace-1", k8s.PrometheusRuleLabelName: "platform-pr"}} + platformRuleId = alertrule.GetAlertingRuleId(&platformRule) + ) + + BeforeEach(func() { + mockK8s = &testutils.MockClient{} + mgmt := management.New(context.Background(), mockK8s) + router = managementrouter.New(mgmt) + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + 
Groups: []monitoringv1.RuleGroup{ + { + Rules: []monitoringv1.Rule{userRule1, userRule2, platformRule}, + }, + }, + }, + }, true, nil + }, + DeleteFunc: func(ctx context.Context, namespace string, name string) error { + return nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + return nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + switch id { + case userRule1Id: + return userRule1, true + case userRule2Id: + return userRule2, true + case platformRuleId: + return platformRule, true + default: + return monitoringv1.Rule{}, false + } + }, + } + } + + // Provide owning AlertingRule so platform (user-via-platform) deletion can succeed + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + if name == "platform-alert-rules" { + return &osmv1.AlertingRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "platform-alert-rules", + Namespace: k8s.ClusterMonitoringNamespace, + }, + Spec: osmv1.AlertingRuleSpec{ + Groups: []osmv1.RuleGroup{ + { + Name: "test-group", + Rules: []osmv1.Rule{ + {Alert: platformRuleName}, + }, + }, + }, + }, + }, true, nil + } + return nil, false, nil + }, + UpdateFunc: func(ctx context.Context, ar osmv1.AlertingRule) error { + return nil + }, + } + } + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return strings.HasPrefix(name, "platform-namespace-") + }, + } + } + }) + + Context("when deleting multiple rules", func() { + It("returns deleted and failed for mixed ruleIds and updates rules", func() { + body := map[string]any{"ruleIds": []string{userRule1Id, platformRuleId, ""}} + buf, _ := 
json.Marshal(body) + req := httptest.NewRequest(http.MethodDelete, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusOK)) + var resp struct { + Rules []struct { + Id string `json:"id"` + StatusCode int `json:"status_code"` + Message string `json:"message"` + } `json:"rules"` + } + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Rules).To(HaveLen(3)) + + // u1 -> success + Expect(resp.Rules[0].Id).To(Equal(userRule1Id)) + Expect(resp.Rules[0].StatusCode).To(Equal(http.StatusNoContent), resp.Rules[0].Message) + Expect(resp.Rules[0].Message).To(BeEmpty()) + + // platform1 (user-via-platform) -> success + Expect(resp.Rules[1].Id).To(Equal(platformRuleId)) + Expect(resp.Rules[1].StatusCode).To(Equal(http.StatusNoContent), resp.Rules[1].Message) + Expect(resp.Rules[1].Message).To(BeEmpty()) + + // "" -> bad request (missing id) + Expect(resp.Rules[2].Id).To(Equal("")) + Expect(resp.Rules[2].StatusCode).To(Equal(http.StatusBadRequest), resp.Rules[2].Message) + Expect(resp.Rules[2].Message).To(ContainSubstring("missing ruleId")) + }) + + It("returns all deleted when all user ruleIds succeed", func() { + body := map[string]any{"ruleIds": []string{userRule1Id, userRule2Id}} + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodDelete, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + Expect(w.Code).To(Equal(http.StatusOK)) + var resp struct { + Rules []struct { + Id string `json:"id"` + StatusCode int `json:"status_code"` + Message string `json:"message"` + } `json:"rules"` + } + Expect(json.NewDecoder(w.Body).Decode(&resp)).To(Succeed()) + Expect(resp.Rules).To(HaveLen(2)) + + // platform1 -> success + Expect(resp.Rules[0].Id).To(Equal(userRule1Id)) + Expect(resp.Rules[0].StatusCode).To(Equal(http.StatusNoContent), resp.Rules[0].Message) + Expect(resp.Rules[0].Message).To(BeEmpty()) 
+ + // platform2 -> success + Expect(resp.Rules[1].Id).To(Equal(userRule2Id)) + Expect(resp.Rules[1].StatusCode).To(Equal(http.StatusNoContent), resp.Rules[1].Message) + Expect(resp.Rules[1].Message).To(BeEmpty()) + }) + }) + + Context("when request body is invalid", func() { + It("returns 400", func() { + req := httptest.NewRequest(http.MethodDelete, "/api/v1/alerting/rules", bytes.NewBufferString("{")) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("invalid request body")) + }) + }) + + Context("when ruleIds is empty", func() { + It("returns 400", func() { + body := map[string]interface{}{"ruleIds": []string{}} + buf, _ := json.Marshal(body) + req := httptest.NewRequest(http.MethodDelete, "/api/v1/alerting/rules", bytes.NewReader(buf)) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("ruleIds is required")) + }) + }) +}) diff --git a/internal/managementrouter/user_defined_alert_rule_delete_by_id.go b/internal/managementrouter/user_defined_alert_rule_delete_by_id.go new file mode 100644 index 000000000..778f7f474 --- /dev/null +++ b/internal/managementrouter/user_defined_alert_rule_delete_by_id.go @@ -0,0 +1,26 @@ +package managementrouter + +import ( + "net/http" +) + +type DeleteUserDefinedAlertRulesResponse struct { + Id string `json:"id"` + StatusCode int `json:"status_code"` + Message string `json:"message,omitempty"` +} + +func (hr *httpRouter) DeleteUserDefinedAlertRuleById(w http.ResponseWriter, req *http.Request) { + ruleId, err := getParam(req, "ruleId") + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + + if err := hr.managementClient.DeleteUserDefinedAlertRuleById(req.Context(), ruleId); err != nil { + handleError(w, err) + return + } + + w.WriteHeader(http.StatusNoContent) +} diff --git 
a/internal/managementrouter/user_defined_alert_rule_delete_by_id_test.go b/internal/managementrouter/user_defined_alert_rule_delete_by_id_test.go new file mode 100644 index 000000000..69f668581 --- /dev/null +++ b/internal/managementrouter/user_defined_alert_rule_delete_by_id_test.go @@ -0,0 +1,171 @@ +package managementrouter_test + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("DeleteUserDefinedAlertRuleById", func() { + var ( + router http.Handler + mockK8s *testutils.MockClient + ) + + var ( + userRule1Name = "u1" + userRule1 = monitoringv1.Rule{Alert: userRule1Name, Labels: map[string]string{k8s.PrometheusRuleLabelNamespace: "default", k8s.PrometheusRuleLabelName: "user-pr"}} + userRule1Id = alertrule.GetAlertingRuleId(&userRule1) + + userRule2Name = "u2" + userRule2 = monitoringv1.Rule{Alert: userRule2Name, Labels: map[string]string{k8s.PrometheusRuleLabelNamespace: "default", k8s.PrometheusRuleLabelName: "user-pr"}} + userRule2Id = alertrule.GetAlertingRuleId(&userRule2) + + platformRuleName = "p1" + platformRule = monitoringv1.Rule{Alert: platformRuleName, Labels: map[string]string{k8s.PrometheusRuleLabelNamespace: "platform-namespace-1", k8s.PrometheusRuleLabelName: "platform-pr"}} + platformRuleId = alertrule.GetAlertingRuleId(&platformRule) + ) + + BeforeEach(func() { + mockK8s = &testutils.MockClient{} + mgmt := management.New(context.Background(), 
mockK8s) + router = managementrouter.New(mgmt) + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Rules: []monitoringv1.Rule{userRule1, userRule2, platformRule}, + }, + }, + }, + }, true, nil + }, + DeleteFunc: func(ctx context.Context, namespace string, name string) error { + return nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + return nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + switch id { + case userRule1Id: + return userRule1, true + case userRule2Id: + return userRule2, true + case platformRuleId: + return platformRule, true + default: + return monitoringv1.Rule{}, false + } + }, + } + } + + // Provide owning AlertingRule so platform (user-via-platform) deletion can succeed + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + if name == "platform-alert-rules" { + return &osmv1.AlertingRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "platform-alert-rules", + Namespace: k8s.ClusterMonitoringNamespace, + }, + Spec: osmv1.AlertingRuleSpec{ + Groups: []osmv1.RuleGroup{ + { + Name: "test-group", + Rules: []osmv1.Rule{ + {Alert: platformRuleName}, + }, + }, + }, + }, + }, true, nil + } + return nil, false, nil + }, + UpdateFunc: func(ctx context.Context, ar osmv1.AlertingRule) error { + return nil + }, + } + } + + mockK8s.NamespaceFunc = 
func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return strings.HasPrefix(name, "platform-namespace-") + }, + } + } + }) + + Context("when ruleId is missing or blank", func() { + It("returns 400 with missing ruleId message", func() { + req := httptest.NewRequest(http.MethodDelete, "/api/v1/alerting/rules/%20", nil) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusBadRequest)) + Expect(w.Body.String()).To(ContainSubstring("missing ruleId")) + }) + }) + + Context("when rule is not found", func() { + It("returns 404 with expected message", func() { + req := httptest.NewRequest(http.MethodDelete, "/api/v1/alerting/rules/missing", nil) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusNotFound)) + Expect(w.Body.String()).To(ContainSubstring("AlertRule with id missing not found")) + }) + }) + + Context("when deleting a user-defined rule", func() { + It("returns 204", func() { + req := httptest.NewRequest(http.MethodDelete, "/api/v1/alerting/rules/"+userRule1Id, nil) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusNoContent)) + }) + }) + + Context("when deleting a platform rule", func() { + It("returns 204 for user-via-platform (not operator-managed)", func() { + req := httptest.NewRequest(http.MethodDelete, "/api/v1/alerting/rules/"+platformRuleId, nil) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + + Expect(w.Code).To(Equal(http.StatusNoContent)) + Expect(w.Body.String()).To(BeEmpty()) + }) + }) +}) diff --git a/pkg/alert_rule/alert_rule.go b/pkg/alert_rule/alert_rule.go new file mode 100644 index 000000000..a7d6f456d --- /dev/null +++ b/pkg/alert_rule/alert_rule.go @@ -0,0 +1,83 @@ +package alertrule + +import ( + "crypto/sha256" + "encoding/base64" + "fmt" + "sort" + "strings" + "unicode/utf8" + + 
"github.com/openshift/monitoring-plugin/pkg/classification" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +func GetAlertingRuleId(alertRule *monitoringv1.Rule) string { + var name string + var kind string + if alertRule.Alert != "" { + name = alertRule.Alert + kind = "alert" + } else if alertRule.Record != "" { + name = alertRule.Record + kind = "record" + } else { + return "" + } + + expr := normalizeExpr(alertRule.Expr.String()) + forDuration := "" + if alertRule.For != nil { + forDuration = strings.TrimSpace(string(*alertRule.For)) + } + + labelsBlock := normalizedBusinessLabelsBlock(alertRule.Labels) + + // Canonical payload is intentionally derived from rule spec (expr/for/labels) and identity (kind/name), + // and excludes annotations and openshift_io_* provenance/system labels. + canonicalPayload := strings.Join([]string{kind, name, expr, forDuration, labelsBlock}, "\n---\n") + + // Generate SHA256 hash + hash := sha256.Sum256([]byte(canonicalPayload)) + + return "rid_" + base64.RawURLEncoding.EncodeToString(hash[:]) +} + +func normalizeExpr(expr string) string { + // Collapse consecutive whitespace so cosmetic formatting changes do not churn ids. 
+ return strings.Join(strings.Fields(strings.TrimSpace(expr)), " ") +} + +func normalizedBusinessLabelsBlock(in map[string]string) string { + if len(in) == 0 { + return "" + } + + lines := make([]string, 0, len(in)) + for k, v := range in { + key := strings.TrimSpace(k) + if key == "" { + continue + } + if strings.HasPrefix(key, "openshift_io_") || key == managementlabels.AlertNameLabel { + // Skip system labels + continue + } + if !classification.ValidatePromLabelName(key) { + continue + } + if v == "" { + // Align with specHash behavior: drop empty values + continue + } + if !utf8.ValidString(v) { + continue + } + + lines = append(lines, fmt.Sprintf("%s=%s", key, v)) + } + + sort.Strings(lines) + return strings.Join(lines, "\n") +} diff --git a/pkg/alertcomponent/matcher.go b/pkg/alertcomponent/matcher.go new file mode 100644 index 000000000..8aa6f9227 --- /dev/null +++ b/pkg/alertcomponent/matcher.go @@ -0,0 +1,381 @@ +package alertcomponent + +import ( + "regexp" + + "github.com/prometheus/common/model" + + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +const ( + labelNamespace = "namespace" + labelSeverity = "severity" +) + +func ns(values ...string) LabelsMatcher { + return NewLabelsMatcher(labelNamespace, NewStringValuesMatcher(values...)) +} + +func alertNames(values ...string) LabelsMatcher { + return NewLabelsMatcher(managementlabels.AlertNameLabel, NewStringValuesMatcher(values...)) +} + +func regexAlertNames(regexes ...*regexp.Regexp) LabelsMatcher { + return NewLabelsMatcher(managementlabels.AlertNameLabel, NewRegexValuesMatcher(regexes...)) +} + +func labelValues(key string, values ...string) LabelsMatcher { + return NewLabelsMatcher(key, NewStringValuesMatcher(values...)) +} + +func comp(component string, ms ...LabelsMatcher) componentMatcher { + return componentMatcher{component: component, matchers: ms} +} + +// LabelsMatcher represents a matcher definition for a set of labels. 
+// It matches if all of the label matchers match the labels. +type LabelsMatcher interface { + Matches(labels model.LabelSet) (match bool, keys []model.LabelName) + Equals(other LabelsMatcher) bool +} + +func NewLabelsMatcher(key string, matcher ValueMatcher) LabelsMatcher { + return labelMatcher{key: key, matcher: matcher} +} + +func NewStringValuesMatcher(keys ...string) ValueMatcher { + return stringMatcher(keys) +} + +func NewRegexValuesMatcher(regexes ...*regexp.Regexp) ValueMatcher { + return regexpMatcher(regexes) +} + +// labelMatcher represents a matcher definition for a label. +type labelMatcher struct { + key string + matcher ValueMatcher +} + +// Matches implements the LabelsMatcher interface. +func (l labelMatcher) Matches(labels model.LabelSet) (bool, []model.LabelName) { + if l.matcher.Matches(string(labels[model.LabelName(l.key)])) { + return true, []model.LabelName{model.LabelName(l.key)} + } + return false, nil +} + +// Equals implements the LabelsMatcher interface. +func (l labelMatcher) Equals(other LabelsMatcher) bool { + ol, ok := other.(labelMatcher) + if !ok { + return false + } + return l.key == ol.key && l.matcher.Equals(ol.matcher) +} + +// ValueMatcher represents a matcher for a specific value. +// +// Multiple implementations are provided for different types of matchers. +type ValueMatcher interface { + Matches(value string) bool + Equals(other ValueMatcher) bool +} + +// stringMatcher is a matcher for a list of strings. +// +// It matches if the value is in the list of strings. +type stringMatcher []string + +func (s stringMatcher) Matches(value string) bool { + for _, v := range s { + if v == value { + return true + } + } + return false +} + +// Equals implements the ValueMatcher interface. +func (s stringMatcher) Equals(other ValueMatcher) bool { + o, ok := other.(stringMatcher) + if !ok { + return false + } + return equalsNoOrder(s, o) +} + +// regexpMatcher is a matcher for a list of regular expressions. 
+// +// It matches if the value matches any of the regular expressions. +type regexpMatcher []*regexp.Regexp + +func (r regexpMatcher) Matches(value string) bool { + for _, re := range r { + if re.MatchString(value) { + return true + } + } + return false +} + +// Equals implements the ValueMatcher interface. +func (r regexpMatcher) Equals(other ValueMatcher) bool { + o, ok := other.(regexpMatcher) + if !ok { + return false + } + s1 := make([]string, 0, len(r)) + for _, re := range r { + s1 = append(s1, re.String()) + } + s2 := make([]string, 0, len(o)) + for _, re := range o { + s2 = append(s2, re.String()) + } + return equalsNoOrder(s1, s2) +} + +func equalsNoOrder(a, b []string) bool { + if len(a) != len(b) { + return false + } + + seen := make(map[string]int, len(a)) + for _, v := range a { + seen[v]++ + } + for _, v := range b { + if seen[v] == 0 { + return false + } + seen[v]-- + } + return true +} + +// componentMatcher represents a matcher definition for a component. +// +// It matches if any of the label matchers match the labels. +type componentMatcher struct { + component string + matchers []LabelsMatcher +} + +// findComponent tries to determine a component for given labels using the provided matchers. +// +// It returns the component and the keys that matched. +// If no match is found, it returns an empty component and nil keys. +func findComponent(compMatchers []componentMatcher, labels model.LabelSet) ( + component string, keys []model.LabelName) { + for _, compMatcher := range compMatchers { + for _, labelsMatcher := range compMatcher.matchers { + if matches, keys := labelsMatcher.Matches(labels); matches { + return compMatcher.component, keys + } + } + } + return "", nil +} + +// componentMatcherFn is a function that tries matching provided labels to a component. +// It returns the layer, component and the keys from the labels that were used for matching. +// If no match is found, it returns an empty layer, component and nil keys. 
+type componentMatcherFn func(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) + +func evalMatcherFns(fns []componentMatcherFn, labels model.LabelSet) ( + layer, comp string, labelsSubset model.LabelSet) { + for _, fn := range fns { + if layer, comp, keys := fn(labels); layer != "" { + return string(layer), string(comp), getLabelsSubset(labels, keys...) + } + } + return "Others", "Others", getLabelsSubset(labels) +} + +// getLabelsSubset returns a subset of the labels with given keys. +func getLabelsSubset(m model.LabelSet, keys ...model.LabelName) model.LabelSet { + keys = append([]model.LabelName{ + model.LabelName(labelNamespace), + model.LabelName(managementlabels.AlertNameLabel), + model.LabelName(labelSeverity), + }, keys...) + return getMapSubset(m, keys...) +} + +// getMapSubset returns a subset of the labels with given keys. +func getMapSubset(m model.LabelSet, keys ...model.LabelName) model.LabelSet { + subset := make(model.LabelSet, len(keys)) + for _, key := range keys { + if val, ok := m[key]; ok { + subset[key] = val + } + } + return subset +} + +var ( + nodeAlerts []model.LabelValue = []model.LabelValue{ + "NodeClockNotSynchronising", + "KubeNodeNotReady", + "KubeNodeUnreachable", + "NodeSystemSaturation", + "NodeFilesystemSpaceFillingUp", + "NodeFilesystemAlmostOutOfSpace", + "NodeMemoryMajorPagesFaults", + "NodeNetworkTransmitErrs", + "NodeTextFileCollectorScrapeError", + "NodeFilesystemFilesFillingUp", + "NodeNetworkReceiveErrs", + "NodeClockSkewDetected", + "NodeFilesystemAlmostOutOfFiles", + "NodeWithoutOVNKubeNodePodRunning", + "InfraNodesNeedResizingSRE", + "NodeHighNumberConntrackEntriesUsed", + "NodeMemHigh", + "NodeNetworkInterfaceFlapping", + "NodeWithoutSDNPod", + "NodeCpuHigh", + "CriticalNodeNotReady", + "NodeFileDescriptorLimit", + "MCCPoolAlert", + "MCCDrainError", + "MCDRebootError", + "MCDPivotError", + } + + coreMatchers = []componentMatcher{ + comp("etcd", ns("openshift-etcd", 
"openshift-etcd-operator")), + comp("kube-apiserver", ns("openshift-kube-apiserver", "openshift-kube-apiserver-operator")), + comp("kube-controller-manager", ns("openshift-kube-controller-manager", "openshift-kube-controller-manager-operator", "kube-system")), + comp("kube-scheduler", ns("openshift-kube-scheduler", "openshift-kube-scheduler-operator")), + comp("machine-approver", ns("openshift-cluster-machine-approver", "openshift-machine-approver-operator")), + comp("machine-config", + ns("openshift-machine-config-operator"), + alertNames( + "HighOverallControlPlaneMemory", + "ExtremelyHighIndividualControlPlaneMemory", + "MissingMachineConfig", + "MCCBootImageUpdateError", + "KubeletHealthState", + "SystemMemoryExceedsReservation", + ), + ), + comp("version", + ns("openshift-cluster-version", "openshift-version-operator"), + alertNames("ClusterNotUpgradeable", "UpdateAvailable"), + ), + comp("dns", ns("openshift-dns", "openshift-dns-operator")), + comp("authentication", ns("openshift-authentication", "openshift-oauth-apiserver", "openshift-authentication-operator")), + comp("cert-manager", ns("openshift-cert-manager", "openshift-cert-manager-operator")), + comp("cloud-controller-manager", ns("openshift-cloud-controller-manager", "openshift-cloud-controller-manager-operator")), + comp("cloud-credential", ns("openshift-cloud-credential-operator")), + comp("cluster-api", ns("openshift-cluster-api", "openshift-cluster-api-operator")), + comp("config-operator", ns("openshift-config-operator")), + comp("kube-storage-version-migrator", ns("openshift-kube-storage-version-migrator", "openshift-kube-storage-version-migrator-operator")), + comp("image-registry", ns("openshift-image-registry", "openshift-image-registry-operator")), + comp("ingress", ns("openshift-ingress", "openshift-route-controller-manager", "openshift-ingress-canary", "openshift-ingress-operator")), + comp("console", ns("openshift-console", "openshift-console-operator")), + comp("insights", 
ns("openshift-insights", "openshift-insights-operator")), + comp("machine-api", ns("openshift-machine-api", "openshift-machine-api-operator")), + comp("monitoring", ns("openshift-monitoring", "openshift-monitoring-operator")), + comp("network", ns("openshift-network-operator", "openshift-ovn-kubernetes", "openshift-multus", "openshift-network-diagnostics", "openshift-sdn")), + comp("node-tuning", ns("openshift-cluster-node-tuning-operator", "openshift-node-tuning-operator")), + comp("openshift-apiserver", ns("openshift-apiserver", "openshift-apiserver-operator")), + comp("openshift-controller-manager", ns("openshift-controller-manager", "openshift-controller-manager-operator")), + comp("openshift-samples", ns("openshift-cluster-samples-operator", "openshift-samples-operator")), + comp("operator-lifecycle-manager", ns("openshift-operator-lifecycle-manager")), + comp("service-ca", ns("openshift-service-ca", "openshift-service-ca-operator")), + comp("storage", ns("openshift-storage", "openshift-cluster-csi-drivers", "openshift-cluster-storage-operator", "openshift-storage-operator")), + comp("vertical-pod-autoscaler", ns("openshift-vertical-pod-autoscaler", "openshift-vertical-pod-autoscaler-operator")), + comp("marketplace", ns("openshift-marketplace", "openshift-marketplace-operator")), + } + + workloadMatchers = []componentMatcher{ + comp("openshift-compliance", ns("openshift-compliance")), + comp("openshift-file-integrity", ns("openshift-file-integrity")), + comp("openshift-logging", ns("openshift-logging")), + comp("openshift-user-workload-monitoring", ns("openshift-user-workload-monitoring")), + comp("openshift-gitops", ns("openshift-gitops", "openshift-gitops-operator")), + comp("openshift-operators", ns("openshift-operators")), + comp("openshift-local-storage", ns("openshift-local-storage")), + comp("quay", labelValues("container", "quay-app", "quay-mirror", "quay-app-upgrade")), + comp("Argo", regexAlertNames(regexp.MustCompile("^Argo"))), + } +) + +var 
cvoAlerts = []model.LabelValue{"ClusterOperatorDown", "ClusterOperatorDegraded"} + +func cvoAlertsMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + for _, v := range cvoAlerts { + if labels[managementlabels.AlertNameLabel] == v { + component := labels["name"] + if component == "" { + component = "version" + } + return "cluster", component, nil + } + } + return "", "", nil +} + +func kubevirtOperatorMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + if labels["kubernetes_operator_part_of"] != "kubevirt" { + return "", "", nil + } + if labels["kubernetes_operator_component"] == "cnv-observability" { + return "", "", nil + } + if labels["operator_health_impact"] == "none" && labels["kubernetes_operator_component"] == "kubevirt" { + return "namespace", "OpenShift Virtualization Virtual Machine", []model.LabelName{ + "kubernetes_operator_part_of", + "kubernetes_operator_component", + "operator_health_impact", + } + } + return "cluster", "OpenShift Virtualization Operator", []model.LabelName{ + "kubernetes_operator_part_of", + "kubernetes_operator_component", + "operator_health_impact", + } +} + +func computeMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + for _, nodeAlert := range nodeAlerts { + if labels[managementlabels.AlertNameLabel] == nodeAlert { + component := "compute" + return "cluster", model.LabelValue(component), nil + } + } + return "", "", nil +} + +func coreMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + // Try matching against core components. + if component, keys := findComponent(coreMatchers, labels); component != "" { + return "cluster", model.LabelValue(component), keys + } + return "", "", nil +} + +func workloadMatcher(labels model.LabelSet) (layer, comp model.LabelValue, keys []model.LabelName) { + // Try matching against workload components. 
+ if component, keys := findComponent(workloadMatchers, labels); component != "" { + return "namespace", model.LabelValue(component), keys + } + return "", "", nil +} + +// DetermineComponent determines the component for a given set of labels. +// It returns the layer and component strings. +func DetermineComponent(labels model.LabelSet) (layer, component string) { + layer, component, _ = evalMatcherFns([]componentMatcherFn{ + cvoAlertsMatcher, + kubevirtOperatorMatcher, + computeMatcher, + coreMatcher, + workloadMatcher, + }, labels) + return layer, component +} diff --git a/pkg/classification/validation.go b/pkg/classification/validation.go new file mode 100644 index 000000000..32f78b784 --- /dev/null +++ b/pkg/classification/validation.go @@ -0,0 +1,34 @@ +package classification + +import ( + "regexp" + "strings" +) + +var allowedLayers = map[string]struct{}{ + "cluster": {}, + "namespace": {}, +} + +var labelValueRegexp = regexp.MustCompile(`^[A-Za-z0-9]([A-Za-z0-9_.-]*[A-Za-z0-9])?$`) +var labelNameRegexp = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*$`) + +// ValidateLayer returns true if the provided layer is one of the allowed values. +func ValidateLayer(layer string) bool { + _, ok := allowedLayers[strings.ToLower(strings.TrimSpace(layer))] + return ok +} + +// ValidateComponent returns true if the component is a reasonable label value. +// Accept 1-253 chars, [A-Za-z0-9._-], must start/end alphanumeric. 
+func ValidateComponent(component string) bool { + c := strings.TrimSpace(component) + if c == "" || len(c) > 253 { + return false + } + return labelValueRegexp.MatchString(c) +} + +func ValidatePromLabelName(name string) bool { + return labelNameRegexp.MatchString(strings.TrimSpace(name)) +} diff --git a/pkg/k8s/alert_classification_configmap.go b/pkg/k8s/alert_classification_configmap.go new file mode 100644 index 000000000..baa23e5cd --- /dev/null +++ b/pkg/k8s/alert_classification_configmap.go @@ -0,0 +1,49 @@ +package k8s + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// AlertRuleClassificationConfigMapManager provides the minimal ConfigMap ops +// needed by the alert-rule classification update flow. +type AlertRuleClassificationConfigMapManager struct { + client *client +} + +var _ ConfigMapInterface = (*AlertRuleClassificationConfigMapManager)(nil) + +func (c *client) ConfigMaps() ConfigMapInterface { + return &AlertRuleClassificationConfigMapManager{client: c} +} + +func (m *AlertRuleClassificationConfigMapManager) Get(ctx context.Context, namespace string, name string) (*corev1.ConfigMap, bool, error) { + cm, err := m.client.clientset.CoreV1().ConfigMaps(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + return nil, false, nil + } + return nil, false, err + } + return cm, true, nil +} + +func (m *AlertRuleClassificationConfigMapManager) Update(ctx context.Context, cm corev1.ConfigMap) error { + _, err := m.client.clientset.CoreV1().ConfigMaps(cm.Namespace).Update(ctx, &cm, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("update configmap %s/%s: %w", cm.Namespace, cm.Name, err) + } + return nil +} + +func (m *AlertRuleClassificationConfigMapManager) Create(ctx context.Context, cm corev1.ConfigMap) (*corev1.ConfigMap, error) { + created, err := 
m.client.clientset.CoreV1().ConfigMaps(cm.Namespace).Create(ctx, &cm, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("create configmap %s/%s: %w", cm.Namespace, cm.Name, err) + } + return created, nil +} diff --git a/pkg/k8s/alert_relabel_config.go b/pkg/k8s/alert_relabel_config.go new file mode 100644 index 000000000..2405e2e42 --- /dev/null +++ b/pkg/k8s/alert_relabel_config.go @@ -0,0 +1,99 @@ +package k8s + +import ( + "context" + "fmt" + + osmv1 "github.com/openshift/api/monitoring/v1" + osmv1client "github.com/openshift/client-go/monitoring/clientset/versioned" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/client-go/tools/cache" +) + +type alertRelabelConfigManager struct { + clientset *osmv1client.Clientset + arcInformer cache.SharedIndexInformer +} + +func newAlertRelabelConfigManager(ctx context.Context, clientset *osmv1client.Clientset) (*alertRelabelConfigManager, error) { + arcInformer := cache.NewSharedIndexInformer( + alertRelabelConfigListWatchForAllNamespaces(clientset), + &osmv1.AlertRelabelConfig{}, + 0, + cache.Indexers{}, + ) + + arcm := &alertRelabelConfigManager{ + clientset: clientset, + arcInformer: arcInformer, + } + + go arcm.arcInformer.Run(ctx.Done()) + + cache.WaitForNamedCacheSync("AlertRelabelConfig informer", ctx.Done(), + arcm.arcInformer.HasSynced, + ) + + return arcm, nil +} + +func alertRelabelConfigListWatchForAllNamespaces(clientset *osmv1client.Clientset) *cache.ListWatch { + return cache.NewListWatchFromClient(clientset.MonitoringV1().RESTClient(), "alertrelabelconfigs", "", fields.Everything()) +} + +func (arcm *alertRelabelConfigManager) List(ctx context.Context, namespace string) ([]osmv1.AlertRelabelConfig, error) { + arcs := arcm.arcInformer.GetStore().List() + + alertRelabelConfigs := make([]osmv1.AlertRelabelConfig, 0, len(arcs)) + for _, item := range arcs { + arc, ok := item.(*osmv1.AlertRelabelConfig) + if !ok { 
+ continue + } + alertRelabelConfigs = append(alertRelabelConfigs, *arc) + } + + return alertRelabelConfigs, nil +} + +func (arcm *alertRelabelConfigManager) Get(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + arc, err := arcm.clientset.MonitoringV1().AlertRelabelConfigs(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return nil, false, nil + } + + return nil, false, err + } + + return arc, true, nil +} + +func (arcm *alertRelabelConfigManager) Create(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) { + created, err := arcm.clientset.MonitoringV1().AlertRelabelConfigs(arc.Namespace).Create(ctx, &arc, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create AlertRelabelConfig %s/%s: %w", arc.Namespace, arc.Name, err) + } + + return created, nil +} + +func (arcm *alertRelabelConfigManager) Update(ctx context.Context, arc osmv1.AlertRelabelConfig) error { + _, err := arcm.clientset.MonitoringV1().AlertRelabelConfigs(arc.Namespace).Update(ctx, &arc, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update AlertRelabelConfig %s/%s: %w", arc.Namespace, arc.Name, err) + } + + return nil +} + +func (arcm *alertRelabelConfigManager) Delete(ctx context.Context, namespace string, name string) error { + err := arcm.clientset.MonitoringV1().AlertRelabelConfigs(namespace).Delete(ctx, name, metav1.DeleteOptions{}) + if err != nil { + return fmt.Errorf("failed to delete AlertRelabelConfig %s: %w", name, err) + } + + return nil +} diff --git a/pkg/k8s/alerting_health.go b/pkg/k8s/alerting_health.go new file mode 100644 index 000000000..790f4930b --- /dev/null +++ b/pkg/k8s/alerting_health.go @@ -0,0 +1,127 @@ +package k8s + +import ( + "context" + "fmt" + "strings" + "sync" + + "gopkg.in/yaml.v2" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/client-go/kubernetes" + 
"k8s.io/client-go/tools/cache" +) + +const ( + clusterMonitoringConfigMap = "cluster-monitoring-config" + clusterMonitoringConfigKey = "config.yaml" +) + +type clusterMonitoringConfig struct { + EnableUserWorkload bool `yaml:"enableUserWorkload"` +} + +// clusterMonitoringConfigManager watches the cluster-monitoring-config ConfigMap +// via an informer and caches the parsed enableUserWorkload value so that +// AlertingHealth never needs a live API call. +type clusterMonitoringConfigManager struct { + informer cache.SharedIndexInformer + + mu sync.RWMutex + enabled bool + err error +} + +func newClusterMonitoringConfigManager(ctx context.Context, clientset *kubernetes.Clientset) (*clusterMonitoringConfigManager, error) { + informer := cache.NewSharedIndexInformer( + cache.NewListWatchFromClient( + clientset.CoreV1().RESTClient(), + "configmaps", + ClusterMonitoringNamespace, + fields.OneTermEqualSelector("metadata.name", clusterMonitoringConfigMap), + ), + &corev1.ConfigMap{}, + 0, + cache.Indexers{}, + ) + + m := &clusterMonitoringConfigManager{ + informer: informer, + } + + _, err := informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + cm, ok := obj.(*corev1.ConfigMap) + if !ok { + return + } + m.handleUpdate(cm) + }, + UpdateFunc: func(_, newObj interface{}) { + cm, ok := newObj.(*corev1.ConfigMap) + if !ok { + return + } + m.handleUpdate(cm) + }, + DeleteFunc: func(_ interface{}) { + m.mu.Lock() + defer m.mu.Unlock() + m.enabled = false + m.err = nil + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to add event handler to cluster-monitoring-config informer: %w", err) + } + + go informer.Run(ctx.Done()) + + cache.WaitForNamedCacheSync("ClusterMonitoringConfig informer", ctx.Done(), + informer.HasSynced, + ) + + return m, nil +} + +func (m *clusterMonitoringConfigManager) handleUpdate(cm *corev1.ConfigMap) { + m.mu.Lock() + defer m.mu.Unlock() + + raw, ok := cm.Data[clusterMonitoringConfigKey] + if !ok || 
strings.TrimSpace(raw) == "" { + m.enabled = false + m.err = nil + return + } + + var cfg clusterMonitoringConfig + if err := yaml.Unmarshal([]byte(raw), &cfg); err != nil { + m.enabled = false + m.err = fmt.Errorf("parse cluster monitoring config.yaml: %w", err) + return + } + + m.enabled = cfg.EnableUserWorkload + m.err = nil +} + +func (m *clusterMonitoringConfigManager) userWorkloadEnabled() (bool, error) { + m.mu.RLock() + defer m.mu.RUnlock() + return m.enabled, m.err +} + +// AlertingHealth returns alerting route health and UWM enablement status. +func (c *client) AlertingHealth(ctx context.Context) (AlertingHealth, error) { + health := c.prometheusAlerts.alertingHealth(ctx) + + enabled, err := c.clusterMonitoringConfig.userWorkloadEnabled() + if err != nil { + return health, fmt.Errorf("failed to determine user workload enablement: %w", err) + } + health.UserWorkloadEnabled = enabled + + return health, nil +} diff --git a/pkg/k8s/alerting_rule.go b/pkg/k8s/alerting_rule.go new file mode 100644 index 000000000..559f4b507 --- /dev/null +++ b/pkg/k8s/alerting_rule.go @@ -0,0 +1,107 @@ +package k8s + +import ( + "context" + "fmt" + + osmv1 "github.com/openshift/api/monitoring/v1" + osmv1client "github.com/openshift/client-go/monitoring/clientset/versioned" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/client-go/tools/cache" +) + +type alertingRuleManager struct { + clientset *osmv1client.Clientset + informer cache.SharedIndexInformer +} + +func newAlertingRuleManager(ctx context.Context, clientset *osmv1client.Clientset) (*alertingRuleManager, error) { + informer := cache.NewSharedIndexInformer( + alertingRuleListWatchClusterMonitoringNamespace(clientset), + &osmv1.AlertingRule{}, + 0, + cache.Indexers{}, + ) + + arm := &alertingRuleManager{ + clientset: clientset, + informer: informer, + } + + go arm.informer.Run(ctx.Done()) + + if !cache.WaitForNamedCacheSync("AlertingRule 
informer", ctx.Done(), arm.informer.HasSynced) { + return nil, errors.NewInternalError(fmt.Errorf("failed to sync AlertingRule informer")) + } + + return arm, nil +} + +func alertingRuleListWatchClusterMonitoringNamespace(clientset *osmv1client.Clientset) *cache.ListWatch { + return cache.NewListWatchFromClient(clientset.MonitoringV1().RESTClient(), "alertingrules", ClusterMonitoringNamespace, fields.Everything()) +} + +func (arm *alertingRuleManager) List(ctx context.Context) ([]osmv1.AlertingRule, error) { + items := arm.informer.GetStore().List() + + alertingRules := make([]osmv1.AlertingRule, 0, len(items)) + for _, item := range items { + ar, ok := item.(*osmv1.AlertingRule) + if !ok { + continue + } + alertingRules = append(alertingRules, *ar) + } + + return alertingRules, nil +} + +func (arm *alertingRuleManager) Get(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + ar, err := arm.clientset.MonitoringV1().AlertingRules(ClusterMonitoringNamespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return nil, false, nil + } + + return nil, false, err + } + + return ar, true, nil +} + +func (arm *alertingRuleManager) Create(ctx context.Context, ar osmv1.AlertingRule) (*osmv1.AlertingRule, error) { + if ar.Namespace != "" && ar.Namespace != ClusterMonitoringNamespace { + return nil, fmt.Errorf("invalid namespace %q: AlertingRule manager only supports %q", ar.Namespace, ClusterMonitoringNamespace) + } + + created, err := arm.clientset.MonitoringV1().AlertingRules(ClusterMonitoringNamespace).Create(ctx, &ar, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create AlertingRule %s/%s: %w", ClusterMonitoringNamespace, ar.Name, err) + } + + return created, nil +} + +func (arm *alertingRuleManager) Update(ctx context.Context, ar osmv1.AlertingRule) error { + if ar.Namespace != "" && ar.Namespace != ClusterMonitoringNamespace { + return fmt.Errorf("invalid namespace %q: AlertingRule 
manager only supports %q", ar.Namespace, ClusterMonitoringNamespace) + } + + _, err := arm.clientset.MonitoringV1().AlertingRules(ClusterMonitoringNamespace).Update(ctx, &ar, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update AlertingRule %s/%s: %w", ClusterMonitoringNamespace, ar.Name, err) + } + + return nil +} + +func (arm *alertingRuleManager) Delete(ctx context.Context, name string) error { + err := arm.clientset.MonitoringV1().AlertingRules(ClusterMonitoringNamespace).Delete(ctx, name, metav1.DeleteOptions{}) + if err != nil { + return fmt.Errorf("failed to delete AlertingRule %s/%s: %w", ClusterMonitoringNamespace, name, err) + } + + return nil +} diff --git a/pkg/k8s/auth_context.go b/pkg/k8s/auth_context.go new file mode 100644 index 000000000..89aa5aef0 --- /dev/null +++ b/pkg/k8s/auth_context.go @@ -0,0 +1,26 @@ +package k8s + +import "context" + +type bearerTokenKey struct{} + +// WithBearerToken stores a bearer token in the context for downstream requests. +func WithBearerToken(ctx context.Context, token string) context.Context { + if token == "" { + return ctx + } + return context.WithValue(ctx, bearerTokenKey{}, token) +} + +func bearerTokenFromContext(ctx context.Context) string { + if token, ok := ctx.Value(bearerTokenKey{}).(string); ok { + return token + } + return "" +} + +// BearerTokenFromContext is an exported wrapper around bearerTokenFromContext, +// exposed for use in tests that need to verify token forwarding. 
+func BearerTokenFromContext(ctx context.Context) string { + return bearerTokenFromContext(ctx) +} diff --git a/pkg/k8s/client.go b/pkg/k8s/client.go new file mode 100644 index 000000000..074f09155 --- /dev/null +++ b/pkg/k8s/client.go @@ -0,0 +1,126 @@ +package k8s + +import ( + "context" + "fmt" + + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + + osmv1client "github.com/openshift/client-go/monitoring/clientset/versioned" + routeclient "github.com/openshift/client-go/route/clientset/versioned" + monitoringv1client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" + "github.com/sirupsen/logrus" +) + +var log = logrus.WithField("module", "k8s") + +var _ Client = (*client)(nil) + +type client struct { + clientset *kubernetes.Clientset + monitoringv1clientset *monitoringv1client.Clientset + osmv1clientset *osmv1client.Clientset + config *rest.Config + + prometheusAlerts *prometheusAlerts + + prometheusRuleManager *prometheusRuleManager + alertRelabelConfigManager *alertRelabelConfigManager + alertingRuleManager *alertingRuleManager + namespaceManager *namespaceManager + relabeledRulesManager *relabeledRulesManager + clusterMonitoringConfig *clusterMonitoringConfigManager +} + +func newClient(ctx context.Context, config *rest.Config) (Client, error) { + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to create clientset: %w", err) + } + + monitoringv1clientset, err := monitoringv1client.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to create monitoringv1 clientset: %w", err) + } + + osmv1clientset, err := osmv1client.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to create osmv1 clientset: %w", err) + } + + routeClientset, err := routeclient.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to create route clientset: %w", err) + } + + c := &client{ + clientset: clientset, + monitoringv1clientset: 
monitoringv1clientset, + osmv1clientset: osmv1clientset, + config: config, + } + + c.prometheusRuleManager = newPrometheusRuleManager(ctx, monitoringv1clientset) + + c.prometheusAlerts = newPrometheusAlerts(routeClientset, clientset.CoreV1(), config, c.prometheusRuleManager) + + c.alertRelabelConfigManager, err = newAlertRelabelConfigManager(ctx, osmv1clientset) + if err != nil { + return nil, fmt.Errorf("failed to create alert relabel config manager: %w", err) + } + + c.alertingRuleManager, err = newAlertingRuleManager(ctx, osmv1clientset) + if err != nil { + return nil, fmt.Errorf("failed to create alerting rule manager: %w", err) + } + + c.namespaceManager, err = newNamespaceManager(ctx, clientset) + if err != nil { + return nil, fmt.Errorf("failed to create namespace manager: %w", err) + } + + c.clusterMonitoringConfig, err = newClusterMonitoringConfigManager(ctx, clientset) + if err != nil { + return nil, fmt.Errorf("failed to create cluster monitoring config manager: %w", err) + } + + c.relabeledRulesManager, err = newRelabeledRulesManager(ctx, c.namespaceManager, c.alertRelabelConfigManager, monitoringv1clientset, clientset) + if err != nil { + return nil, fmt.Errorf("failed to create relabeled rules config manager: %w", err) + } + + return c, nil +} + +func (c *client) TestConnection(_ context.Context) error { + _, err := c.clientset.Discovery().ServerVersion() + if err != nil { + return fmt.Errorf("failed to connect to cluster: %w", err) + } + return nil +} + +func (c *client) PrometheusAlerts() PrometheusAlertsInterface { + return c.prometheusAlerts +} + +func (c *client) PrometheusRules() PrometheusRuleInterface { + return c.prometheusRuleManager +} + +func (c *client) AlertRelabelConfigs() AlertRelabelConfigInterface { + return c.alertRelabelConfigManager +} + +func (c *client) AlertingRules() AlertingRuleInterface { + return c.alertingRuleManager +} + +func (c *client) RelabeledRules() RelabeledRulesInterface { + return c.relabeledRulesManager +} + 
+func (c *client) Namespace() NamespaceInterface { + return c.namespaceManager +} diff --git a/pkg/k8s/client_factory.go b/pkg/k8s/client_factory.go new file mode 100644 index 000000000..5542d455f --- /dev/null +++ b/pkg/k8s/client_factory.go @@ -0,0 +1,12 @@ +package k8s + +import ( + "context" + + "k8s.io/client-go/rest" +) + +// NewClient creates a new Kubernetes client with the given options +func NewClient(ctx context.Context, config *rest.Config) (Client, error) { + return newClient(ctx, config) +} diff --git a/pkg/k8s/external_management.go b/pkg/k8s/external_management.go new file mode 100644 index 000000000..7671c87e7 --- /dev/null +++ b/pkg/k8s/external_management.go @@ -0,0 +1,49 @@ +package k8s + +import ( + "reflect" + "strings" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// External management detection keys +const ( + ArgocdArgoprojIoPrefix = "argocd.argoproj.io/" + AppKubernetesIoManagedBy = "app.kubernetes.io/managed-by" +) + +// IsManagedByGitOps returns true if the provided annotations/labels indicate GitOps (e.g., ArgoCD) management. +func IsManagedByGitOps(annotations map[string]string, labels map[string]string) bool { + for k := range annotations { + if strings.HasPrefix(k, ArgocdArgoprojIoPrefix) { + return true + } + } + for k := range labels { + if strings.HasPrefix(k, ArgocdArgoprojIoPrefix) { + return true + } + } + if v, ok := labels[AppKubernetesIoManagedBy]; ok { + vl := strings.ToLower(strings.TrimSpace(v)) + if vl == "openshift-gitops" || vl == "argocd-cluster" || vl == "argocd" || strings.Contains(vl, "gitops") { + return true + } + } + return false +} + +// IsExternallyManagedObject returns whether an object is GitOps-managed and/or operator-managed. 
+func IsExternallyManagedObject(obj metav1.Object) (gitOpsManaged bool, operatorManaged bool) { + if obj == nil { + return false, false + } + // Handle typed-nil underlying values + if rv := reflect.ValueOf(obj); rv.Kind() == reflect.Ptr && rv.IsNil() { + return false, false + } + gitOpsManaged = IsManagedByGitOps(obj.GetAnnotations(), obj.GetLabels()) + operatorManaged = len(obj.GetOwnerReferences()) > 0 + return +} diff --git a/pkg/k8s/namespace.go b/pkg/k8s/namespace.go new file mode 100644 index 000000000..aba97a2a4 --- /dev/null +++ b/pkg/k8s/namespace.go @@ -0,0 +1,110 @@ +package k8s + +import ( + "context" + "fmt" + "sync" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/tools/cache" +) + +const ( + // ClusterMonitoringLabel is the label used to identify namespaces with cluster monitoring enabled + ClusterMonitoringLabel = "openshift.io/cluster-monitoring" +) + +type namespaceManager struct { + informer cache.SharedIndexInformer + + // monitoringNamespaces stores namespaces with openshift.io/cluster-monitoring=true + monitoringNamespaces map[string]bool + mu sync.RWMutex +} + +func newNamespaceManager(ctx context.Context, clientset *kubernetes.Clientset) (*namespaceManager, error) { + informer := cache.NewSharedIndexInformer( + namespaceListWatch(clientset.CoreV1()), + &corev1.Namespace{}, + 0, + cache.Indexers{}, + ) + + nm := &namespaceManager{ + informer: informer, + monitoringNamespaces: make(map[string]bool), + mu: sync.RWMutex{}, + } + + _, err := nm.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + ns, ok := obj.(*corev1.Namespace) + if !ok { + return + } + nm.updateMonitoringNamespace(ns) + }, + UpdateFunc: func(oldObj interface{}, newObj interface{}) { + ns, ok := newObj.(*corev1.Namespace) + if !ok { + return + } + nm.updateMonitoringNamespace(ns) + }, + 
DeleteFunc: func(obj interface{}) { + namespaceName, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) + if err != nil { + return + } + nm.removeMonitoringNamespace(namespaceName) + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to add event handler to namespace informer: %w", err) + } + + go nm.informer.Run(ctx.Done()) + + cache.WaitForNamedCacheSync("Namespace informer", ctx.Done(), + nm.informer.HasSynced, + ) + + return nm, nil +} + +func namespaceListWatch(client corev1client.CoreV1Interface) *cache.ListWatch { + return cache.NewFilteredListWatchFromClient( + client.RESTClient(), + "namespaces", + "", + func(options *metav1.ListOptions) { + options.LabelSelector = ClusterMonitoringLabel + "=true" + }, + ) +} + +func (nm *namespaceManager) updateMonitoringNamespace(ns *corev1.Namespace) { + nm.mu.Lock() + defer nm.mu.Unlock() + + if ns.Labels != nil && ns.Labels[ClusterMonitoringLabel] == "true" { + nm.monitoringNamespaces[ns.Name] = true + } else { + delete(nm.monitoringNamespaces, ns.Name) + } +} + +func (nm *namespaceManager) removeMonitoringNamespace(name string) { + nm.mu.Lock() + defer nm.mu.Unlock() + delete(nm.monitoringNamespaces, name) +} + +func (nm *namespaceManager) IsClusterMonitoringNamespace(name string) bool { + nm.mu.RLock() + defer nm.mu.RUnlock() + return nm.monitoringNamespaces[name] +} diff --git a/pkg/k8s/prometheus_alerts.go b/pkg/k8s/prometheus_alerts.go new file mode 100644 index 000000000..adae526fe --- /dev/null +++ b/pkg/k8s/prometheus_alerts.go @@ -0,0 +1,971 @@ +package k8s + +import ( + "context" + "crypto/tls" + "crypto/x509" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "sync" + "time" + + routev1 "github.com/openshift/api/route/v1" + routeclient "github.com/openshift/client-go/route/clientset/versioned" + "github.com/sirupsen/logrus" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + corev1client 
"k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/client-go/rest" +) + +var ( + prometheusLog = logrus.WithField("module", "k8s-prometheus") +) + +const ( + namespaceCacheTTL = 30 * time.Second + serviceHealthTimeout = 5 * time.Second + serviceRequestTimeout = 10 * time.Second + maxTenancyProbeTargets = 3 +) + +type namespaceCache struct { + mu sync.Mutex + expiresAt time.Time + ttl time.Duration + value []string +} + +func newNamespaceCache(ttl time.Duration) *namespaceCache { + return &namespaceCache{ttl: ttl} +} + +func (c *namespaceCache) get() ([]string, bool) { + if c == nil { + return nil, false + } + + c.mu.Lock() + defer c.mu.Unlock() + + if c.expiresAt.IsZero() || time.Now().After(c.expiresAt) { + return nil, false + } + return copyStringSlice(c.value), true +} + +func (c *namespaceCache) set(namespaces []string) { + if c == nil { + return + } + + c.mu.Lock() + defer c.mu.Unlock() + + c.value = copyStringSlice(namespaces) + c.expiresAt = time.Now().Add(c.ttl) +} + +type prometheusAlerts struct { + routeClient routeclient.Interface + coreClient corev1client.CoreV1Interface + config *rest.Config + ruleManager PrometheusRuleInterface + nsCache *namespaceCache + + // thanosTenancyPort caches the resolved port after the first successful + // lookup so that we don't make a K8s API call on every request. 
+ thanosTenancyPortOnce sync.Once + thanosTenancyPort int32 +} + +// GetAlertsRequest holds parameters for filtering alerts +type GetAlertsRequest struct { + // Labels filters alerts by labels + Labels map[string]string + // State filters alerts by state: "firing", "pending", "silenced", or "" for all states + State string +} + +type PrometheusAlert struct { + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations"` + State string `json:"state"` + ActiveAt time.Time `json:"activeAt"` + Value string `json:"value"` + + AlertRuleId string `json:"alertRuleId,omitempty"` + AlertComponent string `json:"alertComponent,omitempty"` + AlertLayer string `json:"alertLayer,omitempty"` + PrometheusRuleName string `json:"prometheusRuleName,omitempty"` + PrometheusRuleNamespace string `json:"prometheusRuleNamespace,omitempty"` + AlertingRuleName string `json:"alertingRuleName,omitempty"` +} + +type prometheusAlertsData struct { + Alerts []PrometheusAlert `json:"alerts"` +} + +type prometheusAlertsResponse struct { + Status string `json:"status"` + Data prometheusAlertsData `json:"data"` +} + +type prometheusRulesData struct { + Groups []PrometheusRuleGroup `json:"groups"` +} + +type prometheusRulesResponse struct { + Status string `json:"status"` + Data prometheusRulesData `json:"data"` +} + +type alertmanagerAlertStatus struct { + State string `json:"state"` +} + +type alertmanagerAlert struct { + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations"` + StartsAt time.Time `json:"startsAt"` + EndsAt time.Time `json:"endsAt"` + GeneratorURL string `json:"generatorURL"` + Status alertmanagerAlertStatus `json:"status"` +} + +func newPrometheusAlerts(routeClient routeclient.Interface, coreClient corev1client.CoreV1Interface, config *rest.Config, ruleManager PrometheusRuleInterface) *prometheusAlerts { + return &prometheusAlerts{ + routeClient: routeClient, + coreClient: coreClient, + config: config, + 
ruleManager: ruleManager, + nsCache: newNamespaceCache(namespaceCacheTTL), + } +} + +func (pa *prometheusAlerts) GetAlerts(ctx context.Context, req GetAlertsRequest) ([]PrometheusAlert, error) { + platformAlerts, err := pa.getAlertsForSource(ctx, PlatformRouteNamespace, PlatformRouteName, PlatformAlertmanagerRouteName, AlertSourcePlatform) + if err != nil { + return nil, err + } + + userAlerts, err := pa.getUserWorkloadAlerts(ctx, req) + if err != nil { + prometheusLog.Warnf("failed to get user workload alerts: %v", err) + } + + mergedAlerts := append(platformAlerts, userAlerts...) + + out := make([]PrometheusAlert, 0, len(mergedAlerts)) + for _, a := range mergedAlerts { + // Filter alerts based on state if provided + if !matchesAlertState(req.State, a.State) { + continue + } + + // Filter alerts based on labels if provided + if !labelsMatch(&req, &a) { + continue + } + + out = append(out, a) + } + return out, nil +} + +func matchesAlertState(requestedState string, alertState string) bool { + if requestedState == "" { + return true + } + if requestedState == "firing" { + return alertState == "firing" || alertState == "silenced" + } + return alertState == requestedState +} + +func (pa *prometheusAlerts) GetRules(ctx context.Context, req GetRulesRequest) ([]PrometheusRuleGroup, error) { + platformRules, err := pa.getRulesViaProxy(ctx, PlatformRouteNamespace, PlatformRouteName, AlertSourcePlatform) + if err != nil { + return nil, err + } + + userRules, err := pa.getUserWorkloadRules(ctx, req) + if err != nil { + prometheusLog.Warnf("failed to get user workload rules: %v", err) + } + + return append(platformRules, userRules...), nil +} + +func (pa *prometheusAlerts) alertingHealth(ctx context.Context) AlertingHealth { + userPrometheus := pa.routeHealth(ctx, UserWorkloadRouteNamespace, UserWorkloadRouteName, PrometheusRulesPath) + if userPrometheus.Status != RouteReachable { + if ok := pa.thanosTenancyReachable(ctx, ThanosQuerierTenancyAlertsPath); ok { + 
userPrometheus.FallbackReachable = true + } + } + + userAlertmanager := pa.routeHealth(ctx, UserWorkloadRouteNamespace, UserWorkloadAlertmanagerRouteName, AlertmanagerAlertsPath) + if userAlertmanager.Status != RouteReachable { + if ok := pa.serviceReachable(ctx, UserWorkloadRouteNamespace, UserWorkloadAlertmanagerRouteName, UserWorkloadAlertmanagerPort, AlertmanagerAlertsPath); ok { + userAlertmanager.FallbackReachable = true + } + } + + platformStack := pa.stackHealth(ctx, PlatformRouteNamespace, PlatformRouteName, PlatformAlertmanagerRouteName) + userWorkloadStack := AlertingStackHealth{ + Prometheus: userPrometheus, + Alertmanager: userAlertmanager, + } + + return AlertingHealth{ + Platform: &platformStack, + UserWorkload: &userWorkloadStack, + } +} + +func (pa *prometheusAlerts) stackHealth(ctx context.Context, namespace string, promRouteName string, amRouteName string) AlertingStackHealth { + return AlertingStackHealth{ + Prometheus: pa.routeHealth(ctx, namespace, promRouteName, PrometheusRulesPath), + Alertmanager: pa.routeHealth(ctx, namespace, amRouteName, AlertmanagerAlertsPath), + } +} + +func (pa *prometheusAlerts) routeHealth(ctx context.Context, namespace string, routeName string, path string) AlertingRouteHealth { + health := AlertingRouteHealth{ + Name: routeName, + Namespace: namespace, + } + + if pa.routeClient == nil { + health.Error = "route client is not configured" + return health + } + + route, err := pa.routeClient.RouteV1().Routes(namespace).Get(ctx, routeName, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + health.Status = RouteNotFound + health.Error = err.Error() + return health + } + health.Error = err.Error() + return health + } + + url := buildRouteURL(route.Spec.Host, route.Spec.Path, path) + client, err := pa.createHTTPClient() + if err != nil { + health.Status = RouteUnreachable + health.Error = err.Error() + return health + } + + if _, err := pa.executeRequest(ctx, client, url); err != nil { + 
health.Status = RouteUnreachable + health.Error = err.Error() + return health + } + + health.Status = RouteReachable + return health +} + +func (pa *prometheusAlerts) getAlertsForSource(ctx context.Context, namespace string, promRouteName string, amRouteName string, source string) ([]PrometheusAlert, error) { + amAlerts, amErr := pa.getAlertmanagerAlerts(ctx, namespace, amRouteName, source) + promAlerts, promErr := pa.getAlertsViaProxy(ctx, namespace, promRouteName, source) + + if amErr == nil { + pending := filterAlertsByState(promAlerts, "pending") + return append(amAlerts, pending...), nil + } + + if promErr != nil { + return nil, promErr + } + + return promAlerts, nil +} + +func (pa *prometheusAlerts) getUserWorkloadAlerts(ctx context.Context, req GetAlertsRequest) ([]PrometheusAlert, error) { + if shouldPreferUserAlertmanager(req.State) { + alerts, err := pa.getUserWorkloadAlertsViaAlertmanager(ctx) + if err == nil { + return alerts, nil + } + prometheusLog.Warnf("failed to get user workload alerts via alertmanager: %v", err) + } + + namespace := namespaceFromLabels(req.Labels) + if namespace != "" { + alerts, err := pa.getAlertsViaThanosTenancy(ctx, namespace, AlertSourceUser) + if err == nil { + return alerts, nil + } + prometheusLog.Warnf("failed to get user workload alerts via thanos tenancy: %v", err) + } + + userNamespaces := pa.userRuleNamespaces(ctx) + if len(userNamespaces) > 0 { + alerts, err := pa.getAlertsViaThanosTenancyNamespaces(ctx, userNamespaces, AlertSourceUser) + if err == nil { + return alerts, nil + } + prometheusLog.Warnf("failed to get user workload alerts via thanos tenancy namespaces: %v", err) + } + + return pa.getAlertsForSource(ctx, UserWorkloadRouteNamespace, UserWorkloadRouteName, UserWorkloadAlertmanagerRouteName, AlertSourceUser) +} + +func shouldPreferUserAlertmanager(state string) bool { + return state == "firing" || state == "silenced" +} + +func (pa *prometheusAlerts) getUserWorkloadAlertsViaAlertmanager(ctx 
context.Context) ([]PrometheusAlert, error) { + alerts, err := pa.getAlertmanagerAlerts(ctx, UserWorkloadRouteNamespace, UserWorkloadAlertmanagerRouteName, AlertSourceUser) + if err != nil { + alerts, err = pa.getAlertmanagerAlertsViaService(ctx, UserWorkloadRouteNamespace, UserWorkloadAlertmanagerRouteName, UserWorkloadAlertmanagerPort, AlertSourceUser) + if err != nil { + return nil, err + } + } + + pending, err := pa.getAlertsViaProxy(ctx, UserWorkloadRouteNamespace, UserWorkloadRouteName, AlertSourceUser) + if err != nil { + pending, err = pa.getPrometheusAlertsViaService(ctx, UserWorkloadRouteNamespace, UserWorkloadPrometheusServiceName, UserWorkloadPrometheusPort, AlertSourceUser) + if err != nil { + return alerts, nil + } + } + + return append(alerts, filterAlertsByState(pending, "pending")...), nil +} + +func (pa *prometheusAlerts) getPrometheusAlertsViaService(ctx context.Context, namespace string, serviceName string, port int32, source string) ([]PrometheusAlert, error) { + if _, hasDeadline := ctx.Deadline(); !hasDeadline { + timeoutCtx, cancel := context.WithTimeout(ctx, serviceRequestTimeout) + defer cancel() + ctx = timeoutCtx + } + + raw, err := pa.getServiceResponse(ctx, namespace, serviceName, port, PrometheusAlertsPath) + if err != nil { + return nil, err + } + + var alertsResp prometheusAlertsResponse + if err := json.Unmarshal(raw, &alertsResp); err != nil { + return nil, fmt.Errorf("decode prometheus response: %w", err) + } + + if alertsResp.Status != "success" { + return nil, fmt.Errorf("prometheus API returned non-success status: %s", alertsResp.Status) + } + + applyAlertMetadata(alertsResp.Data.Alerts, source, AlertBackendProm) + return alertsResp.Data.Alerts, nil +} + +func (pa *prometheusAlerts) getAlertmanagerAlertsViaService(ctx context.Context, namespace string, serviceName string, port int32, source string) ([]PrometheusAlert, error) { + raw, err := pa.getServiceResponse(ctx, namespace, serviceName, port, AlertmanagerAlertsPath) + if 
err != nil { + return nil, err + } + + var amAlerts []alertmanagerAlert + if err := json.Unmarshal(raw, &amAlerts); err != nil { + return nil, fmt.Errorf("decode alertmanager response: %w", err) + } + + converted := make([]PrometheusAlert, 0, len(amAlerts)) + for _, alert := range amAlerts { + state := mapAlertmanagerState(alert.Status.State) + if state == "" { + continue + } + converted = append(converted, PrometheusAlert{ + Labels: alert.Labels, + Annotations: alert.Annotations, + State: state, + ActiveAt: alert.StartsAt, + }) + } + + applyAlertMetadata(converted, source, AlertBackendAM) + if len(converted) == 0 { + return []PrometheusAlert{}, nil + } + return converted, nil +} + +func (pa *prometheusAlerts) serviceReachable(ctx context.Context, namespace string, serviceName string, port int32, path string) bool { + healthCtx, cancel := context.WithTimeout(ctx, serviceHealthTimeout) + defer cancel() + + _, err := pa.getServiceResponse(healthCtx, namespace, serviceName, port, path) + return err == nil +} + +func (pa *prometheusAlerts) getServiceResponse(ctx context.Context, namespace string, serviceName string, port int32, path string) ([]byte, error) { + baseURL := fmt.Sprintf("https://%s.%s.svc:%d", serviceName, namespace, port) + requestURL := fmt.Sprintf("%s%s", baseURL, path) + + client, err := pa.createHTTPClient() + if err != nil { + return nil, err + } + + return pa.executeRequest(ctx, client, requestURL) +} + +func (pa *prometheusAlerts) thanosTenancyReachable(ctx context.Context, path string) bool { + namespaces := pa.userRuleNamespaces(ctx) + if len(namespaces) == 0 { + return false + } + + limit := maxTenancyProbeTargets + if limit <= 0 || limit > len(namespaces) { + limit = len(namespaces) + } + + for i := 0; i < limit; i++ { + healthCtx, cancel := context.WithTimeout(ctx, serviceHealthTimeout) + _, err := pa.getThanosTenancyResponse(healthCtx, path, namespaces[i]) + cancel() + + if err == nil { + return true + } + if isTenancyExpectedError(err) { + 
continue + } + return false + } + + return false +} + +// isTenancyExpectedError returns true for errors that are expected when probing +// Thanos tenancy endpoints across user namespaces — e.g. the namespace has no +// rules (404), the SA lacks access (401/403), or the namespace is not yet +// instrumented. These are skipped; only a network/server error aborts the probe. +func isTenancyExpectedError(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "status 401") || + strings.Contains(msg, "status 403") || + strings.Contains(msg, "status 404") || + strings.Contains(msg, "unauthorized") || + strings.Contains(msg, "forbidden") || + strings.Contains(msg, "not found") +} + +func (pa *prometheusAlerts) getAlertsViaProxy(ctx context.Context, namespace string, routeName string, source string) ([]PrometheusAlert, error) { + raw, err := pa.getPrometheusResponse(ctx, namespace, routeName, PrometheusAlertsPath) + if err != nil { + return nil, err + } + + var alertsResp prometheusAlertsResponse + if err := json.Unmarshal(raw, &alertsResp); err != nil { + return nil, fmt.Errorf("decode prometheus response: %w", err) + } + + if alertsResp.Status != "success" { + return nil, fmt.Errorf("prometheus API returned non-success status: %s", alertsResp.Status) + } + + applyAlertMetadata(alertsResp.Data.Alerts, source, AlertBackendProm) + return alertsResp.Data.Alerts, nil +} + +func (pa *prometheusAlerts) getAlertsViaThanosTenancy(ctx context.Context, namespace string, source string) ([]PrometheusAlert, error) { + raw, err := pa.getThanosTenancyResponse(ctx, ThanosQuerierTenancyAlertsPath, namespace) + if err != nil { + return nil, err + } + + var alertsResp prometheusAlertsResponse + if err := json.Unmarshal(raw, &alertsResp); err != nil { + return nil, fmt.Errorf("decode thanos response: %w", err) + } + + if alertsResp.Status != "success" { + return nil, fmt.Errorf("thanos API returned non-success status: %s", 
alertsResp.Status) + } + + applyAlertMetadata(alertsResp.Data.Alerts, source, AlertBackendThanos) + return alertsResp.Data.Alerts, nil +} + +func (pa *prometheusAlerts) getAlertmanagerAlerts(ctx context.Context, namespace string, routeName string, source string) ([]PrometheusAlert, error) { + raw, err := pa.getPrometheusResponse(ctx, namespace, routeName, AlertmanagerAlertsPath) + if err != nil { + return nil, err + } + + var amAlerts []alertmanagerAlert + if err := json.Unmarshal(raw, &amAlerts); err != nil { + return nil, fmt.Errorf("decode alertmanager response: %w", err) + } + + converted := make([]PrometheusAlert, 0, len(amAlerts)) + for _, alert := range amAlerts { + state := mapAlertmanagerState(alert.Status.State) + if state == "" { + continue + } + converted = append(converted, PrometheusAlert{ + Labels: alert.Labels, + Annotations: alert.Annotations, + State: state, + ActiveAt: alert.StartsAt, + }) + } + + applyAlertMetadata(converted, source, AlertBackendAM) + if len(converted) == 0 { + return []PrometheusAlert{}, nil + } + return converted, nil +} + +func (pa *prometheusAlerts) getUserWorkloadRules(ctx context.Context, req GetRulesRequest) ([]PrometheusRuleGroup, error) { + namespace := namespaceFromLabels(req.Labels) + if namespace != "" { + rules, err := pa.getRulesViaThanosTenancy(ctx, namespace, AlertSourceUser) + if err == nil { + return rules, nil + } + prometheusLog.Warnf("failed to get user workload rules via thanos tenancy: %v", err) + } + + userNamespaces := pa.userRuleNamespaces(ctx) + if len(userNamespaces) > 0 { + groups, err := pa.getRulesViaThanosTenancyNamespaces(ctx, userNamespaces, AlertSourceUser) + if err == nil { + return groups, nil + } + prometheusLog.Warnf("failed to get user workload rules via thanos tenancy namespaces: %v", err) + } + + return pa.getRulesViaProxy(ctx, UserWorkloadRouteNamespace, UserWorkloadRouteName, AlertSourceUser) +} + +func (pa *prometheusAlerts) userRuleNamespaces(ctx context.Context) []string { + if 
cached, ok := pa.nsCache.get(); ok { + return cached + } + + if pa.ruleManager == nil { + namespaces := pa.allNonPlatformNamespaces(ctx) + pa.nsCache.set(namespaces) + return namespaces + } + + prometheusRules, err := pa.ruleManager.List(ctx, "") + if err != nil { + prometheusLog.WithError(err).Warn("failed to list PrometheusRules for user namespace discovery") + namespaces := pa.allNonPlatformNamespaces(ctx) + pa.nsCache.set(namespaces) + return namespaces + } + + namespaces := map[string]struct{}{} + for _, pr := range prometheusRules { + if pr.Namespace == "" { + continue + } + if pr.Namespace == PlatformRouteNamespace || pr.Namespace == UserWorkloadRouteNamespace { + continue + } + namespaces[pr.Namespace] = struct{}{} + } + + out := make([]string, 0, len(namespaces)) + for ns := range namespaces { + out = append(out, ns) + } + pa.nsCache.set(out) + return out +} + +func (pa *prometheusAlerts) allNonPlatformNamespaces(ctx context.Context) []string { + if pa.coreClient == nil { + return nil + } + + namespaceList, err := pa.coreClient.Namespaces().List(ctx, metav1.ListOptions{}) + if err != nil { + prometheusLog.WithError(err).Warn("failed to list namespaces for user namespace discovery") + return nil + } + + out := make([]string, 0, len(namespaceList.Items)) + for _, ns := range namespaceList.Items { + if ns.Name == PlatformRouteNamespace || ns.Name == UserWorkloadRouteNamespace { + continue + } + out = append(out, ns.Name) + } + return out +} + +// fanOutThanosTenancy calls fetch for each namespace, accumulates results, and +// returns combined results (or the last error if nothing succeeded). +func fanOutThanosTenancy[T any](namespaces []string, fetch func(string) ([]T, error)) ([]T, error) { + var out []T + var lastErr error + for _, namespace := range namespaces { + results, err := fetch(namespace) + if err != nil { + lastErr = err + continue + } + out = append(out, results...) 
+ } + if len(out) > 0 { + return out, nil + } + return out, lastErr +} + +func (pa *prometheusAlerts) getAlertsViaThanosTenancyNamespaces(ctx context.Context, namespaces []string, source string) ([]PrometheusAlert, error) { + return fanOutThanosTenancy(namespaces, func(ns string) ([]PrometheusAlert, error) { + return pa.getAlertsViaThanosTenancy(ctx, ns, source) + }) +} + +func (pa *prometheusAlerts) getRulesViaThanosTenancyNamespaces(ctx context.Context, namespaces []string, source string) ([]PrometheusRuleGroup, error) { + return fanOutThanosTenancy(namespaces, func(ns string) ([]PrometheusRuleGroup, error) { + return pa.getRulesViaThanosTenancy(ctx, ns, source) + }) +} + +func (pa *prometheusAlerts) getRulesViaProxy(ctx context.Context, namespace string, routeName string, source string) ([]PrometheusRuleGroup, error) { + raw, err := pa.getPrometheusResponse(ctx, namespace, routeName, PrometheusRulesPath) + if err != nil { + return nil, err + } + + var rulesResp prometheusRulesResponse + if err := json.Unmarshal(raw, &rulesResp); err != nil { + return nil, fmt.Errorf("decode prometheus response: %w", err) + } + + if rulesResp.Status != "success" { + return nil, fmt.Errorf("prometheus API returned non-success status: %s", rulesResp.Status) + } + + applyRuleSource(rulesResp.Data.Groups, source) + return rulesResp.Data.Groups, nil +} + +func (pa *prometheusAlerts) getRulesViaThanosTenancy(ctx context.Context, namespace string, source string) ([]PrometheusRuleGroup, error) { + raw, err := pa.getThanosTenancyResponse(ctx, ThanosQuerierTenancyRulesPath, namespace) + if err != nil { + return nil, err + } + + var rulesResp prometheusRulesResponse + if err := json.Unmarshal(raw, &rulesResp); err != nil { + return nil, fmt.Errorf("decode thanos response: %w", err) + } + + if rulesResp.Status != "success" { + return nil, fmt.Errorf("thanos API returned non-success status: %s", rulesResp.Status) + } + + applyRuleSource(rulesResp.Data.Groups, source) + return 
rulesResp.Data.Groups, nil +} + +func (pa *prometheusAlerts) getPrometheusResponse(ctx context.Context, namespace string, routeName string, path string) ([]byte, error) { + url, err := pa.buildPrometheusURL(ctx, namespace, routeName, path) + if err != nil { + return nil, err + } + client, err := pa.createHTTPClient() + if err != nil { + return nil, err + } + + return pa.executeRequest(ctx, client, url) +} + +func (pa *prometheusAlerts) getThanosTenancyResponse(ctx context.Context, path string, namespace string) ([]byte, error) { + if namespace == "" { + return nil, fmt.Errorf("namespace is required for thanos tenancy requests") + } + + port := pa.resolveThanosTenancyRulesPort(ctx) + baseURL := fmt.Sprintf("https://%s.%s.svc:%d", ThanosQuerierServiceName, ThanosQuerierNamespace, port) + requestURL := fmt.Sprintf("%s%s?namespace=%s", baseURL, path, url.QueryEscape(namespace)) + + client, err := pa.createHTTPClient() + if err != nil { + return nil, err + } + + return pa.executeRequest(ctx, client, requestURL) +} + +func (pa *prometheusAlerts) resolveThanosTenancyRulesPort(ctx context.Context) int32 { + pa.thanosTenancyPortOnce.Do(func() { + pa.thanosTenancyPort = pa.lookupThanosTenancyRulesPort(ctx) + }) + return pa.thanosTenancyPort +} + +func (pa *prometheusAlerts) lookupThanosTenancyRulesPort(ctx context.Context) int32 { + if pa.coreClient == nil { + return DefaultThanosQuerierTenancyRulesPort + } + + service, err := pa.coreClient.Services(ThanosQuerierNamespace).Get(ctx, ThanosQuerierServiceName, metav1.GetOptions{}) + if err != nil { + prometheusLog.WithError(err).Warnf("failed to resolve thanos-querier %s port, falling back to default %d", ThanosQuerierTenancyRulesPortName, DefaultThanosQuerierTenancyRulesPort) + return DefaultThanosQuerierTenancyRulesPort + } + + for _, port := range service.Spec.Ports { + if port.Name == ThanosQuerierTenancyRulesPortName && port.Port > 0 { + return port.Port + } + } + + prometheusLog.Warnf("thanos-querier service missing %s 
port, falling back to default %d", ThanosQuerierTenancyRulesPortName, DefaultThanosQuerierTenancyRulesPort) + return DefaultThanosQuerierTenancyRulesPort +} + +func (pa *prometheusAlerts) buildPrometheusURL(ctx context.Context, namespace string, routeName string, path string) (string, error) { + route, err := pa.fetchPrometheusRoute(ctx, namespace, routeName) + if err != nil { + return "", err + } + + return buildRouteURL(route.Spec.Host, route.Spec.Path, path), nil +} + +func (pa *prometheusAlerts) fetchPrometheusRoute(ctx context.Context, namespace string, routeName string) (*routev1.Route, error) { + if pa.routeClient == nil { + return nil, fmt.Errorf("route client is not configured") + } + + route, err := pa.routeClient.RouteV1().Routes(namespace).Get(ctx, routeName, metav1.GetOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to get prometheus route: %w", err) + } + + return route, nil +} + +func applyAlertMetadata(alerts []PrometheusAlert, source, backend string) { + for i := range alerts { + if alerts[i].Labels == nil { + alerts[i].Labels = map[string]string{} + } + alerts[i].Labels[AlertSourceLabel] = source + alerts[i].Labels[AlertBackendLabel] = backend + } +} + +func applyRuleSource(groups []PrometheusRuleGroup, source string) { + for gi := range groups { + for ri := range groups[gi].Rules { + rule := &groups[gi].Rules[ri] + if rule.Labels == nil { + rule.Labels = map[string]string{} + } + rule.Labels[AlertSourceLabel] = source + for ai := range rule.Alerts { + if rule.Alerts[ai].Labels == nil { + rule.Alerts[ai].Labels = map[string]string{} + } + rule.Alerts[ai].Labels[AlertSourceLabel] = source + } + } + } +} + +func filterAlertsByState(alerts []PrometheusAlert, state string) []PrometheusAlert { + out := make([]PrometheusAlert, 0, len(alerts)) + for _, alert := range alerts { + if alert.State == state { + out = append(out, alert) + } + } + return out +} + +func mapAlertmanagerState(state string) string { + if state == "active" { + return 
"firing" + } + if state == "suppressed" { + return "silenced" + } + return "" +} + +func buildRouteURL(host string, routePath string, requestPath string) string { + basePath := strings.TrimSuffix(routePath, "/") + if basePath == "" { + return fmt.Sprintf("https://%s%s", host, requestPath) + } + if requestPath == basePath || strings.HasPrefix(requestPath, basePath+"/") { + return fmt.Sprintf("https://%s%s", host, requestPath) + } + return fmt.Sprintf("https://%s%s%s", host, basePath, requestPath) +} + +func namespaceFromLabels(labels map[string]string) string { + if labels == nil { + return "" + } + return strings.TrimSpace(labels["namespace"]) +} + +func (pa *prometheusAlerts) createHTTPClient() (*http.Client, error) { + tlsConfig, err := pa.buildTLSConfig() + if err != nil { + return nil, err + } + + return &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: tlsConfig, + }, + }, nil +} + +func (pa *prometheusAlerts) buildTLSConfig() (*tls.Config, error) { + caCertPool, err := pa.loadCACertPool() + if err != nil { + return nil, err + } + + return &tls.Config{ + MinVersion: tls.VersionTLS12, + RootCAs: caCertPool, + }, nil +} + +func (pa *prometheusAlerts) loadCACertPool() (*x509.CertPool, error) { + caCertPool, err := x509.SystemCertPool() + if err != nil { + caCertPool = x509.NewCertPool() + } + + if len(pa.config.CAData) > 0 { + caCertPool.AppendCertsFromPEM(pa.config.CAData) + return caCertPool, nil + } + + if pa.config.CAFile != "" { + caCert, err := os.ReadFile(pa.config.CAFile) + if err != nil { + return nil, fmt.Errorf("read CA cert file: %w", err) + } + caCertPool.AppendCertsFromPEM(caCert) + } + + // OpenShift service CA bundle for in-cluster service certs. 
// copyStringSlice returns an independent copy of in. The result is always
// non-nil, even for a nil or empty input — presumably so JSON encoding
// yields [] rather than null (callers should confirm); the non-nil contract
// is preserved here because make always returns a non-nil slice.
func copyStringSlice(in []string) []string {
	out := make([]string, len(in))
	copy(out, in)
	return out
}
+ } + + return body, nil +} + +func labelsMatch(req *GetAlertsRequest, alert *PrometheusAlert) bool { + for key, value := range req.Labels { + if alertValue, exists := alert.Labels[key]; !exists || alertValue != value { + return false + } + } + + return true +} diff --git a/pkg/k8s/prometheus_rule.go b/pkg/k8s/prometheus_rule.go new file mode 100644 index 000000000..48e7bae93 --- /dev/null +++ b/pkg/k8s/prometheus_rule.go @@ -0,0 +1,154 @@ +package k8s + +import ( + "context" + "fmt" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + monitoringv1client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/cache" +) + +type prometheusRuleManager struct { + clientset *monitoringv1client.Clientset + informer cache.SharedIndexInformer +} + +func newPrometheusRuleManager(ctx context.Context, clientset *monitoringv1client.Clientset) *prometheusRuleManager { + informer := cache.NewSharedIndexInformer( + prometheusRuleListWatchForAllNamespaces(clientset), + &monitoringv1.PrometheusRule{}, + 0, + cache.Indexers{}, + ) + + go informer.Run(ctx.Done()) + + cache.WaitForNamedCacheSync("PrometheusRule informer", ctx.Done(), + informer.HasSynced, + ) + + return &prometheusRuleManager{ + clientset: clientset, + informer: informer, + } +} + +func prometheusRuleListWatchForAllNamespaces(clientset *monitoringv1client.Clientset) *cache.ListWatch { + return cache.NewListWatchFromClient(clientset.MonitoringV1().RESTClient(), "prometheusrules", "", fields.Everything()) +} + +func (prm *prometheusRuleManager) List(ctx context.Context, namespace string) ([]monitoringv1.PrometheusRule, error) { + prs := prm.informer.GetStore().List() + + prometheusRules := make([]monitoringv1.PrometheusRule, 0, len(prs)) + for _, item := range prs { + pr, ok := 
item.(*monitoringv1.PrometheusRule) + if !ok { + continue + } + prometheusRules = append(prometheusRules, *pr) + } + + return prometheusRules, nil +} + +func (prm *prometheusRuleManager) Get(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + pr, err := prm.clientset.MonitoringV1().PrometheusRules(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return nil, false, nil + } + + return nil, false, err + } + + return pr, true, nil +} + +func (prm *prometheusRuleManager) Update(ctx context.Context, pr monitoringv1.PrometheusRule) error { + _, err := prm.clientset.MonitoringV1().PrometheusRules(pr.Namespace).Update(ctx, &pr, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update PrometheusRule %s/%s: %w", pr.Namespace, pr.Name, err) + } + + return nil +} + +func (prm *prometheusRuleManager) Delete(ctx context.Context, namespace string, name string) error { + err := prm.clientset.MonitoringV1().PrometheusRules(namespace).Delete(ctx, name, metav1.DeleteOptions{}) + if err != nil { + return fmt.Errorf("failed to delete PrometheusRule %s: %w", name, err) + } + + return nil +} + +func (prm *prometheusRuleManager) AddRule(ctx context.Context, namespacedName types.NamespacedName, groupName string, rule monitoringv1.Rule) error { + pr, err := prm.getOrCreatePrometheusRule(ctx, namespacedName) + if err != nil { + return err + } + + // Find or create the group + var group *monitoringv1.RuleGroup + for i := range pr.Spec.Groups { + if pr.Spec.Groups[i].Name == groupName { + group = &pr.Spec.Groups[i] + break + } + } + if group == nil { + pr.Spec.Groups = append(pr.Spec.Groups, monitoringv1.RuleGroup{ + Name: groupName, + Rules: []monitoringv1.Rule{}, + }) + group = &pr.Spec.Groups[len(pr.Spec.Groups)-1] + } + + // Add the new rule to the group + group.Rules = append(group.Rules, rule) + + _, err = 
prm.clientset.MonitoringV1().PrometheusRules(namespacedName.Namespace).Update(ctx, pr, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update PrometheusRule %s/%s: %w", namespacedName.Namespace, namespacedName.Name, err) + } + + return nil +} + +func (prm *prometheusRuleManager) getOrCreatePrometheusRule(ctx context.Context, namespacedName types.NamespacedName) (*monitoringv1.PrometheusRule, error) { + pr, err := prm.clientset.MonitoringV1().PrometheusRules(namespacedName.Namespace).Get(ctx, namespacedName.Name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return prm.createPrometheusRule(ctx, namespacedName) + } + + return nil, fmt.Errorf("failed to get PrometheusRule %s/%s: %w", namespacedName.Namespace, namespacedName.Name, err) + } + + return pr, nil +} + +func (prm *prometheusRuleManager) createPrometheusRule(ctx context.Context, namespacedName types.NamespacedName) (*monitoringv1.PrometheusRule, error) { + pr := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespacedName.Name, + Namespace: namespacedName.Namespace, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{}, + }, + } + + pr, err := prm.clientset.MonitoringV1().PrometheusRules(namespacedName.Namespace).Create(ctx, pr, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create PrometheusRule %s/%s: %w", namespacedName.Namespace, namespacedName.Name, err) + } + + return pr, nil +} diff --git a/pkg/k8s/prometheus_rules_types.go b/pkg/k8s/prometheus_rules_types.go new file mode 100644 index 000000000..b44ea6ab2 --- /dev/null +++ b/pkg/k8s/prometheus_rules_types.go @@ -0,0 +1,52 @@ +package k8s + +import ( + "encoding/json" + "time" +) + +const ( + RuleTypeAlerting = "alerting" + RuleTypeRecording = "recording" +) + +// GetRulesRequest holds parameters for filtering rules alerts. 
+type GetRulesRequest struct { + // Labels filters alerts by labels + Labels map[string]string + // State filters alerts by state: "firing", "pending", "silenced", or "" for all states + State string +} + +// PrometheusRuleGroup models a rule group from the Prometheus alerting API. +type PrometheusRuleGroup struct { + Name string `json:"name"` + File string `json:"file,omitempty"` + Interval json.RawMessage `json:"interval,omitempty"` + Rules []PrometheusRule `json:"rules"` +} + +// PrometheusRule models a rule entry from the Prometheus alerting API. +type PrometheusRule struct { + Name string `json:"name,omitempty"` + Query string `json:"query,omitempty"` + Duration float64 `json:"duration,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + Annotations map[string]string `json:"annotations,omitempty"` + Alerts []PrometheusRuleAlert `json:"alerts,omitempty"` + Health string `json:"health,omitempty"` + Type string `json:"type,omitempty"` + LastError string `json:"lastError,omitempty"` + EvaluationTime float64 `json:"evaluationTime,omitempty"` + LastEvaluation time.Time `json:"lastEvaluation,omitempty"` +} + +// PrometheusRuleAlert models an alert entry within a rule from the Prometheus alerting API. 
+type PrometheusRuleAlert struct { + Labels map[string]string `json:"labels"` + Annotations map[string]string `json:"annotations,omitempty"` + State string `json:"state"` + ActiveAt time.Time `json:"activeAt"` + Value string `json:"value"` + KeepFiringSince time.Time `json:"keepFiringSince,omitempty"` +} diff --git a/pkg/k8s/relabeled_rules.go b/pkg/k8s/relabeled_rules.go new file mode 100644 index 000000000..9c1366c71 --- /dev/null +++ b/pkg/k8s/relabeled_rules.go @@ -0,0 +1,447 @@ +package k8s + +import ( + "context" + "crypto/sha256" + "fmt" + "strings" + "sync" + "time" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + monitoringv1client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/relabel" + "gopkg.in/yaml.v2" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" +) + +const ( + resyncPeriod = 15 * time.Minute + queueBaseDelay = 50 * time.Millisecond + queueMaxDelay = 3 * time.Minute + + AlertRelabelConfigSecretName = "alert-relabel-configs" + AlertRelabelConfigSecretKey = "config.yaml" + + PrometheusRuleLabelNamespace = "openshift_io_prometheus_rule_namespace" + PrometheusRuleLabelName = "openshift_io_prometheus_rule_name" + AlertRuleLabelId = "openshift_io_alert_rule_id" + + AlertRuleClassificationComponentKey = "openshift_io_alert_rule_component" + AlertRuleClassificationLayerKey = "openshift_io_alert_rule_layer" + + AppKubernetesIoComponent = "app.kubernetes.io/component" + AppKubernetesIoComponentAlertManagementApi = "alert-management-api" + AppKubernetesIoComponentMonitoringPlugin = "monitoring-plugin" +) + +type 
relabeledRulesManager struct { + queue workqueue.TypedRateLimitingInterface[string] + + namespaceManager NamespaceInterface + alertRelabelConfigs AlertRelabelConfigInterface + prometheusRulesInformer cache.SharedIndexInformer + secretInformer cache.SharedIndexInformer + configMapInformer cache.SharedIndexInformer + clientset kubernetes.Interface + + // relabeledRules stores the relabeled rules in memory + relabeledRules map[string]monitoringv1.Rule + relabelConfigs []*relabel.Config + mu sync.RWMutex +} + +func newRelabeledRulesManager(ctx context.Context, namespaceManager NamespaceInterface, alertRelabelConfigs AlertRelabelConfigInterface, monitoringv1clientset *monitoringv1client.Clientset, clientset *kubernetes.Clientset) (*relabeledRulesManager, error) { + prometheusRulesInformer := cache.NewSharedIndexInformer( + prometheusRuleListWatchForAllNamespaces(monitoringv1clientset), + &monitoringv1.PrometheusRule{}, + resyncPeriod, + cache.Indexers{}, + ) + + secretInformer := cache.NewSharedIndexInformer( + alertRelabelConfigSecretListWatch(clientset, ClusterMonitoringNamespace), + &corev1.Secret{}, + resyncPeriod, + cache.Indexers{}, + ) + + queue := workqueue.NewTypedRateLimitingQueueWithConfig( + workqueue.NewTypedItemExponentialFailureRateLimiter[string](queueBaseDelay, queueMaxDelay), + workqueue.TypedRateLimitingQueueConfig[string]{Name: "relabeled-rules"}, + ) + + rrm := &relabeledRulesManager{ + queue: queue, + namespaceManager: namespaceManager, + alertRelabelConfigs: alertRelabelConfigs, + prometheusRulesInformer: prometheusRulesInformer, + secretInformer: secretInformer, + } + + _, err := rrm.prometheusRulesInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + promRule, ok := obj.(*monitoringv1.PrometheusRule) + if !ok { + return + } + log.Debugf("prometheus rule added: %s/%s", promRule.Namespace, promRule.Name) + rrm.queue.Add("prometheus-rule-sync") + }, + UpdateFunc: func(oldObj interface{}, newObj interface{}) 
{ + promRule, ok := newObj.(*monitoringv1.PrometheusRule) + if !ok { + return + } + log.Debugf("prometheus rule updated: %s/%s", promRule.Namespace, promRule.Name) + rrm.queue.Add("prometheus-rule-sync") + }, + DeleteFunc: func(obj interface{}) { + if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok { + obj = tombstone.Obj + } + + promRule, ok := obj.(*monitoringv1.PrometheusRule) + if !ok { + return + } + log.Debugf("prometheus rule deleted: %s/%s", promRule.Namespace, promRule.Name) + rrm.queue.Add("prometheus-rule-sync") + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to add event handler to prometheus rules informer: %w", err) + } + + _, err = rrm.secretInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + rrm.queue.Add("secret-sync") + }, + UpdateFunc: func(oldObj interface{}, newObj interface{}) { + rrm.queue.Add("secret-sync") + }, + DeleteFunc: func(obj interface{}) { + rrm.queue.Add("secret-sync") + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to add event handler to secret informer: %w", err) + } + + go rrm.prometheusRulesInformer.Run(ctx.Done()) + go rrm.secretInformer.Run(ctx.Done()) + + cache.WaitForNamedCacheSync("RelabeledRulesConfig informer", ctx.Done(), + rrm.prometheusRulesInformer.HasSynced, + rrm.secretInformer.HasSynced, + ) + + go rrm.worker(ctx) + rrm.queue.Add("initial-sync") + + return rrm, nil +} + +func alertRelabelConfigSecretListWatch(clientset *kubernetes.Clientset, namespace string) *cache.ListWatch { + return cache.NewListWatchFromClient( + clientset.CoreV1().RESTClient(), + "secrets", + namespace, + fields.OneTermEqualSelector("metadata.name", AlertRelabelConfigSecretName), + ) +} + +func (rrm *relabeledRulesManager) worker(ctx context.Context) { + for rrm.processNextWorkItem(ctx) { + } +} + +func (rrm *relabeledRulesManager) processNextWorkItem(ctx context.Context) bool { + key, quit := rrm.queue.Get() + if quit { + return false + } + + defer 
rrm.queue.Done(key) + + if err := rrm.sync(ctx); err != nil { + log.Errorf("error syncing relabeled rules: %v", err) + rrm.queue.AddRateLimited(key) + return true + } + + rrm.queue.Forget(key) + + return true +} + +func (rrm *relabeledRulesManager) sync(ctx context.Context) error { + relabelConfigs, err := rrm.loadRelabelConfigs() + if err != nil { + return fmt.Errorf("failed to load relabel configs: %w", err) + } + + rrm.mu.Lock() + rrm.relabelConfigs = relabelConfigs + rrm.mu.Unlock() + + alerts := rrm.collectAlerts(ctx, relabelConfigs) + + rrm.mu.Lock() + rrm.relabeledRules = alerts + rrm.mu.Unlock() + + log.Infof("Synced %d relabeled rules in memory", len(alerts)) + return nil +} + +func (rrm *relabeledRulesManager) loadRelabelConfigs() ([]*relabel.Config, error) { + storeKey := fmt.Sprintf("%s/%s", ClusterMonitoringNamespace, AlertRelabelConfigSecretName) + obj, exists, err := rrm.secretInformer.GetStore().GetByKey(storeKey) + if err != nil { + return nil, fmt.Errorf("failed to get secret from store: %w", err) + } + if !exists { + log.Infof("Alert relabel config secret %q not found", storeKey) + return nil, nil + } + + secret, ok := obj.(*corev1.Secret) + if !ok { + return nil, fmt.Errorf("unexpected object type in secret store: %T", obj) + } + + configData, ok := secret.Data[AlertRelabelConfigSecretKey] + if !ok { + return nil, fmt.Errorf("no config data found in secret %q", AlertRelabelConfigSecretName) + } + + var raw []*relabel.Config + if err := yaml.Unmarshal(configData, &raw); err != nil { + return nil, fmt.Errorf("failed to unmarshal relabel configs: %w", err) + } + + configs := make([]*relabel.Config, 0, len(raw)) + for i, config := range raw { + if config == nil { + log.Warnf("skipping nil relabel config entry at index %d", i) + continue + } + if config.NameValidationScheme == model.UnsetValidation { + config.NameValidationScheme = model.UTF8Validation + } + if err := config.Validate(model.UTF8Validation); err != nil { + return nil, 
fmt.Errorf("invalid relabel config at index %d: %w", i, err) + } + configs = append(configs, config) + } + + log.Infof("Loaded %d relabel configs from secret %s", len(configs), storeKey) + return configs, nil +} + +func (rrm *relabeledRulesManager) collectAlerts(ctx context.Context, relabelConfigs []*relabel.Config) map[string]monitoringv1.Rule { + alerts := make(map[string]monitoringv1.Rule) + seenIDs := make(map[string]struct{}) + + for _, obj := range rrm.prometheusRulesInformer.GetStore().List() { + promRule, ok := obj.(*monitoringv1.PrometheusRule) + if !ok { + continue + } + + // Skip deleted rules + if promRule.DeletionTimestamp != nil { + continue + } + + for _, group := range promRule.Spec.Groups { + for _, rule := range group.Rules { + // Only process alerting rules (skip recording rules) + if rule.Alert == "" { + continue + } + + // Compute a deterministic id from the rule spec. + // Do not trust any user-provided value in openshift_io_alert_rule_id since + // PrometheusRule content (including labels) can be tampered with. + alertRuleId := alertrule.GetAlertingRuleId(&rule) + if _, exists := seenIDs[alertRuleId]; exists { + // A second rule that computes to the same id is ambiguous/unsupported (a "true clone"). + // Don't silently overwrite the first rule in the cache. + log.Warnf("Duplicate alert rule id %q computed for %s/%s (alert=%q); skipping duplicate", alertRuleId, promRule.Namespace, promRule.Name, rule.Alert) + continue + } + seenIDs[alertRuleId] = struct{}{} + + if rule.Labels == nil { + rule.Labels = make(map[string]string) + } + + rule.Labels[managementlabels.AlertNameLabel] = rule.Alert + + if rrm.namespaceManager.IsClusterMonitoringNamespace(promRule.Namespace) { + // Relabel the alert labels + relabeledLabels, keep := relabel.Process(labels.FromMap(rule.Labels), relabelConfigs...) 
+ if !keep { + // Alert was dropped by relabeling, skip it + log.Infof("Skipping dropped alert %s from %s/%s", rule.Alert, promRule.Namespace, promRule.Name) + continue + } + + // Update the alert labels + rule.Labels = relabeledLabels.Map() + } + + rule.Labels[AlertRuleLabelId] = alertRuleId + rule.Labels[PrometheusRuleLabelNamespace] = promRule.Namespace + rule.Labels[PrometheusRuleLabelName] = promRule.Name + + if arName := alertingRuleOwner(promRule); arName != "" { + rule.Labels[managementlabels.AlertingRuleLabelName] = arName + } + + ruleManagedBy, relabelConfigManagedBy := rrm.determineManagedBy(ctx, promRule, alertRuleId) + if ruleManagedBy != "" { + rule.Labels[managementlabels.RuleManagedByLabel] = ruleManagedBy + } + if relabelConfigManagedBy != "" { + rule.Labels[managementlabels.RelabelConfigManagedByLabel] = relabelConfigManagedBy + } + + alerts[alertRuleId] = rule + } + } + } + + log.Debugf("Collected %d alerts", len(alerts)) + return alerts +} + +// alertingRuleOwner returns the name of the AlertingRule CR that generated +// this PrometheusRule, or "" if it was not generated by one. Detection is based +// on the ownerReferences set by the alerting-rules-controller. 
// GetAlertRelabelConfigName builds a deterministic AlertRelabelConfig name
// from a PrometheusRule name and an alert rule id: the rule name is reduced
// to DNS-friendly characters and the id is replaced by a stable 12-character
// hash suffix so the result stays short and collision-resistant.
func GetAlertRelabelConfigName(promRuleName string, alertRuleId string) string {
	return fmt.Sprintf("arc-%s-%s", sanitizeDNSName(promRuleName), shortHash(alertRuleId, 12))
}

// sanitizeDNSName lowercases in, replaces every character outside [a-z0-9]
// (including '-') with a single separator '-', collapses runs of separators,
// and trims leading/trailing '-'. Returns "arc" when nothing valid remains so
// callers never get an empty name component; returns "" only for empty input.
func sanitizeDNSName(in string) string {
	if in == "" {
		return ""
	}

	var b strings.Builder
	b.Grow(len(in))
	pendingDash := false
	for _, r := range strings.ToLower(in) {
		if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') {
			b.WriteRune(r)
			pendingDash = false
			continue
		}
		// Fix: collapse runs of separators completely. The previous
		// single-pass strings.ReplaceAll("--", "-") only halved runs,
		// so e.g. "a---b" produced "a--b" instead of "a-b".
		if !pendingDash {
			b.WriteByte('-')
			pendingDash = true
		}
	}

	res := strings.Trim(b.String(), "-")
	if res == "" {
		return "arc"
	}
	return res
}

// shortHash returns the first n hex characters of the SHA-256 digest of id,
// or the full 64-character digest when n exceeds its length.
func shortHash(id string, n int) string {
	sum := sha256.Sum256([]byte(id))
	full := fmt.Sprintf("%x", sum[:])
	if n > len(full) {
		return full
	}
	return full[:n]
}
managementlabels.ManagedByOperator + } else if gitOpsManaged { + ruleManagedBy = managementlabels.ManagedByGitOps + } + } + + // Determine relabelConfigManagedBy only for platform rules + isPlatform := rrm.namespaceManager.IsClusterMonitoringNamespace(promRule.Namespace) + var relabelConfigManagedBy string + if isPlatform && rrm.alertRelabelConfigs != nil { + arcName := GetAlertRelabelConfigName(promRule.Name, alertRuleId) + arc, found, err := rrm.alertRelabelConfigs.Get(ctx, promRule.Namespace, arcName) + if err == nil && found { + if IsManagedByGitOps(arc.Annotations, arc.Labels) { + relabelConfigManagedBy = managementlabels.ManagedByGitOps + } + } + } + + return ruleManagedBy, relabelConfigManagedBy +} + +// DetermineManagedByForTesting creates a minimal relabeledRulesManager for testing purposes +func DetermineManagedByForTesting(ctx context.Context, alertRelabelConfigs AlertRelabelConfigInterface, namespaceManager NamespaceInterface, promRule *monitoringv1.PrometheusRule, alertRuleId string) (string, string) { + rrm := &relabeledRulesManager{ + alertRelabelConfigs: alertRelabelConfigs, + namespaceManager: namespaceManager, + } + return rrm.determineManagedBy(ctx, promRule, alertRuleId) +} + +func (rrm *relabeledRulesManager) List(ctx context.Context) []monitoringv1.Rule { + rrm.mu.RLock() + defer rrm.mu.RUnlock() + + var result []monitoringv1.Rule + for _, rule := range rrm.relabeledRules { + result = append(result, rule) + } + + return result +} + +func (rrm *relabeledRulesManager) Get(ctx context.Context, id string) (monitoringv1.Rule, bool) { + rrm.mu.RLock() + defer rrm.mu.RUnlock() + + rule, ok := rrm.relabeledRules[id] + if !ok { + return monitoringv1.Rule{}, false + } + + return rule, true +} + +func (rrm *relabeledRulesManager) Config() []*relabel.Config { + rrm.mu.RLock() + defer rrm.mu.RUnlock() + + return append([]*relabel.Config{}, rrm.relabelConfigs...) 
+} diff --git a/pkg/k8s/types.go b/pkg/k8s/types.go new file mode 100644 index 000000000..3cc8176dc --- /dev/null +++ b/pkg/k8s/types.go @@ -0,0 +1,170 @@ +package k8s + +import ( + "context" + + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/prometheus/model/relabel" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" +) + +// ClientOptions holds configuration options for creating a Kubernetes client +type ClientOptions struct { + // KubeconfigPath specifies the path to the kubeconfig file for remote connections + // If empty, will try default locations or in-cluster config + KubeconfigPath string +} + +// Client defines the contract for Kubernetes client operations +type Client interface { + // TestConnection tests the connection to the Kubernetes cluster + TestConnection(ctx context.Context) error + + // AlertingHealth returns alerting route and stack health details + AlertingHealth(ctx context.Context) (AlertingHealth, error) + + // PrometheusAlerts retrieves active Prometheus alerts + PrometheusAlerts() PrometheusAlertsInterface + + // PrometheusRules returns the PrometheusRule interface + PrometheusRules() PrometheusRuleInterface + + // AlertRelabelConfigs returns the AlertRelabelConfig interface + AlertRelabelConfigs() AlertRelabelConfigInterface + + // AlertingRules returns the AlertingRule interface + AlertingRules() AlertingRuleInterface + + // RelabeledRules returns the RelabeledRules interface + RelabeledRules() RelabeledRulesInterface + + // Namespace returns the Namespace interface + Namespace() NamespaceInterface + + // ConfigMaps returns the ConfigMap interface + ConfigMaps() ConfigMapInterface +} + +// RouteStatus describes the availability state of a monitoring route. 
+type RouteStatus string + +const ( + RouteNotFound RouteStatus = "notFound" + RouteUnreachable RouteStatus = "unreachable" + RouteReachable RouteStatus = "reachable" +) + +// AlertingRouteHealth describes route availability and reachability. +type AlertingRouteHealth struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + Status RouteStatus `json:"status"` + FallbackReachable bool `json:"fallbackReachable,omitempty"` + Error string `json:"error,omitempty"` +} + +// AlertingStackHealth describes alerting health for a monitoring stack. +type AlertingStackHealth struct { + Prometheus AlertingRouteHealth `json:"prometheus"` + Alertmanager AlertingRouteHealth `json:"alertmanager"` +} + +// AlertingHealth provides alerting health details for platform and user workload stacks. +type AlertingHealth struct { + Platform *AlertingStackHealth `json:"platform"` + UserWorkloadEnabled bool `json:"userWorkloadEnabled"` + UserWorkload *AlertingStackHealth `json:"userWorkload"` +} + +// PrometheusAlertsInterface defines operations for managing PrometheusAlerts +type PrometheusAlertsInterface interface { + // GetAlerts retrieves Prometheus alerts with optional state filtering + GetAlerts(ctx context.Context, req GetAlertsRequest) ([]PrometheusAlert, error) + // GetRules retrieves Prometheus alerting rules and active alerts + GetRules(ctx context.Context, req GetRulesRequest) ([]PrometheusRuleGroup, error) +} + +// PrometheusRuleInterface defines operations for managing PrometheusRules +type PrometheusRuleInterface interface { + // List lists all PrometheusRules in the cluster + List(ctx context.Context, namespace string) ([]monitoringv1.PrometheusRule, error) + + // Get retrieves a PrometheusRule by namespace and name + Get(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) + + // Update updates an existing PrometheusRule + Update(ctx context.Context, pr monitoringv1.PrometheusRule) error + + // Delete deletes a 
// RelabeledRulesInterface defines read-only access to the in-memory cache of
// relabeled rules maintained by the relabeled-rules manager.
type RelabeledRulesInterface interface {
	// List retrieves all cached relabeled rules; the method takes no
	// PrometheusRule and results are cluster-wide, keyed only internally.
	List(ctx context.Context) []monitoringv1.Rule

	// Get retrieves the relabeled rule for a given id
	Get(ctx context.Context, id string) (monitoringv1.Rule, bool)

	// Config returns the list of alert relabel configs
	Config() []*relabel.Config
}
DefaultThanosQuerierTenancyRulesPort = 9093 + ThanosQuerierTenancyAlertsPath = "/api/v1/alerts" + ThanosQuerierTenancyRulesPath = "/api/v1/rules" + ServiceCAPath = "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" + + AlertSourceLabel = "openshift_io_alert_source" + AlertSourcePlatform = "platform" + AlertSourceUser = "user" + AlertBackendLabel = "openshift_io_alert_backend" + AlertBackendAM = "alertmanager" + AlertBackendProm = "prometheus" + AlertBackendThanos = "thanos" +) diff --git a/pkg/management/alert_rule_id_match.go b/pkg/management/alert_rule_id_match.go new file mode 100644 index 000000000..8e11d9047 --- /dev/null +++ b/pkg/management/alert_rule_id_match.go @@ -0,0 +1,16 @@ +package management + +import ( + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +// ruleMatchesAlertRuleID returns true when the provided rule's computed, deterministic +// alert rule id matches the requested id. +// +// Note: we intentionally compute the id from the rule spec rather than trusting any +// label value, since labels can be user-controlled/tampered with. 
+func ruleMatchesAlertRuleID(rule monitoringv1.Rule, alertRuleId string) bool { + return alertRuleId != "" && alertRuleId == alertrule.GetAlertingRuleId(&rule) +} + diff --git a/pkg/management/alert_rule_preconditions.go b/pkg/management/alert_rule_preconditions.go new file mode 100644 index 000000000..8edfb4318 --- /dev/null +++ b/pkg/management/alert_rule_preconditions.go @@ -0,0 +1,98 @@ +package management + +import ( + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + osmv1 "github.com/openshift/api/monitoring/v1" + + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +// Standardized NotAllowed errors +func notAllowedGitOpsEdit() error { + return &NotAllowedError{Message: "This alert is managed by GitOps; edit it in Git."} +} +func notAllowedGitOpsRemove() error { + return &NotAllowedError{Message: "This alert is managed by GitOps; remove it in Git."} +} +func notAllowedOperatorUpdate() error { + return &NotAllowedError{Message: "This alert is managed by an operator; it can't be updated and can only be silenced."} +} +func notAllowedOperatorDelete() error { + return &NotAllowedError{Message: "This alert is managed by an operator; it can't be deleted and can only be silenced."} +} + +// isRuleManagedByGitOpsLabel returns true if the relabeled rule indicates GitOps management via its managed-by label. +func isRuleManagedByGitOpsLabel(relabeled monitoringv1.Rule) bool { + if relabeled.Labels == nil { + return false + } + return relabeled.Labels[managementlabels.RuleManagedByLabel] == managementlabels.ManagedByGitOps +} + +// isRuleManagedByOperator returns true if the relabeled rule indicates operator management via its managed-by label. 
+func isRuleManagedByOperator(relabeled monitoringv1.Rule) bool { + return relabeled.Labels != nil && relabeled.Labels[managementlabels.RuleManagedByLabel] == managementlabels.ManagedByOperator +} + +// validateUserDeletePreconditions enforces common label-based constraints for user-source delete. +func validateUserDeletePreconditions(relabeled monitoringv1.Rule) error { + if isRuleManagedByGitOpsLabel(relabeled) { + return notAllowedGitOpsRemove() + } + if isRuleManagedByOperator(relabeled) { + return notAllowedOperatorDelete() + } + return nil +} + +// validateUserUpdatePreconditions enforces common constraints for user-source update. +func validateUserUpdatePreconditions(relabeled monitoringv1.Rule, pr *monitoringv1.PrometheusRule) error { + if isRuleManagedByGitOpsLabel(relabeled) { + return notAllowedGitOpsEdit() + } + if isRuleManagedByOperator(relabeled) { + return notAllowedOperatorUpdate() + } + // Authoritative operator-managed check on PR owner references if provided + if pr != nil { + if _, operatorManaged := k8s.IsExternallyManagedObject(pr); operatorManaged { + return notAllowedOperatorUpdate() + } + } + return nil +} + +// validatePlatformDeletePreconditions enforces constraints before mutating the owning AlertingRule. +func validatePlatformDeletePreconditions(ar *osmv1.AlertingRule) error { + // Block if owning AR is externally managed (GitOps or operator) + if ar != nil { + if gitOpsManaged, operatorManaged := k8s.IsExternallyManagedObject(ar); gitOpsManaged { + return notAllowedGitOpsRemove() + } else if operatorManaged { + return notAllowedOperatorDelete() + } + } + return nil +} + +// validatePlatformUpdatePreconditions enforces constraints before ARC-based update. +// pr may be nil if not fetched yet; arc may be nil if absent. 
+func validatePlatformUpdatePreconditions(relabeled monitoringv1.Rule, pr *monitoringv1.PrometheusRule, arc *osmv1.AlertRelabelConfig) error { + // Rule-level GitOps block + if isRuleManagedByGitOpsLabel(relabeled) { + return notAllowedGitOpsEdit() + } + // PR metadata GitOps block + if pr != nil { + if gitOpsManaged, _ := k8s.IsExternallyManagedObject(pr); gitOpsManaged { + return notAllowedGitOpsEdit() + } + } + // ARC metadata GitOps block + if arc != nil && k8s.IsManagedByGitOps(arc.Annotations, arc.Labels) { + return notAllowedGitOpsEdit() + } + return nil +} diff --git a/pkg/management/classification_override_key.go b/pkg/management/classification_override_key.go new file mode 100644 index 000000000..edce5b8ea --- /dev/null +++ b/pkg/management/classification_override_key.go @@ -0,0 +1,19 @@ +package management + +import "encoding/base64" + +func classificationOverrideKey(ruleId string) string { + return base64.RawURLEncoding.EncodeToString([]byte(ruleId)) +} + +func OverrideConfigMapName(ruleNamespace string) string { + return "alert-classification-overrides-" + ruleNamespace +} + +func decodeClassificationOverrideKey(key string) (string, bool) { + decoded, err := base64.RawURLEncoding.DecodeString(key) + if err != nil { + return "", false + } + return string(decoded), true +} diff --git a/pkg/management/classification_override_types.go b/pkg/management/classification_override_types.go new file mode 100644 index 000000000..546cd5696 --- /dev/null +++ b/pkg/management/classification_override_types.go @@ -0,0 +1,18 @@ +package management + +// alertRuleClassificationOverridePayload is the ConfigMap entry payload stored under each rule ID key. +// It may include optional metadata fields for readability, but only Classification is used by the backend. 
+type alertRuleClassificationOverridePayload struct { + AlertName string `json:"alertName,omitempty"` + RuleName string `json:"prometheusRuleName,omitempty"` + RuleNamespace string `json:"prometheusRuleNamespace,omitempty"` + + Classification alertRuleClassification `json:"classification"` +} + +type alertRuleClassification struct { + Component string `json:"openshift_io_alert_rule_component,omitempty"` + Layer string `json:"openshift_io_alert_rule_layer,omitempty"` + ComponentFrom string `json:"openshift_io_alert_rule_component_from,omitempty"` + LayerFrom string `json:"openshift_io_alert_rule_layer_from,omitempty"` +} diff --git a/pkg/management/client_factory.go b/pkg/management/client_factory.go new file mode 100644 index 000000000..09ce8b1e4 --- /dev/null +++ b/pkg/management/client_factory.go @@ -0,0 +1,16 @@ +package management + +import ( + "context" + + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +// New creates a new management client. +func New(ctx context.Context, k8sClient k8s.Client) Client { + return &client{ + k8sClient: k8sClient, + overrideNamespace: detectOverrideNamespace(), + } +} + diff --git a/pkg/management/create_platform_alert_rule.go b/pkg/management/create_platform_alert_rule.go new file mode 100644 index 000000000..a580528f9 --- /dev/null +++ b/pkg/management/create_platform_alert_rule.go @@ -0,0 +1,134 @@ +package management + +import ( + "context" + "fmt" + "strings" + + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +const ( + defaultAlertingRuleName = "platform-alert-rules" + defaultPlatformGroupName = "platform-alert-rules" +) + +func (c *client) CreatePlatformAlertRule(ctx context.Context, alertRule monitoringv1.Rule) (string, error) { + err := 
validatePlatformCreateInputs(alertRule) + if err != nil { + return "", err + } + + newRuleId := alertrule.GetAlertingRuleId(&alertRule) + + if _, found := c.k8sClient.RelabeledRules().Get(ctx, newRuleId); found { + return "", &ConflictError{Message: "alert rule with exact config already exists"} + } + + if alertRule.Labels == nil { + alertRule.Labels = map[string]string{} + } + alertRule.Labels[k8s.AlertRuleLabelId] = newRuleId + + osmRule := toOSMRule(alertRule) + + existing, found, err := c.k8sClient.AlertingRules().Get(ctx, defaultAlertingRuleName) + if err != nil { + return "", fmt.Errorf("failed to get AlertingRule %s: %w", defaultAlertingRuleName, err) + } + + if found { + // Disallow adding to externally managed AlertingRules + if gitOpsManaged, operatorManaged := k8s.IsExternallyManagedObject(existing); gitOpsManaged { + return "", &NotAllowedError{Message: "The AlertingRule is managed by GitOps; create the alert in Git."} + } else if operatorManaged { + return "", &NotAllowedError{Message: "This AlertingRule is managed by an operator; you cannot add alerts to it."} + } + updated := existing.DeepCopy() + if err := addRuleToGroup(&updated.Spec, defaultPlatformGroupName, osmRule); err != nil { + return "", err + } + if err := c.k8sClient.AlertingRules().Update(ctx, *updated); err != nil { + return "", fmt.Errorf("failed to update AlertingRule %s: %w", defaultAlertingRuleName, err) + } + return newRuleId, nil + } + + ar := osmv1.AlertingRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: defaultAlertingRuleName, + Namespace: k8s.ClusterMonitoringNamespace, + }, + Spec: osmv1.AlertingRuleSpec{ + Groups: []osmv1.RuleGroup{ + { + Name: defaultPlatformGroupName, + Rules: []osmv1.Rule{osmRule}, + }, + }, + }, + } + + if _, err := c.k8sClient.AlertingRules().Create(ctx, ar); err != nil { + return "", fmt.Errorf("failed to create AlertingRule %s: %w", defaultAlertingRuleName, err) + } + + return newRuleId, nil +} + +func validatePlatformCreateInputs(alertRule 
monitoringv1.Rule) error { + alertName := strings.TrimSpace(alertRule.Alert) + if alertName == "" { + return &ValidationError{Message: "alert name is required"} + } + + if strings.TrimSpace(alertRule.Expr.String()) == "" { + return &ValidationError{Message: "expr is required"} + } + + if v, ok := alertRule.Labels["severity"]; ok && !isValidSeverity(v) { + return &ValidationError{Message: fmt.Sprintf("invalid severity %q: must be one of critical|warning|info|none", v)} + } + + return nil +} + +func addRuleToGroup(spec *osmv1.AlertingRuleSpec, groupName string, rule osmv1.Rule) error { + for i := range spec.Groups { + if spec.Groups[i].Name != groupName { + continue + } + for _, existing := range spec.Groups[i].Rules { + if existing.Alert == rule.Alert { + return &ConflictError{Message: fmt.Sprintf("alert rule %q already exists in group %q", rule.Alert, groupName)} + } + } + spec.Groups[i].Rules = append(spec.Groups[i].Rules, rule) + return nil + } + spec.Groups = append(spec.Groups, osmv1.RuleGroup{ + Name: groupName, + Rules: []osmv1.Rule{rule}, + }) + return nil +} + +func toOSMRule(rule monitoringv1.Rule) osmv1.Rule { + osmRule := osmv1.Rule{ + Alert: rule.Alert, + Expr: rule.Expr, + Labels: rule.Labels, + Annotations: rule.Annotations, + } + + if rule.For != nil { + osmRule.For = osmv1.Duration(*rule.For) + } + + return osmRule +} diff --git a/pkg/management/create_platform_alert_rule_test.go b/pkg/management/create_platform_alert_rule_test.go new file mode 100644 index 000000000..07c0c816b --- /dev/null +++ b/pkg/management/create_platform_alert_rule_test.go @@ -0,0 +1,270 @@ +package management_test + +import ( + "context" + "errors" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("CreatePlatformAlertRule", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + + baseRule monitoringv1.Rule + ) + + BeforeEach(func() { + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + + baseRule = monitoringv1.Rule{ + Alert: "PlatformAlert", + Expr: intstr.FromString("up == 0"), + For: (*monitoringv1.Duration)(stringPtr("5m")), + Labels: map[string]string{ + "severity": "warning", + }, + Annotations: map[string]string{ + "summary": "platform alert", + }, + } + }) + + Context("validation", func() { + It("returns error when alert name is empty", func() { + rule := baseRule + rule.Alert = " " + + _, err := client.CreatePlatformAlertRule(ctx, rule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("alert name is required")) + }) + + It("returns error when expr is empty", func() { + rule := baseRule + rule.Expr = intstr.FromString(" ") + + _, err := client.CreatePlatformAlertRule(ctx, rule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("expr is required")) + }) + + It("returns error when severity is invalid", func() { + rule := baseRule + rule.Labels = map[string]string{"severity": "fatal"} + + _, err := client.CreatePlatformAlertRule(ctx, rule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid severity")) + }) + }) + + Context("duplicate detection", func() { + 
It("returns conflict when same rule id already exists in relabeled rules", func() { + ruleID := alertrule.GetAlertingRuleId(&baseRule) + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == ruleID { + return baseRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + + _, err := client.CreatePlatformAlertRule(ctx, baseRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("exact config already exists")) + }) + }) + + Context("when target AlertingRule exists", func() { + It("returns NotAllowed when AlertingRule is GitOps-managed", func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + return &osmv1.AlertingRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: k8s.ClusterMonitoringNamespace, + Annotations: map[string]string{"argocd.argoproj.io/tracking-id": "abc"}, + }, + }, true, nil + }, + } + } + + _, err := client.CreatePlatformAlertRule(ctx, baseRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("The AlertingRule is managed by GitOps")) + }) + + It("adds rule to default group and updates AlertingRule", func() { + var updated osmv1.AlertingRule + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + 
return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + return &osmv1.AlertingRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: k8s.ClusterMonitoringNamespace, + }, + Spec: osmv1.AlertingRuleSpec{ + Groups: []osmv1.RuleGroup{ + { + Name: "platform-alert-rules", + Rules: []osmv1.Rule{ + { + Alert: "ExistingAlert", + Expr: intstr.FromString("vector(1)"), + }, + }, + }, + }, + }, + }, true, nil + }, + UpdateFunc: func(ctx context.Context, ar osmv1.AlertingRule) error { + updated = ar + return nil + }, + } + } + + ruleID, err := client.CreatePlatformAlertRule(ctx, baseRule) + Expect(err).NotTo(HaveOccurred()) + Expect(ruleID).To(Equal(alertrule.GetAlertingRuleId(&baseRule))) + Expect(updated.Name).To(Equal("platform-alert-rules")) + Expect(updated.Spec.Groups).To(HaveLen(1)) + Expect(updated.Spec.Groups[0].Name).To(Equal("platform-alert-rules")) + Expect(updated.Spec.Groups[0].Rules).To(HaveLen(2)) + Expect(updated.Spec.Groups[0].Rules[1].Labels).To(HaveKey(k8s.AlertRuleLabelId)) + }) + + It("returns conflict when same alert name exists in target group", func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + return &osmv1.AlertingRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: k8s.ClusterMonitoringNamespace, + }, + Spec: osmv1.AlertingRuleSpec{ + Groups: []osmv1.RuleGroup{ + { + Name: "platform-alert-rules", + Rules: []osmv1.Rule{ + { + Alert: "PlatformAlert", + Expr: intstr.FromString("vector(1)"), + }, + }, + }, + }, + }, + }, true, nil + }, + } + } + + _, err := 
client.CreatePlatformAlertRule(ctx, baseRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("already exists in group")) + }) + }) + + Context("when target AlertingRule does not exist", func() { + It("creates AlertingRule in cluster monitoring namespace", func() { + var created osmv1.AlertingRule + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + return nil, false, nil + }, + CreateFunc: func(ctx context.Context, ar osmv1.AlertingRule) (*osmv1.AlertingRule, error) { + created = ar + return &ar, nil + }, + } + } + + _, err := client.CreatePlatformAlertRule(ctx, baseRule) + Expect(err).NotTo(HaveOccurred()) + Expect(created.Name).To(Equal("platform-alert-rules")) + Expect(created.Namespace).To(Equal(k8s.ClusterMonitoringNamespace)) + Expect(created.Spec.Groups).To(HaveLen(1)) + Expect(created.Spec.Groups[0].Name).To(Equal("platform-alert-rules")) + Expect(created.Spec.Groups[0].Rules).To(HaveLen(1)) + Expect(created.Spec.Groups[0].Rules[0].Labels).To(HaveKey(k8s.AlertRuleLabelId)) + }) + + It("returns wrapped error when AlertingRules Get fails", func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + return nil, false, errors.New("get failed") + }, + } + } + + _, 
err := client.CreatePlatformAlertRule(ctx, baseRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to get AlertingRule")) + Expect(err.Error()).To(ContainSubstring("get failed")) + }) + }) +}) diff --git a/pkg/management/create_user_defined_alert_rule.go b/pkg/management/create_user_defined_alert_rule.go new file mode 100644 index 000000000..ad2533a3b --- /dev/null +++ b/pkg/management/create_user_defined_alert_rule.go @@ -0,0 +1,136 @@ +package management + +import ( + "context" + "strings" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/apimachinery/pkg/types" +) + +const ( + DefaultGroupName = "user-defined-rules" +) + +func (c *client) CreateUserDefinedAlertRule(ctx context.Context, alertRule monitoringv1.Rule, prOptions PrometheusRuleOptions) (string, error) { + if prOptions.Name == "" || prOptions.Namespace == "" { + return "", &ValidationError{Message: "PrometheusRule Name and Namespace must be specified"} + } + + // compute id from the rule content BEFORE mutating labels + computedRuleID := alertrule.GetAlertingRuleId(&alertRule) + // set/stamp the rule id label on user-defined rules + if alertRule.Labels == nil { + alertRule.Labels = map[string]string{} + } + alertRule.Labels[k8s.AlertRuleLabelId] = computedRuleID + + // Check if rule with the same ID already exists (fast path) + _, found := c.k8sClient.RelabeledRules().Get(ctx, computedRuleID) + if found { + return "", &ConflictError{Message: "alert rule with exact config already exists"} + } + + // Deny creating an equivalent rule (same spec: expr, for, labels including severity) even if alert name differs + if c.existsUserDefinedRuleWithSameSpec(ctx, alertRule) { + return "", &ConflictError{Message: "alert rule with equivalent spec 
already exists"} + } + + nn := types.NamespacedName{ + Name: prOptions.Name, + Namespace: prOptions.Namespace, + } + + if c.IsPlatformAlertRule(nn) { + return "", &NotAllowedError{Message: "cannot add user-defined alert rule to a platform-managed PrometheusRule"} + } + + // Enforce uniqueness within the target PrometheusRule: + // - "True clones" (different entries with identical definitions) are unsupported; they compute to the same rule ID. + pr, prFound, err := c.k8sClient.PrometheusRules().Get(ctx, nn.Namespace, nn.Name) + if err != nil { + return "", err + } + if prFound && pr != nil { + // Disallow adding to GitOps- or operator-managed PrometheusRule + if gitOpsManaged, operatorManaged := k8s.IsExternallyManagedObject(pr); gitOpsManaged { + return "", &NotAllowedError{Message: "This PrometheusRule is managed by GitOps; create the alert in Git."} + } else if operatorManaged { + return "", &NotAllowedError{Message: "This PrometheusRule is managed by an operator; you cannot add alerts to it."} + } + for _, g := range pr.Spec.Groups { + for _, r := range g.Rules { + // Treat "true clones" as unsupported: identical definitions compute to the same id. + if r.Alert != "" && alertrule.GetAlertingRuleId(&r) == computedRuleID { + return "", &ConflictError{Message: "alert rule with exact config already exists"} + } + } + } + } + + if prOptions.GroupName == "" { + prOptions.GroupName = DefaultGroupName + } + + err = c.k8sClient.PrometheusRules().AddRule(ctx, nn, prOptions.GroupName, alertRule) + if err != nil { + return "", err + } + + return computedRuleID, nil +} + +// existsUserDefinedRuleWithSameSpec returns true if a rule with an equivalent +// specification already exists in the relabeled rules cache. 
+func (c *client) existsUserDefinedRuleWithSameSpec(ctx context.Context, candidate monitoringv1.Rule) bool { + for _, existing := range c.k8sClient.RelabeledRules().List(ctx) { + if rulesHaveEquivalentSpec(existing, candidate) { + return true + } + } + return false +} + +// rulesHaveEquivalentSpec compares two alert rules for equivalence based on +// expression, duration (for) and non-system labels (excluding openshift_io_* and alertname). +func rulesHaveEquivalentSpec(a, b monitoringv1.Rule) bool { + if a.Expr.String() != b.Expr.String() { + return false + } + var af, bf string + if a.For != nil { + af = string(*a.For) + } + if b.For != nil { + bf = string(*b.For) + } + if af != bf { + return false + } + al := filterBusinessLabels(a.Labels) + bl := filterBusinessLabels(b.Labels) + if len(al) != len(bl) { + return false + } + for k, v := range al { + if bl[k] != v { + return false + } + } + return true +} + +// filterBusinessLabels returns labels excluding system/provenance and identity labels. +func filterBusinessLabels(in map[string]string) map[string]string { + out := map[string]string{} + for k, v := range in { + if strings.HasPrefix(k, "openshift_io_") || k == managementlabels.AlertNameLabel { + continue + } + out[k] = v + } + return out +} diff --git a/pkg/management/create_user_defined_alert_rule_test.go b/pkg/management/create_user_defined_alert_rule_test.go new file mode 100644 index 000000000..b69e8544d --- /dev/null +++ b/pkg/management/create_user_defined_alert_rule_test.go @@ -0,0 +1,377 @@ +package management_test + +import ( + "context" + "errors" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("CreateUserDefinedAlertRule", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + ) + + var ( + testRule = monitoringv1.Rule{ + Alert: "TestAlert", + Expr: intstr.FromString("up == 0"), + For: (*monitoringv1.Duration)(stringPtr("5m")), + Labels: map[string]string{ + "severity": "warning", + }, + Annotations: map[string]string{ + "summary": "Test alert", + }, + } + ) + + BeforeEach(func() { + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + }) + + Context("when target PrometheusRule is GitOps-managed", func() { + BeforeEach(func() { + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { return false }, + } + } + // No duplicate + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + // Existing PrometheusRule with GitOps annotation + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + 
Annotations: map[string]string{"argocd.argoproj.io/tracking-id": "abc"}, + }, + }, true, nil + }, + } + } + }) + + It("returns NotAllowed with GitOps message", func() { + prOptions := management.PrometheusRuleOptions{Name: "user-pr", Namespace: "user-ns"} + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("This PrometheusRule is managed by GitOps; create the alert in Git.")) + }) + }) + + Context("when target PrometheusRule is operator-managed", func() { + BeforeEach(func() { + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { return false }, + } + } + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + // Existing PrometheusRule with OwnerReferences + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + OwnerReferences: []metav1.OwnerReference{ + {Kind: "Deployment", Name: "some-operator"}, + }, + }, + }, true, nil + }, + } + } + }) + + It("returns NotAllowed for operator-managed PrometheusRule", func() { + prOptions := management.PrometheusRuleOptions{Name: "user-pr", Namespace: "user-ns"} + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("This PrometheusRule is managed by an operator; you cannot add alerts to it.")) + }) + }) + Context("when PrometheusRule Name is not specified", func() { + 
It("returns an error", func() { + prOptions := management.PrometheusRuleOptions{ + Namespace: "test-namespace", + } + + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("PrometheusRule Name and Namespace must be specified")) + }) + }) + + Context("when PrometheusRule Namespace is not specified", func() { + It("returns an error", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "test-rule", + } + + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("PrometheusRule Name and Namespace must be specified")) + }) + }) + + Context("when trying to add rule to platform-managed PrometheusRule", func() { + BeforeEach(func() { + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "openshift-monitoring" + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + }) + + It("returns an error", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "platform-rule", + Namespace: "openshift-monitoring", + } + + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("cannot add user-defined alert rule to a platform-managed PrometheusRule")) + }) + }) + + Context("when rule with same ID already exists", func() { + BeforeEach(func() { + ruleId := alertrule.GetAlertingRuleId(&testRule) + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + 
}, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == ruleId { + return testRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + }) + + It("returns an error", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "user-rule", + Namespace: "user-namespace", + } + + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("alert rule with exact config already exists")) + }) + }) + + Context("when AddRule fails", func() { + BeforeEach(func() { + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + AddRuleFunc: func(ctx context.Context, namespacedName types.NamespacedName, groupName string, rule monitoringv1.Rule) error { + return errors.New("failed to add rule") + }, + } + } + }) + + It("returns the error", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "user-rule", + Namespace: "user-namespace", + } + + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to add rule")) + }) + }) + + Context("when successfully creating a rule", func() { + BeforeEach(func() { + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + 
IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + AddRuleFunc: func(ctx context.Context, namespacedName types.NamespacedName, groupName string, rule monitoringv1.Rule) error { + return nil + }, + } + } + }) + + It("returns the rule ID", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "user-rule", + Namespace: "user-namespace", + } + + ruleId, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(ruleId).NotTo(BeEmpty()) + Expect(ruleId).To(Equal(alertrule.GetAlertingRuleId(&testRule))) + }) + + It("uses default group name when not specified", func() { + var capturedGroupName string + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + AddRuleFunc: func(ctx context.Context, namespacedName types.NamespacedName, groupName string, rule monitoringv1.Rule) error { + capturedGroupName = groupName + return nil + }, + } + } + + prOptions := management.PrometheusRuleOptions{ + Name: "user-rule", + Namespace: "user-namespace", + } + + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(capturedGroupName).To(Equal("user-defined-rules")) + }) + + It("uses custom group name when specified", func() { + var capturedGroupName string + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + AddRuleFunc: func(ctx context.Context, namespacedName types.NamespacedName, groupName string, rule monitoringv1.Rule) error { + 
capturedGroupName = groupName + return nil + }, + } + } + + prOptions := management.PrometheusRuleOptions{ + Name: "user-rule", + Namespace: "user-namespace", + GroupName: "custom-group", + } + + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(capturedGroupName).To(Equal("custom-group")) + }) + }) + + Context("duplicate detection ignoring alert name", func() { + BeforeEach(func() { + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { return false }, + } + } + // existing rule with different alert name but same spec (expr/for/labels) + existing := monitoringv1.Rule{} + (&testRule).DeepCopyInto(&existing) + existing.Alert = "OtherName" + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{existing} + }, + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + }) + + It("denies adding equivalent rule with different alert name", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "user-rule", + Namespace: "user-namespace", + } + _, err := client.CreateUserDefinedAlertRule(ctx, testRule, prOptions) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("equivalent spec already exists")) + }) + }) +}) + +func stringPtr(s string) *string { + return &s +} diff --git a/pkg/management/delete_user_defined_alert_rule_by_id.go b/pkg/management/delete_user_defined_alert_rule_by_id.go new file mode 100644 index 000000000..103f687ff --- /dev/null +++ b/pkg/management/delete_user_defined_alert_rule_by_id.go @@ -0,0 +1,155 @@ +package management + +import ( + "context" + "fmt" + + osmv1 "github.com/openshift/api/monitoring/v1" + 
"github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/apimachinery/pkg/types" +) + +func (c *client) DeleteUserDefinedAlertRuleById(ctx context.Context, alertRuleId string) error { + rule, found := c.k8sClient.RelabeledRules().Get(ctx, alertRuleId) + if !found { + return &NotFoundError{Resource: "AlertRule", Id: alertRuleId} + } + + namespace := rule.Labels[k8s.PrometheusRuleLabelNamespace] + name := rule.Labels[k8s.PrometheusRuleLabelName] + + // Disallow deleting any GitOps-managed rule + if err := validateUserDeletePreconditions(rule); err != nil { + return err + } + + if c.IsPlatformAlertRule(types.NamespacedName{Namespace: namespace, Name: name}) { + return c.deletePlatformAlertRuleById(ctx, rule, alertRuleId) + } + + // user-source branch: preconditions were validated above + + return c.deleteUserAlertRuleById(ctx, namespace, name, alertRuleId) +} + +func (c *client) filterRulesById(rules []monitoringv1.Rule, alertRuleId string, updated *bool) []monitoringv1.Rule { + var newRules []monitoringv1.Rule + + for _, rule := range rules { + if ruleMatchesAlertRuleID(rule, alertRuleId) { + *updated = true + continue + } + newRules = append(newRules, rule) + } + + return newRules +} + +// deletePlatformAlertRuleById deletes a platform rule from its owning AlertingRule CR. 
+func (c *client) deletePlatformAlertRuleById(ctx context.Context, relabeled monitoringv1.Rule, alertRuleId string) error { + namespace := relabeled.Labels[k8s.PrometheusRuleLabelNamespace] + name := relabeled.Labels[k8s.PrometheusRuleLabelName] + + // Delete from owning AlertingRule + arName := relabeled.Labels[managementlabels.AlertingRuleLabelName] + if arName == "" { + arName = defaultAlertingRuleName + } + ar, found, err := c.k8sClient.AlertingRules().Get(ctx, arName) + if err != nil { + return fmt.Errorf("failed to get AlertingRule %s: %w", arName, err) + } + if !found || ar == nil { + return &NotFoundError{Resource: "AlertingRule", Id: arName} + } + // Common preconditions for platform delete + if err := validatePlatformDeletePreconditions(ar); err != nil { + return err + } + + // Find original platform rule for reliable match by alert name + originalRule, err := c.getOriginalPlatformRule(ctx, namespace, name, alertRuleId) + if err != nil { + return err + } + + updated, newGroups := removeAlertFromAlertingRuleGroups(ar.Spec.Groups, originalRule.Alert) + if !updated { + return &NotFoundError{ + Resource: "AlertRule", + Id: alertRuleId, + AdditionalInfo: fmt.Sprintf("alert %q not found in AlertingRule %s", originalRule.Alert, arName), + } + } + ar.Spec.Groups = newGroups + if err := c.k8sClient.AlertingRules().Update(ctx, *ar); err != nil { + return fmt.Errorf("failed to update AlertingRule %s: %w", ar.Name, err) + } + return nil +} + +// deleteUserAlertRuleById deletes a user-sourced rule from its PrometheusRule. 
+func (c *client) deleteUserAlertRuleById(ctx context.Context, namespace, name, alertRuleId string) error { + pr, found, err := c.k8sClient.PrometheusRules().Get(ctx, namespace, name) + if err != nil { + return err + } + if !found { + return &NotFoundError{Resource: "PrometheusRule", Id: fmt.Sprintf("%s/%s", namespace, name)} + } + + updated := false + var newGroups []monitoringv1.RuleGroup + for _, group := range pr.Spec.Groups { + newRules := c.filterRulesById(group.Rules, alertRuleId, &updated) + if len(newRules) > 0 { + group.Rules = newRules + newGroups = append(newGroups, group) + } else if len(newRules) != len(group.Rules) { + updated = true + } + } + if !updated { + return &NotFoundError{Resource: "AlertRule", Id: alertRuleId, AdditionalInfo: "rule not found in the given PrometheusRule"} + } + + if len(newGroups) == 0 { + if err := c.k8sClient.PrometheusRules().Delete(ctx, pr.Namespace, pr.Name); err != nil { + return fmt.Errorf("failed to delete PrometheusRule %s/%s: %w", pr.Namespace, pr.Name, err) + } + return nil + } + + pr.Spec.Groups = newGroups + if err := c.k8sClient.PrometheusRules().Update(ctx, *pr); err != nil { + return fmt.Errorf("failed to update PrometheusRule %s/%s: %w", pr.Namespace, pr.Name, err) + } + return nil +} + +// removeAlertFromAlertingRuleGroups removes all instances of an alert by alert name across groups. +// Returns whether any change occurred and the resulting groups (dropping empty groups). 
+func removeAlertFromAlertingRuleGroups(groups []osmv1.RuleGroup, alertName string) (bool, []osmv1.RuleGroup) { + updated := false + newGroups := make([]osmv1.RuleGroup, 0, len(groups)) + for _, g := range groups { + var kept []osmv1.Rule + for _, r := range g.Rules { + if r.Alert == alertName { + updated = true + continue + } + kept = append(kept, r) + } + if len(kept) > 0 { + g.Rules = kept + newGroups = append(newGroups, g) + } else if len(g.Rules) > 0 { + updated = true + } + } + return updated, newGroups +} diff --git a/pkg/management/delete_user_defined_alert_rule_by_id_test.go b/pkg/management/delete_user_defined_alert_rule_by_id_test.go new file mode 100644 index 000000000..9ac520bc1 --- /dev/null +++ b/pkg/management/delete_user_defined_alert_rule_by_id_test.go @@ -0,0 +1,611 @@ +package management_test + +import ( + "context" + "errors" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + osmv1 "github.com/openshift/api/monitoring/v1" + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("DeleteUserDefinedAlertRuleById", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + ) + + var ( + userRule1 = monitoringv1.Rule{ + Alert: "UserAlert1", + Labels: map[string]string{ + k8s.PrometheusRuleLabelNamespace: "user-namespace", + k8s.PrometheusRuleLabelName: "user-rule", + }, + } + userRule1Id = alertrule.GetAlertingRuleId(&userRule1) + + userRule2 = monitoringv1.Rule{ + Alert: "UserAlert2", + Labels: map[string]string{ + k8s.PrometheusRuleLabelNamespace: "user-namespace", + k8s.PrometheusRuleLabelName: "user-rule", + }, + } + + platformRule = 
monitoringv1.Rule{ + Alert: "PlatformAlert", + Labels: map[string]string{ + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "platform-rule", + }, + } + platformRuleId = alertrule.GetAlertingRuleId(&platformRule) + ) + + BeforeEach(func() { + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + }) + + Context("when rule is not found in RelabeledRules", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + }) + + It("returns NotFoundError", func() { + err := client.DeleteUserDefinedAlertRuleById(ctx, "nonexistent-id") + Expect(err).To(HaveOccurred()) + + var notFoundErr *management.NotFoundError + Expect(errors.As(err, ¬FoundErr)).To(BeTrue()) + Expect(notFoundErr.Resource).To(Equal("AlertRule")) + Expect(notFoundErr.Id).To(Equal("nonexistent-id")) + }) + }) + + Context("when deleting a platform rule not operator-managed (user-via-platform)", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == platformRuleId { + return platformRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "openshift-monitoring" + }, + } + } + // Original PrometheusRule containing the platform rule + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, 
error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{platformRule}, + }, + }, + }, + }, true, nil + }, + } + } + // Provide owning AlertingRule so deletion can succeed + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + if name == "platform-alert-rules" { + return &osmv1.AlertingRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "platform-alert-rules", + Namespace: k8s.ClusterMonitoringNamespace, + }, + Spec: osmv1.AlertingRuleSpec{ + Groups: []osmv1.RuleGroup{ + { + Name: "test-group", + Rules: []osmv1.Rule{ + {Alert: platformRule.Alert}, + }, + }, + }, + }, + }, true, nil + } + return nil, false, nil + }, + UpdateFunc: func(ctx context.Context, ar osmv1.AlertingRule) error { + return nil + }, + } + } + }) + + It("deletes rule from owning AlertingRule", func() { + err := client.DeleteUserDefinedAlertRuleById(ctx, platformRuleId) + Expect(err).NotTo(HaveOccurred()) + }) + }) + + Context("when deleting a platform rule but owning AlertingRule is GitOps-managed", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == platformRuleId { + return platformRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { return name == "openshift-monitoring" }, + } + } + // PR contains the rule + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return 
&testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{Namespace: namespace, Name: name}, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{{Name: "grp", Rules: []monitoringv1.Rule{platformRule}}}, + }, + }, true, nil + }, + } + } + // Owning AR exists and is GitOps-managed via metadata + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + return &osmv1.AlertingRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "platform-alert-rules", + Namespace: k8s.ClusterMonitoringNamespace, + Annotations: map[string]string{"argocd.argoproj.io/tracking-id": "gitops"}, + }, + Spec: osmv1.AlertingRuleSpec{ + Groups: []osmv1.RuleGroup{{Name: "grp", Rules: []osmv1.Rule{{Alert: platformRule.Alert}}}}, + }, + }, true, nil + }, + } + } + }) + It("blocks deletion with GitOps message", func() { + err := client.DeleteUserDefinedAlertRuleById(ctx, platformRuleId) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("managed by GitOps")) + }) + }) + + Context("when deleting a platform rule but owning AlertingRule is operator-managed", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == platformRuleId { + return platformRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { return name == "openshift-monitoring" }, + } + } + // PR contains the rule + mockK8s.PrometheusRulesFunc = func() 
k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{Namespace: namespace, Name: name}, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{{Name: "grp", Rules: []monitoringv1.Rule{platformRule}}}, + }, + }, true, nil + }, + } + } + // Owning AR exists and is operator-managed via ownerReferences + mockK8s.AlertingRulesFunc = func() k8s.AlertingRuleInterface { + return &testutils.MockAlertingRuleInterface{ + GetFunc: func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + controller := true + return &osmv1.AlertingRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "platform-alert-rules", + Namespace: k8s.ClusterMonitoringNamespace, + OwnerReferences: []metav1.OwnerReference{ + {Kind: "SomeOperatorKind", Name: "operator", Controller: &controller}, + }, + }, + Spec: osmv1.AlertingRuleSpec{ + Groups: []osmv1.RuleGroup{{Name: "grp", Rules: []osmv1.Rule{{Alert: platformRule.Alert}}}}, + }, + }, true, nil + }, + } + } + }) + It("blocks deletion with operator-managed message", func() { + err := client.DeleteUserDefinedAlertRuleById(ctx, platformRuleId) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("managed by an operator")) + }) + }) + + Context("when PrometheusRule is not found", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRule1Id { + return userRule1, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + }, + } + } + + 
mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return nil, false, nil + }, + } + } + }) + + It("returns NotFoundError", func() { + err := client.DeleteUserDefinedAlertRuleById(ctx, userRule1Id) + Expect(err).To(HaveOccurred()) + + var notFoundErr *management.NotFoundError + Expect(errors.As(err, ¬FoundErr)).To(BeTrue()) + Expect(notFoundErr.Resource).To(Equal("PrometheusRule")) + }) + }) + + Context("when PrometheusRule Get returns an error", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRule1Id { + return userRule1, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return nil, false, errors.New("failed to get PrometheusRule") + }, + } + } + }) + + It("returns the error", func() { + err := client.DeleteUserDefinedAlertRuleById(ctx, userRule1Id) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to get PrometheusRule")) + }) + }) + + Context("when rule is not found in PrometheusRule", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRule1Id { + 
return userRule1, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{userRule2}, + }, + }, + }, + }, true, nil + }, + } + } + }) + + It("returns NotFoundError", func() { + err := client.DeleteUserDefinedAlertRuleById(ctx, userRule1Id) + Expect(err).To(HaveOccurred()) + + var notFoundErr *management.NotFoundError + Expect(errors.As(err, ¬FoundErr)).To(BeTrue()) + Expect(notFoundErr.Resource).To(Equal("AlertRule")) + Expect(notFoundErr.Id).To(Equal(userRule1Id)) + }) + }) + + Context("when deleting the only rule", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRule1Id { + return userRule1, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + 
ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{userRule1}, + }, + }, + }, + }, true, nil + }, + DeleteFunc: func(ctx context.Context, namespace string, name string) error { + return nil + }, + } + } + }) + + It("deletes the entire PrometheusRule", func() { + var deleteCalled bool + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{userRule1}, + }, + }, + }, + }, true, nil + }, + DeleteFunc: func(ctx context.Context, namespace string, name string) error { + deleteCalled = true + return nil + }, + } + } + + err := client.DeleteUserDefinedAlertRuleById(ctx, userRule1Id) + Expect(err).NotTo(HaveOccurred()) + Expect(deleteCalled).To(BeTrue()) + }) + }) + + Context("when deleting one of multiple rules", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRule1Id { + return userRule1, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + }, + } + } + }) + + It("updates the PrometheusRule with remaining rules", func() { + var updateCalled bool + var updatedPR *monitoringv1.PrometheusRule + + mockK8s.PrometheusRulesFunc = func() 
k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{userRule1, userRule2}, + }, + }, + }, + }, true, nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + updateCalled = true + updatedPR = &pr + return nil + }, + } + } + + err := client.DeleteUserDefinedAlertRuleById(ctx, userRule1Id) + Expect(err).NotTo(HaveOccurred()) + Expect(updateCalled).To(BeTrue()) + Expect(updatedPR.Spec.Groups).To(HaveLen(1)) + Expect(updatedPR.Spec.Groups[0].Rules).To(HaveLen(1)) + Expect(updatedPR.Spec.Groups[0].Rules[0].Alert).To(Equal("UserAlert2")) + }) + }) + + Context("when deleting all rules from a group", func() { + It("removes the empty group", func() { + anotherRule := monitoringv1.Rule{ + Alert: "AnotherAlert", + Labels: map[string]string{ + k8s.PrometheusRuleLabelNamespace: "user-namespace", + k8s.PrometheusRuleLabelName: "user-rule", + }, + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRule1Id { + return userRule1, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return false + }, + } + } + + var updatedPR *monitoringv1.PrometheusRule + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) 
(*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "group-to-be-empty", + Rules: []monitoringv1.Rule{userRule1}, + }, + { + Name: "group-with-rules", + Rules: []monitoringv1.Rule{anotherRule}, + }, + }, + }, + }, true, nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + updatedPR = &pr + return nil + }, + } + } + + err := client.DeleteUserDefinedAlertRuleById(ctx, userRule1Id) + Expect(err).NotTo(HaveOccurred()) + Expect(updatedPR.Spec.Groups).To(HaveLen(1)) + Expect(updatedPR.Spec.Groups[0].Name).To(Equal("group-with-rules")) + }) + }) +}) diff --git a/pkg/management/errors.go b/pkg/management/errors.go new file mode 100644 index 000000000..d0bec9127 --- /dev/null +++ b/pkg/management/errors.go @@ -0,0 +1,44 @@ +package management + +import "fmt" + +type NotFoundError struct { + Resource string + Id string + + AdditionalInfo string +} + +func (r *NotFoundError) Error() string { + s := fmt.Sprintf("%s with id %s not found", r.Resource, r.Id) + + if r.AdditionalInfo != "" { + s += fmt.Sprintf(": %s", r.AdditionalInfo) + } + + return s +} + +type NotAllowedError struct { + Message string +} + +func (r *NotAllowedError) Error() string { + return r.Message +} + +type ValidationError struct { + Message string +} + +func (e *ValidationError) Error() string { + return e.Message +} + +type ConflictError struct { + Message string +} + +func (e *ConflictError) Error() string { + return e.Message +} diff --git a/pkg/management/get_alerting_health.go b/pkg/management/get_alerting_health.go new file mode 100644 index 000000000..001d13f15 --- /dev/null +++ b/pkg/management/get_alerting_health.go @@ -0,0 +1,21 @@ +package management + +import ( + "context" + "time" + + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +const 
alertingHealthTimeout = 10 * time.Second + +// GetAlertingHealth retrieves alerting health details. +func (c *client) GetAlertingHealth(ctx context.Context) (k8s.AlertingHealth, error) { + if _, hasDeadline := ctx.Deadline(); !hasDeadline { + timeoutCtx, cancel := context.WithTimeout(ctx, alertingHealthTimeout) + defer cancel() + ctx = timeoutCtx + } + + return c.k8sClient.AlertingHealth(ctx) +} diff --git a/pkg/management/get_alerts.go b/pkg/management/get_alerts.go new file mode 100644 index 000000000..323e47145 --- /dev/null +++ b/pkg/management/get_alerts.go @@ -0,0 +1,399 @@ +package management + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + "github.com/openshift/monitoring-plugin/pkg/alertcomponent" + "github.com/openshift/monitoring-plugin/pkg/classification" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/relabel" + "k8s.io/apimachinery/pkg/types" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +var cvoAlertNames = map[string]struct{}{ + "ClusterOperatorDown": {}, + "ClusterOperatorDegraded": {}, +} + +func (c *client) GetAlerts(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + alerts, err := c.k8sClient.PrometheusAlerts().GetAlerts(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to get prometheus alerts: %w", err) + } + + configs := c.k8sClient.RelabeledRules().Config() + rules := c.k8sClient.RelabeledRules().List(ctx) + classificationCache := map[string]map[string]alertRuleClassificationOverridePayload{} + + result := make([]k8s.PrometheusAlert, 0, len(alerts)) + for _, alert := range alerts { + // Only apply relabel configs for platform alerts. 
User workload alerts + // already come from their own stack and should not be relabeled here. + if alert.Labels[k8s.AlertSourceLabel] != k8s.AlertSourceUser { + relabels, keep := relabel.Process(labels.FromMap(alert.Labels), configs...) + if !keep { + continue + } + alert.Labels = relabels.Map() + } + + // Add calculated rule ID and source when not present (labels enrichment) + c.setRuleIDAndSourceIfMissing(ctx, &alert, rules) + + // correlate alert -> base alert rule via subset matching against relabeled rules + alertRuleId := alert.Labels[k8s.AlertRuleLabelId] + component := "" + layer := "" + + bestRule, corrId := correlateAlertToRule(alert.Labels, rules) + if corrId != "" { + alertRuleId = corrId + } + if bestRule == nil && alertRuleId != "" { + if rule, ok := c.k8sClient.RelabeledRules().Get(ctx, alertRuleId); ok { + bestRule = &rule + } + } + + if bestRule != nil { + if src := c.deriveAlertSource(bestRule.Labels); src != "" { + alert.Labels[k8s.AlertSourceLabel] = src + } + component, layer = classifyFromRule(bestRule) + } else { + component, layer = classifyFromAlertLabels(alert.Labels) + } + + // CVO alerts have special defaults, but user overrides should still take precedence. 
+ if cvoComponent, cvoLayer, ok := classifyCvoAlert(alert.Labels); ok { + component = cvoComponent + layer = cvoLayer + } + + if bestRule != nil && alertRuleId != "" { + ov, ok, err := c.getRuleClassificationOverride(ctx, bestRule, alertRuleId, classificationCache) + if err != nil { + return nil, err + } + if ok { + if ov.ComponentFrom != "" { + if v := strings.TrimSpace(alert.Labels[ov.ComponentFrom]); v != "" && classification.ValidateComponent(v) { + component = v + } + } else if ov.Component != "" { + component = ov.Component + } + + if ov.LayerFrom != "" { + if v := alert.Labels[ov.LayerFrom]; classification.ValidateLayer(v) { + layer = strings.ToLower(strings.TrimSpace(v)) + } + } else if ov.Layer != "" { + layer = ov.Layer + } + } + } + + // keep label and optional enriched fields consistent + if alert.Labels[k8s.AlertRuleLabelId] == "" && alertRuleId != "" { + alert.Labels[k8s.AlertRuleLabelId] = alertRuleId + } + alert.AlertRuleId = alertRuleId + + alert.AlertComponent = component + alert.AlertLayer = layer + + if bestRule != nil && bestRule.Labels != nil { + alert.PrometheusRuleNamespace = bestRule.Labels[k8s.PrometheusRuleLabelNamespace] + alert.PrometheusRuleName = bestRule.Labels[k8s.PrometheusRuleLabelName] + alert.AlertingRuleName = bestRule.Labels[managementlabels.AlertingRuleLabelName] + } + + result = append(result, alert) + } + + return result, nil +} + +type ruleClassificationOverride struct { + Component string + Layer string + ComponentFrom string + LayerFrom string +} + +func (c *client) setRuleIDAndSourceIfMissing(ctx context.Context, alert *k8s.PrometheusAlert, rules []monitoringv1.Rule) { + if alert.Labels[k8s.AlertRuleLabelId] == "" { + for _, existing := range rules { + if existing.Alert != alert.Labels[managementlabels.AlertNameLabel] { + continue + } + if !ruleMatchesAlert(existing.Labels, alert.Labels) { + continue + } + rid := alertrule.GetAlertingRuleId(&existing) + alert.Labels[k8s.AlertRuleLabelId] = rid + if 
alert.Labels[k8s.AlertSourceLabel] == "" { + if src := c.deriveAlertSource(existing.Labels); src != "" { + alert.Labels[k8s.AlertSourceLabel] = src + } + } + break + } + } + if alert.Labels[k8s.AlertSourceLabel] != "" { + return + } + if rid := alert.Labels[k8s.AlertRuleLabelId]; rid != "" { + if existing, ok := c.k8sClient.RelabeledRules().Get(ctx, rid); ok { + if src := c.deriveAlertSource(existing.Labels); src != "" { + alert.Labels[k8s.AlertSourceLabel] = src + } + } + } +} + +func ruleMatchesAlert(existingRuleLabels, alertLabels map[string]string) bool { + existingBusiness := filterBusinessLabels(existingRuleLabels) + for k, v := range existingBusiness { + lv, ok := alertLabels[k] + if !ok || lv != v { + return false + } + } + return true +} + +// correlateAlertToRule tries to find the base alert rule for the given alert labels +// by subset-matching against relabeled rules. +func correlateAlertToRule(alertLabels map[string]string, rules []monitoringv1.Rule) (*monitoringv1.Rule, string) { + // Determine best match: prefer rules with more labels (more specific) + var ( + bestId string + bestRule *monitoringv1.Rule + bestLabelCount int + ) + for i := range rules { + rule := &rules[i] + ruleLabels := sanitizeRuleLabels(rule.Labels) + if isSubset(ruleLabels, alertLabels) { + if len(ruleLabels) > bestLabelCount { + bestLabelCount = len(ruleLabels) + bestRule = rule + bestId = rule.Labels[k8s.AlertRuleLabelId] + } + } + } + if bestRule == nil { + return nil, "" + } + return bestRule, bestId +} + +// sanitizeRuleLabels removes meta labels that will not be present on alerts +func sanitizeRuleLabels(in map[string]string) map[string]string { + out := make(map[string]string, len(in)) + for k, v := range in { + if k == k8s.PrometheusRuleLabelNamespace || k == k8s.PrometheusRuleLabelName || k == k8s.AlertRuleLabelId { + continue + } + out[k] = v + } + return out +} + +// isSubset returns true if all key/value pairs in sub are present in sup +func isSubset(sub 
map[string]string, sup map[string]string) bool { + for k, v := range sub { + if sv, ok := sup[k]; !ok || sv != v { + return false + } + } + return true +} + +func (c *client) deriveAlertSource(ruleLabels map[string]string) string { + ns := ruleLabels[k8s.PrometheusRuleLabelNamespace] + name := ruleLabels[k8s.PrometheusRuleLabelName] + if ns == "" || name == "" { + return "" + } + if c.IsPlatformAlertRule(types.NamespacedName{Namespace: ns, Name: name}) { + return k8s.AlertSourcePlatform + } + return k8s.AlertSourceUser +} + +func (c *client) getRuleClassificationOverride(ctx context.Context, rule *monitoringv1.Rule, ruleId string, cache map[string]map[string]alertRuleClassificationOverridePayload) (ruleClassificationOverride, bool, error) { + if rule.Labels == nil { + return ruleClassificationOverride{}, false, nil + } + ns := rule.Labels[k8s.PrometheusRuleLabelNamespace] + if ns == "" { + return ruleClassificationOverride{}, false, nil + } + + entries, ok := cache[ns] + if !ok { + overrideNamespace := c.overrideNamespace + cmName := OverrideConfigMapName(ns) + cm, exists, err := c.k8sClient.ConfigMaps().Get(ctx, overrideNamespace, cmName) + if err != nil { + return ruleClassificationOverride{}, false, err + } + if !exists { + cache[ns] = nil + return ruleClassificationOverride{}, false, nil + } + if cm.Labels == nil || + cm.Labels[managementlabels.AlertClassificationOverridesTypeLabelKey] != managementlabels.AlertClassificationOverridesTypeLabelValue || + cm.Labels[k8s.PrometheusRuleLabelNamespace] != ns { + cache[ns] = nil + return ruleClassificationOverride{}, false, nil + } + entries = map[string]alertRuleClassificationOverridePayload{} + for key, raw := range cm.Data { + ruleId, ok := decodeClassificationOverrideKey(key) + if !ok { + continue + } + var entry alertRuleClassificationOverridePayload + if err := json.Unmarshal([]byte(raw), &entry); err != nil { + continue + } + entries[ruleId] = entry + } + cache[ns] = entries + } + + if entries == nil { + return 
ruleClassificationOverride{}, false, nil + } + entry, ok := entries[ruleId] + if !ok { + return ruleClassificationOverride{}, false, nil + } + + ov := ruleClassificationOverride{ + Component: strings.TrimSpace(entry.Classification.Component), + Layer: entry.Classification.Layer, + ComponentFrom: entry.Classification.ComponentFrom, + LayerFrom: entry.Classification.LayerFrom, + } + + if ov.Component != "" && !classification.ValidateComponent(ov.Component) { + ov.Component = "" + } + if ov.Layer != "" && classification.ValidateLayer(ov.Layer) { + ov.Layer = strings.ToLower(strings.TrimSpace(ov.Layer)) + } else { + ov.Layer = "" + } + + ov.ComponentFrom = strings.TrimSpace(ov.ComponentFrom) + if ov.ComponentFrom != "" && !classification.ValidatePromLabelName(ov.ComponentFrom) { + ov.ComponentFrom = "" + } + + ov.LayerFrom = strings.TrimSpace(ov.LayerFrom) + if ov.LayerFrom != "" && !classification.ValidatePromLabelName(ov.LayerFrom) { + ov.LayerFrom = "" + } + + if ov.Component == "" && ov.Layer == "" && ov.ComponentFrom == "" && ov.LayerFrom == "" { + return ruleClassificationOverride{}, false, nil + } + + return ov, true, nil +} + +func classifyFromRule(rule *monitoringv1.Rule) (string, string) { + lbls := model.LabelSet{} + for k, v := range rule.Labels { + lbls[model.LabelName(k)] = model.LabelValue(v) + } + if _, ok := lbls["namespace"]; !ok { + if ns := rule.Labels[k8s.PrometheusRuleLabelNamespace]; ns != "" { + lbls["namespace"] = model.LabelValue(ns) + } + } + if rule.Alert != "" { + lbls[model.LabelName(managementlabels.AlertNameLabel)] = model.LabelValue(rule.Alert) + } + + layer, component := alertcomponent.DetermineComponent(lbls) + if component == "" || component == "Others" { + component = "other" + layer = deriveLayerFromSource(rule.Labels) + } + + component, layer = applyRuleScopedDefaults(rule.Labels, component, layer) + return component, layer +} + +func classifyFromAlertLabels(alertLabels map[string]string) (string, string) { + lbls := 
model.LabelSet{} + for k, v := range alertLabels { + lbls[model.LabelName(k)] = model.LabelValue(v) + } + layer, component := alertcomponent.DetermineComponent(lbls) + if component == "" || component == "Others" { + component = "other" + layer = deriveLayerFromSource(alertLabels) + } + component, layer = applyRuleScopedDefaults(alertLabels, component, layer) + return component, layer +} + +func deriveLayerFromSource(labels map[string]string) string { + // - platform (openshift-monitoring prometheus) -> cluster + // - user -> namespace + if labels[k8s.AlertSourceLabel] == k8s.AlertSourcePlatform { + return "cluster" + } + if labels[k8s.PrometheusRuleLabelNamespace] == k8s.ClusterMonitoringNamespace { + return "cluster" + } + promSrc := labels["prometheus"] + if strings.HasPrefix(promSrc, "openshift-monitoring/") { + return "cluster" + } + return "namespace" +} + +func applyRuleScopedDefaults(ruleLabels map[string]string, component, layer string) (string, string) { + if ruleLabels == nil { + return component, layer + } + if v := strings.TrimSpace(ruleLabels[k8s.AlertRuleClassificationComponentKey]); v != "" { + if classification.ValidateComponent(v) { + component = v + } + } + if v := strings.TrimSpace(ruleLabels[k8s.AlertRuleClassificationLayerKey]); v != "" { + if classification.ValidateLayer(v) { + layer = strings.ToLower(strings.TrimSpace(v)) + } + } + return component, layer +} + +func classifyCvoAlert(alertLabels map[string]string) (string, string, bool) { + if _, ok := cvoAlertNames[alertLabels[managementlabels.AlertNameLabel]]; !ok { + return "", "", false + } + component := alertLabels["name"] + if component == "" { + component = "version" + } + return component, "cluster", true +} diff --git a/pkg/management/get_alerts_test.go b/pkg/management/get_alerts_test.go new file mode 100644 index 000000000..6179107ff --- /dev/null +++ b/pkg/management/get_alerts_test.go @@ -0,0 +1,532 @@ +package management_test + +import ( + "context" + "encoding/base64" + 
"errors" + "os" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/prometheus/model/relabel" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +var _ = Describe("GetAlerts", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + overrideNamespace = "plugin-test-ns" + ) + + BeforeEach(func() { + Expect(os.Setenv("MONITORING_PLUGIN_NAMESPACE", overrideNamespace)).To(Succeed()) + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + }) + + AfterEach(func() { + Expect(os.Unsetenv("MONITORING_PLUGIN_NAMESPACE")).To(Succeed()) + }) + + Context("when PrometheusAlerts returns an error", func() { + BeforeEach(func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return nil, errors.New("failed to get alerts") + }, + } + } + }) + + It("returns an error", func() { + req := k8s.GetAlertsRequest{} + _, err := client.GetAlerts(ctx, req) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to get prometheus alerts")) + }) + }) + + Context("when PrometheusAlerts returns alerts", func() { + var ( + alert1 = k8s.PrometheusAlert{ + Labels: map[string]string{ + managementlabels.AlertNameLabel: "Alert1", + "severity": "warning", + "namespace": "default", + }, + State: "firing", + } + alert2 = k8s.PrometheusAlert{ + 
Labels: map[string]string{ + managementlabels.AlertNameLabel: "Alert2", + "severity": "critical", + "namespace": "kube-system", + }, + State: "pending", + } + ) + + Context("without relabel configs", func() { + BeforeEach(func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1, alert2}, nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + }) + + It("returns all alerts without modification", func() { + req := k8s.GetAlertsRequest{} + alerts, err := client.GetAlerts(ctx, req) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(2)) + Expect(alerts[0].Labels[managementlabels.AlertNameLabel]).To(Equal("Alert1")) + Expect(alerts[1].Labels[managementlabels.AlertNameLabel]).To(Equal("Alert2")) + }) + }) + + Context("with classification overrides", func() { + var ( + overrideComponent = "unit-test-component" + overrideLayer = "namespace" + ) + + BeforeEach(func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1}, nil + }, + } + } + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + ns := &testutils.MockNamespaceInterface{} + ns.SetMonitoringNamespaces(map[string]bool{"openshift-monitoring": true}) + return ns + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: 
"openshift-monitoring", + k8s.PrometheusRuleLabelName: "test-rule", + }, + } + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{rule} + }, + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == alertrule.GetAlertingRuleId(&rule) { + return rule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + }) + + It("applies overrides from labeled ConfigMap", func() { + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "test-rule", + }, + } + ruleId := alertrule.GetAlertingRuleId(&rule) + key := base64.RawURLEncoding.EncodeToString([]byte(ruleId)) + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: management.OverrideConfigMapName("openshift-monitoring"), + Namespace: overrideNamespace, + Labels: map[string]string{ + managementlabels.AlertClassificationOverridesTypeLabelKey: managementlabels.AlertClassificationOverridesTypeLabelValue, + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + }, + }, + Data: map[string]string{ + key: `{"classification":{"openshift_io_alert_rule_component":"` + overrideComponent + `","openshift_io_alert_rule_layer":"` + overrideLayer + `"}}`, + }, + } + mockK8s.ConfigMapsFunc = func() k8s.ConfigMapInterface { + return &testutils.MockConfigMapInterface{ + ConfigMaps: map[string]*corev1.ConfigMap{ + overrideNamespace + "/" + management.OverrideConfigMapName("openshift-monitoring"): cm, + }, + } + } + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(1)) + Expect(alerts[0].AlertComponent).To(Equal(overrideComponent)) + Expect(alerts[0].AlertLayer).To(Equal(overrideLayer)) + }) + + 
It("derives component from alert label when openshift_io_alert_rule_component_from is set", func() { + alert1WithName := alert1 + alert1WithName.Labels = map[string]string{} + for k, v := range alert1.Labels { + alert1WithName.Labels[k] = v + } + alert1WithName.Labels["name"] = "kube-apiserver" + + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1WithName}, nil + }, + } + } + + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "test-rule", + }, + } + ruleId := alertrule.GetAlertingRuleId(&rule) + key := base64.RawURLEncoding.EncodeToString([]byte(ruleId)) + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: management.OverrideConfigMapName("openshift-monitoring"), + Namespace: overrideNamespace, + Labels: map[string]string{ + managementlabels.AlertClassificationOverridesTypeLabelKey: managementlabels.AlertClassificationOverridesTypeLabelValue, + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + }, + }, + Data: map[string]string{ + key: `{"classification":{"openshift_io_alert_rule_component_from":"name","openshift_io_alert_rule_layer":"namespace"}}`, + }, + } + mockK8s.ConfigMapsFunc = func() k8s.ConfigMapInterface { + return &testutils.MockConfigMapInterface{ + ConfigMaps: map[string]*corev1.ConfigMap{ + overrideNamespace + "/" + management.OverrideConfigMapName("openshift-monitoring"): cm, + }, + } + } + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(1)) + Expect(alerts[0].AlertComponent).To(Equal("kube-apiserver")) + Expect(alerts[0].AlertLayer).To(Equal("namespace")) + }) + + It("derives 
layer from alert label when openshift_io_alert_rule_layer_from is set", func() { + alert1WithLayer := alert1 + alert1WithLayer.Labels = map[string]string{} + for k, v := range alert1.Labels { + alert1WithLayer.Labels[k] = v + } + alert1WithLayer.Labels["layer"] = "cluster" + + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1WithLayer}, nil + }, + } + } + + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "test-rule", + }, + } + ruleId := alertrule.GetAlertingRuleId(&rule) + key := base64.RawURLEncoding.EncodeToString([]byte(ruleId)) + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: management.OverrideConfigMapName("openshift-monitoring"), + Namespace: overrideNamespace, + Labels: map[string]string{ + managementlabels.AlertClassificationOverridesTypeLabelKey: managementlabels.AlertClassificationOverridesTypeLabelValue, + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + }, + }, + Data: map[string]string{ + key: `{"classification":{"openshift_io_alert_rule_layer_from":"layer","openshift_io_alert_rule_component":"unit-test-component"}}`, + }, + } + mockK8s.ConfigMapsFunc = func() k8s.ConfigMapInterface { + return &testutils.MockConfigMapInterface{ + ConfigMaps: map[string]*corev1.ConfigMap{ + overrideNamespace + "/" + management.OverrideConfigMapName("openshift-monitoring"): cm, + }, + } + } + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(1)) + Expect(alerts[0].AlertComponent).To(Equal("unit-test-component")) + Expect(alerts[0].AlertLayer).To(Equal("cluster")) + }) + + It("ignores 
overrides when label is missing", func() { + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "test-rule", + }, + } + ruleId := alertrule.GetAlertingRuleId(&rule) + key := base64.RawURLEncoding.EncodeToString([]byte(ruleId)) + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: management.OverrideConfigMapName("openshift-monitoring"), + Namespace: overrideNamespace, + }, + Data: map[string]string{ + key: `{"classification":{"openshift_io_alert_rule_component":"` + overrideComponent + `","openshift_io_alert_rule_layer":"` + overrideLayer + `"}}`, + }, + } + mockK8s.ConfigMapsFunc = func() k8s.ConfigMapInterface { + return &testutils.MockConfigMapInterface{ + ConfigMaps: map[string]*corev1.ConfigMap{ + overrideNamespace + "/" + management.OverrideConfigMapName("openshift-monitoring"): cm, + }, + } + } + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(1)) + Expect(alerts[0].AlertComponent).To(Equal("other")) + Expect(alerts[0].AlertLayer).To(Equal("cluster")) + }) + }) + + Context("with rule-scoped classification labels", func() { + It("uses rule labels as defaults when no overrides exist", func() { + alert := k8s.PrometheusAlert{ + Labels: map[string]string{ + "alertname": "AlertRuleDefaults", + "severity": "warning", + "namespace": "default", + k8s.AlertRuleClassificationComponentKey: "team-a", + k8s.AlertRuleClassificationLayerKey: "namespace", + }, + State: "firing", + } + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert}, nil + }, + } + } + + rule := monitoringv1.Rule{ + Alert: "AlertRuleDefaults", + 
Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.AlertRuleClassificationComponentKey: "team-a", + k8s.AlertRuleClassificationLayerKey: "namespace", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "defaults-rule", + }, + } + rule.Labels[k8s.AlertRuleLabelId] = alertrule.GetAlertingRuleId(&rule) + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{rule} + }, + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == rule.Labels[k8s.AlertRuleLabelId] { + return rule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(1)) + Expect(alerts[0].AlertComponent).To(Equal("team-a")) + Expect(alerts[0].AlertLayer).To(Equal("namespace")) + }) + }) + + Context("without a matching rule", func() { + It("falls back to default mapping from alert labels", func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1}, nil + }, + } + } + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{} + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(1)) + Expect(alerts[0].AlertComponent).To(Equal("other")) + 
Expect(alerts[0].AlertLayer).To(Equal("namespace")) + }) + }) + + Context("with a matching rule but no overrides or rule labels", func() { + It("falls back to default mapping derived from rule context", func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1}, nil + }, + } + } + rule := monitoringv1.Rule{ + Alert: "Alert1", + Labels: map[string]string{ + "severity": "warning", + "namespace": "default", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "default-rule", + }, + } + rule.Labels[k8s.AlertRuleLabelId] = alertrule.GetAlertingRuleId(&rule) + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{rule} + }, + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == rule.Labels[k8s.AlertRuleLabelId] { + return rule, true + } + return monitoringv1.Rule{}, false + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + + alerts, err := client.GetAlerts(ctx, k8s.GetAlertsRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(1)) + Expect(alerts[0].AlertComponent).To(Equal("other")) + Expect(alerts[0].AlertLayer).To(Equal("cluster")) + }) + }) + + Context("with relabel configs that keep all alerts", func() { + BeforeEach(func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{alert1, alert2}, nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + 
return &testutils.MockRelabeledRulesInterface{ + ConfigFunc: func() []*relabel.Config { + // Return empty config list to avoid validation issues in tests + // Relabel functionality is tested elsewhere (in k8s package) + return []*relabel.Config{} + }, + } + } + }) + + It("returns all alerts without modification when no relabel configs", func() { + req := k8s.GetAlertsRequest{} + alerts, err := client.GetAlerts(ctx, req) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(2)) + Expect(alerts[0].Labels["severity"]).To(Equal("warning")) + Expect(alerts[1].Labels["severity"]).To(Equal("critical")) + }) + }) + + Context("when no alerts are returned from Prometheus", func() { + BeforeEach(func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetAlertsFunc: func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + return []k8s.PrometheusAlert{}, nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + }) + + It("returns an empty list", func() { + req := k8s.GetAlertsRequest{} + alerts, err := client.GetAlerts(ctx, req) + Expect(err).NotTo(HaveOccurred()) + Expect(alerts).To(HaveLen(0)) + }) + }) + }) +}) diff --git a/pkg/management/get_rule_by_id.go b/pkg/management/get_rule_by_id.go new file mode 100644 index 000000000..e786ee464 --- /dev/null +++ b/pkg/management/get_rule_by_id.go @@ -0,0 +1,16 @@ +package management + +import ( + "context" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +func (c *client) GetRuleById(ctx context.Context, alertRuleId string) (monitoringv1.Rule, error) { + rule, found := c.k8sClient.RelabeledRules().Get(ctx, alertRuleId) + if !found { + return monitoringv1.Rule{}, &NotFoundError{Resource: "AlertRule", Id: 
alertRuleId} + } + + return rule, nil +} diff --git a/pkg/management/get_rule_by_id_test.go b/pkg/management/get_rule_by_id_test.go new file mode 100644 index 000000000..d24218732 --- /dev/null +++ b/pkg/management/get_rule_by_id_test.go @@ -0,0 +1,462 @@ +package management_test + +import ( + "context" + "errors" + "maps" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +var _ = Describe("GetRuleById", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + ) + + var ( + testRule = monitoringv1.Rule{ + Alert: "TestAlert", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + k8s.PrometheusRuleLabelNamespace: "test-namespace", + k8s.PrometheusRuleLabelName: "test-rule", + }, + } + testRuleId = alertrule.GetAlertingRuleId(&testRule) + ) + + BeforeEach(func() { + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + }) + + Context("when rule is found", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == testRuleId { + return testRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + }) + + It("returns the rule", func() { + rule, err := client.GetRuleById(ctx, testRuleId) + 
Expect(err).NotTo(HaveOccurred()) + Expect(rule.Alert).To(Equal("TestAlert")) + Expect(rule.Labels["severity"]).To(Equal("warning")) + }) + }) + + Context("when rule is not found", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + }) + + It("returns NotFoundError", func() { + _, err := client.GetRuleById(ctx, "nonexistent-id") + Expect(err).To(HaveOccurred()) + + var notFoundErr *management.NotFoundError + Expect(errors.As(err, ¬FoundErr)).To(BeTrue()) + Expect(notFoundErr.Resource).To(Equal("AlertRule")) + Expect(notFoundErr.Id).To(Equal("nonexistent-id")) + }) + }) + + Context("when multiple rules exist", func() { + var ( + rule1 = monitoringv1.Rule{ + Alert: "Alert1", + Expr: intstr.FromString("up == 0"), + } + rule1Id = alertrule.GetAlertingRuleId(&rule1) + + rule2 = monitoringv1.Rule{ + Alert: "Alert2", + Expr: intstr.FromString("down == 1"), + } + rule2Id = alertrule.GetAlertingRuleId(&rule2) + ) + + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + switch id { + case rule1Id: + return rule1, true + case rule2Id: + return rule2, true + default: + return monitoringv1.Rule{}, false + } + }, + } + } + }) + + It("returns the correct rule based on ID", func() { + rule, err := client.GetRuleById(ctx, rule1Id) + Expect(err).NotTo(HaveOccurred()) + Expect(rule.Alert).To(Equal("Alert1")) + + rule, err = client.GetRuleById(ctx, rule2Id) + Expect(err).NotTo(HaveOccurred()) + Expect(rule.Alert).To(Equal("Alert2")) + }) + }) + + Context("with recording rules", func() { + var ( + recordingRule = monitoringv1.Rule{ + Record: "job:request_latency_seconds:mean5m", + Expr: 
intstr.FromString("avg by (job) (request_latency_seconds)"), + } + recordingRuleId = alertrule.GetAlertingRuleId(&recordingRule) + ) + + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == recordingRuleId { + return recordingRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + }) + + It("returns the recording rule", func() { + rule, err := client.GetRuleById(ctx, recordingRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(rule.Record).To(Equal("job:request_latency_seconds:mean5m")) + }) + }) + + Context("when rule has openshift_io_rule_managed_by label computed by DetermineManagedBy", func() { + var ( + mockARC *testutils.MockAlertRelabelConfigInterface + mockNamespaceMgr *testutils.MockNamespaceInterface + ) + + BeforeEach(func() { + mockARC = &testutils.MockAlertRelabelConfigInterface{} + mockNamespaceMgr = &testutils.MockNamespaceInterface{} + }) + + It("returns rule with openshift_io_rule_managed_by=operator when PrometheusRule has OwnerReferences", func() { + promRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "operator-rule", + Namespace: "test-namespace", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "test-operator", + UID: "test-uid", + }, + }, + }, + } + + mockNamespaceMgr.IsClusterMonitoringNamespaceFunc = func(name string) bool { + return false // User rule + } + ruleManagedBy, relabelConfigManagedBy := k8s.DetermineManagedByForTesting(ctx, mockARC, mockNamespaceMgr, promRule, testRuleId) + + // Create rule with label computed by DetermineManagedBy + ruleWithLabel := testRule + if ruleWithLabel.Labels == nil { + ruleWithLabel.Labels = make(map[string]string) + } else { + ruleWithLabel.Labels = maps.Clone(ruleWithLabel.Labels) // Deep copy labels + } + 
ruleWithLabel.Labels[managementlabels.AlertNameLabel] = ruleWithLabel.Alert + ruleWithLabel.Labels[k8s.AlertRuleLabelId] = testRuleId + ruleWithLabel.Labels[k8s.PrometheusRuleLabelNamespace] = promRule.Namespace + ruleWithLabel.Labels[k8s.PrometheusRuleLabelName] = promRule.Name + if ruleManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RuleManagedByLabel] = ruleManagedBy + } + if relabelConfigManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RelabelConfigManagedByLabel] = relabelConfigManagedBy + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == testRuleId { + return ruleWithLabel, true + } + return monitoringv1.Rule{}, false + }, + } + } + + rule, err := client.GetRuleById(ctx, testRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(rule.Labels).To(HaveKey(managementlabels.RuleManagedByLabel)) + Expect(rule.Labels[managementlabels.RuleManagedByLabel]).To(Equal("operator")) + }) + + It("returns rule without openshift_io_rule_managed_by label when PrometheusRule has no special conditions", func() { + promRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "local-rule", + Namespace: "test-namespace", + }, + } + + mockNamespaceMgr.IsClusterMonitoringNamespaceFunc = func(name string) bool { + return false // User rule + } + ruleManagedBy, relabelConfigManagedBy := k8s.DetermineManagedByForTesting(ctx, mockARC, mockNamespaceMgr, promRule, testRuleId) + + ruleWithLabel := testRule + if ruleWithLabel.Labels == nil { + ruleWithLabel.Labels = make(map[string]string) + } else { + ruleWithLabel.Labels = maps.Clone(ruleWithLabel.Labels) // Deep copy labels + } + ruleWithLabel.Labels[managementlabels.AlertNameLabel] = ruleWithLabel.Alert + ruleWithLabel.Labels[k8s.AlertRuleLabelId] = testRuleId + ruleWithLabel.Labels[k8s.PrometheusRuleLabelNamespace] = promRule.Namespace + 
ruleWithLabel.Labels[k8s.PrometheusRuleLabelName] = promRule.Name + if ruleManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RuleManagedByLabel] = ruleManagedBy + } + if relabelConfigManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RelabelConfigManagedByLabel] = relabelConfigManagedBy + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == testRuleId { + return ruleWithLabel, true + } + return monitoringv1.Rule{}, false + }, + } + } + + rule, err := client.GetRuleById(ctx, testRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(rule.Labels).NotTo(HaveKey(managementlabels.RuleManagedByLabel)) // Label should not be added + }) + + It("returns platform rule with openshift_io_relabel_config_managed_by=gitops when AlertRelabelConfig is GitOps managed", func() { + promRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "platform-rule", + Namespace: "openshift-monitoring", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "test-operator", + UID: "test-uid", + }, + }, + }, + } + + mockARC.GetFunc = func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Annotations: map[string]string{ + "argocd.argoproj.io/tracking-id": "test-id", + }, + }, + }, true, nil + } + + mockNamespaceMgr.IsClusterMonitoringNamespaceFunc = func(name string) bool { + return true // Platform rule + } + ruleManagedBy, relabelConfigManagedBy := k8s.DetermineManagedByForTesting(ctx, mockARC, mockNamespaceMgr, promRule, testRuleId) + + ruleWithLabel := testRule + if ruleWithLabel.Labels == nil { + ruleWithLabel.Labels = make(map[string]string) + } else { + ruleWithLabel.Labels = 
maps.Clone(ruleWithLabel.Labels) // Deep copy labels + } + ruleWithLabel.Labels[managementlabels.AlertNameLabel] = ruleWithLabel.Alert + ruleWithLabel.Labels[k8s.AlertRuleLabelId] = testRuleId + ruleWithLabel.Labels[k8s.PrometheusRuleLabelNamespace] = promRule.Namespace + ruleWithLabel.Labels[k8s.PrometheusRuleLabelName] = promRule.Name + if ruleManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RuleManagedByLabel] = ruleManagedBy + } + if relabelConfigManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RelabelConfigManagedByLabel] = relabelConfigManagedBy + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == testRuleId { + return ruleWithLabel, true + } + return monitoringv1.Rule{}, false + }, + } + } + + rule, err := client.GetRuleById(ctx, testRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(rule.Labels).To(HaveKey(managementlabels.RuleManagedByLabel)) + Expect(rule.Labels[managementlabels.RuleManagedByLabel]).To(Equal("operator")) // Platform rule with OwnerReferences + Expect(rule.Labels).To(HaveKey(managementlabels.RelabelConfigManagedByLabel)) + Expect(rule.Labels[managementlabels.RelabelConfigManagedByLabel]).To(Equal("gitops")) + }) + + It("returns platform rule with openshift_io_rule_managed_by=gitops when PrometheusRule is GitOps managed", func() { + promRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "platform-rule", + Namespace: "openshift-monitoring", + Annotations: map[string]string{ + "argocd.argoproj.io/tracking-id": "test-id", + }, + }, + } + + mockNamespaceMgr.IsClusterMonitoringNamespaceFunc = func(name string) bool { + return true // Platform rule + } + ruleManagedBy, relabelConfigManagedBy := k8s.DetermineManagedByForTesting(ctx, mockARC, mockNamespaceMgr, promRule, testRuleId) + + ruleWithLabel := testRule + if ruleWithLabel.Labels == nil { + 
ruleWithLabel.Labels = make(map[string]string) + } else { + ruleWithLabel.Labels = maps.Clone(ruleWithLabel.Labels) // Deep copy labels + } + ruleWithLabel.Labels[managementlabels.AlertNameLabel] = ruleWithLabel.Alert + ruleWithLabel.Labels[k8s.AlertRuleLabelId] = testRuleId + ruleWithLabel.Labels[k8s.PrometheusRuleLabelNamespace] = promRule.Namespace + ruleWithLabel.Labels[k8s.PrometheusRuleLabelName] = promRule.Name + if ruleManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RuleManagedByLabel] = ruleManagedBy + } + if relabelConfigManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RelabelConfigManagedByLabel] = relabelConfigManagedBy + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == testRuleId { + return ruleWithLabel, true + } + return monitoringv1.Rule{}, false + }, + } + } + + rule, err := client.GetRuleById(ctx, testRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(rule.Labels).To(HaveKey(managementlabels.RuleManagedByLabel)) + Expect(rule.Labels[managementlabels.RuleManagedByLabel]).To(Equal("gitops")) // Platform rule with GitOps annotations + }) + + It("returns platform rule without openshift_io_relabel_config_managed_by label when AlertRelabelConfig is not GitOps managed", func() { + promRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "platform-rule", + Namespace: "openshift-monitoring", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "test-operator", + UID: "test-uid", + }, + }, + }, + } + + mockARC.GetFunc = func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + // No GitOps annotations/labels + }, + }, true, nil + } + + 
mockNamespaceMgr.IsClusterMonitoringNamespaceFunc = func(name string) bool { + return true // Platform rule + } + ruleManagedBy, relabelConfigManagedBy := k8s.DetermineManagedByForTesting(ctx, mockARC, mockNamespaceMgr, promRule, testRuleId) + + ruleWithLabel := testRule + if ruleWithLabel.Labels == nil { + ruleWithLabel.Labels = make(map[string]string) + } else { + ruleWithLabel.Labels = maps.Clone(ruleWithLabel.Labels) // Deep copy labels + } + ruleWithLabel.Labels[managementlabels.AlertNameLabel] = ruleWithLabel.Alert + ruleWithLabel.Labels[k8s.AlertRuleLabelId] = testRuleId + ruleWithLabel.Labels[k8s.PrometheusRuleLabelNamespace] = promRule.Namespace + ruleWithLabel.Labels[k8s.PrometheusRuleLabelName] = promRule.Name + if ruleManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RuleManagedByLabel] = ruleManagedBy + } + if relabelConfigManagedBy != "" { + ruleWithLabel.Labels[managementlabels.RelabelConfigManagedByLabel] = relabelConfigManagedBy + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == testRuleId { + return ruleWithLabel, true + } + return monitoringv1.Rule{}, false + }, + } + } + + rule, err := client.GetRuleById(ctx, testRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(rule.Labels).To(HaveKey(managementlabels.RuleManagedByLabel)) + Expect(rule.Labels[managementlabels.RuleManagedByLabel]).To(Equal("operator")) // Platform rule with OwnerReferences + Expect(rule.Labels).NotTo(HaveKey(managementlabels.RelabelConfigManagedByLabel)) // Label should not be added + }) + }) +}) diff --git a/pkg/management/get_rules.go b/pkg/management/get_rules.go new file mode 100644 index 000000000..f30822d35 --- /dev/null +++ b/pkg/management/get_rules.go @@ -0,0 +1,391 @@ +package management + +import ( + "context" + "fmt" + "math" + "sort" + "strings" + "time" + "unicode" + + monitoringv1 
"github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/relabel" + "github.com/prometheus/prometheus/promql/parser" + + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +func (c *client) GetRules(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + groups, err := c.k8sClient.PrometheusAlerts().GetRules(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to get prometheus rules: %w", err) + } + + configs := c.k8sClient.RelabeledRules().Config() + relabeledByAlert := indexRelabeledRules(c.k8sClient.RelabeledRules().List(ctx)) + applyFilters := req.State != "" || len(req.Labels) > 0 + + // Deduplicate rules that carry the same openshift_io_alert_rule_id across + // groups. This occurs when the same PrometheusRule group name is defined in + // multiple CRDs — Prometheus returns separate groups with identical rules + // that hash to the same ID after enrichment. 
+ seenIDs := make(map[string]struct{}) + + filteredGroups := make([]k8s.PrometheusRuleGroup, 0, len(groups)) + for groupIdx := range groups { + group := groups[groupIdx] + filteredRules := make([]k8s.PrometheusRule, 0, len(group.Rules)) + + for ruleIdx := range group.Rules { + rule := group.Rules[ruleIdx] + if applyFilters && rule.Type != k8s.RuleTypeAlerting { + continue + } + applyRelabeledRuleLabels(&rule, relabeledByAlert) + + if ruleID := rule.Labels[k8s.AlertRuleLabelId]; ruleID != "" { + if _, seen := seenIDs[ruleID]; seen { + continue + } + seenIDs[ruleID] = struct{}{} + } + + if len(rule.Alerts) == 0 { + if applyFilters && rule.Type == k8s.RuleTypeAlerting { + continue + } + filteredRules = append(filteredRules, rule) + continue + } + + relabeledAlerts := make([]k8s.PrometheusRuleAlert, 0, len(rule.Alerts)) + for _, alert := range rule.Alerts { + if alert.State == "pending" || alert.State == "firing" { + if alert.Labels[k8s.AlertSourceLabel] != k8s.AlertSourceUser { + // Apply relabeling to the "real" alert labels only; preserve plugin meta labels. + src := alert.Labels[k8s.AlertSourceLabel] + in := make(map[string]string, len(alert.Labels)) + for k, v := range alert.Labels { + in[k] = v + } + delete(in, k8s.AlertSourceLabel) + + relabeledLabels, keep := relabel.Process(labels.FromMap(in), configs...) 
+ if !keep { + continue + } + alert.Labels = relabeledLabels.Map() + if src != "" { + alert.Labels[k8s.AlertSourceLabel] = src + } + } + } + + if req.State != "" && alert.State != req.State { + continue + } + if !ruleAlertLabelsMatch(&req, &alert) { + continue + } + relabeledAlerts = append(relabeledAlerts, alert) + } + rule.Alerts = relabeledAlerts + + if applyFilters && rule.Type == k8s.RuleTypeAlerting && len(rule.Alerts) == 0 { + continue + } + + filteredRules = append(filteredRules, rule) + } + + group.Rules = filteredRules + if applyFilters && len(group.Rules) == 0 { + continue + } + filteredGroups = append(filteredGroups, group) + } + + return filteredGroups, nil +} + +func indexRelabeledRules(rules []monitoringv1.Rule) map[string][]monitoringv1.Rule { + byAlert := make(map[string][]monitoringv1.Rule, len(rules)) + for _, rule := range rules { + alertName := rule.Alert + if alertName == "" && rule.Labels != nil { + alertName = rule.Labels[managementlabels.AlertNameLabel] + } + if alertName == "" { + continue + } + byAlert[alertName] = append(byAlert[alertName], rule) + } + return byAlert +} + +func relabeledAlertName(rule *monitoringv1.Rule) string { + if rule == nil { + return "" + } + if rule.Alert != "" { + return rule.Alert + } + if rule.Labels != nil { + return rule.Labels[managementlabels.AlertNameLabel] + } + return "" +} + +func applyRelabeledRuleLabels(rule *k8s.PrometheusRule, relabeledByAlert map[string][]monitoringv1.Rule) { + if rule == nil || rule.Name == "" || rule.Type == k8s.RuleTypeRecording { + return + } + + // Preserve plugin meta labels added during API fetch. + source := "" + if rule.Labels != nil { + source = rule.Labels[k8s.AlertSourceLabel] + } + + match := findRelabeledMatch(rule, relabeledByAlert[rule.Name]) + if match == nil || match.Labels == nil { + return + } + + // Replace rule labels with the relabeled cache version so that actions which + // remove/rename labels (e.g. LabelDrop/LabelKeep/LabelMap) are faithfully reflected. 
+ labelsOut := make(map[string]string, len(match.Labels)+1) + for k, v := range match.Labels { + labelsOut[k] = v + } + if source != "" { + labelsOut[k8s.AlertSourceLabel] = source + } + rule.Labels = labelsOut +} + +func findRelabeledMatch(rule *k8s.PrometheusRule, candidates []monitoringv1.Rule) *monitoringv1.Rule { + // Strict match first (preserves correctness when multiple rules share alertname). + for i := range candidates { + candidate := &candidates[i] + if promRuleMatchesRelabeled(rule, candidate) { + return candidate + } + } + + // If relabeling modified rule labels (e.g. severity), strict label matching may fail. + // Retry on a best-effort basis using (alertname, expr, for) only. If this is ambiguous, + // do not guess. + var relaxed *monitoringv1.Rule + for i := range candidates { + candidate := &candidates[i] + if rule == nil || candidate == nil { + continue + } + candidateName := relabeledAlertName(candidate) + if rule.Name == "" || candidateName == "" || rule.Name != candidateName { + continue + } + if canonicalizePromQL(rule.Query) != canonicalizePromQL(candidate.Expr.String()) { + continue + } + if !durationMatches(rule.Duration, candidate.For) { + continue + } + if relaxed != nil { + // ambiguous + relaxed = nil + break + } + relaxed = candidate + } + if relaxed != nil { + return relaxed + } + + // Fallback: if alertname is globally unique, avoid brittle PromQL/metadata matching. + // This helps when Prometheus stringifies PromQL differently than PrometheusRule YAML + // (e.g. label matcher ordering). 
+ if len(candidates) == 1 { + return &candidates[0] + } + return nil +} + +func promRuleMatchesRelabeled(rule *k8s.PrometheusRule, candidate *monitoringv1.Rule) bool { + if rule == nil || candidate == nil { + return false + } + candidateName := relabeledAlertName(candidate) + if rule.Name == "" || candidateName == "" || rule.Name != candidateName { + return false + } + if canonicalizePromQL(rule.Query) != canonicalizePromQL(candidate.Expr.String()) { + return false + } + if !durationMatches(rule.Duration, candidate.For) { + return false + } + if !stringMapEqual(filterBusinessLabels(rule.Labels), filterBusinessLabels(candidate.Labels)) { + return false + } + return true +} + +func canonicalizePromQL(in string) string { + s := strings.TrimSpace(in) + if s == "" { + return "" + } + expr, err := parser.ParseExpr(s) + if err == nil && expr != nil { + parser.Inspect(expr, func(node parser.Node, _ []parser.Node) error { + switch n := node.(type) { + case *parser.VectorSelector: + sort.Slice(n.LabelMatchers, func(i, j int) bool { + mi, mj := n.LabelMatchers[i], n.LabelMatchers[j] + if mi == nil || mj == nil { + return mi != nil + } + if mi.Name != mj.Name { + return mi.Name < mj.Name + } + if mi.Type != mj.Type { + return mi.Type < mj.Type + } + return mi.Value < mj.Value + }) + case *parser.AggregateExpr: + sort.Strings(n.Grouping) + case *parser.BinaryExpr: + if n.VectorMatching != nil { + sort.Strings(n.VectorMatching.MatchingLabels) + sort.Strings(n.VectorMatching.Include) + } + } + return nil + }) + + return expr.String() + } + return normalizeSpaceOutsideQuotes(s) +} + +func normalizeSpaceOutsideQuotes(in string) string { + if in == "" { + return "" + } + in = strings.TrimSpace(in) + + var b strings.Builder + b.Grow(len(in)) + + inQuote := false + escaped := false + pendingSpace := false + lastNoSpaceToken := false + + isNoSpaceToken := func(r rune) bool { + switch r { + case '(', ')', '{', '}', ',', '+', '-', '*', '/', '%', '^', '=', '!', '<', '>': + return true + 
default: + return false + } + } + + for _, r := range in { + if escaped { + if pendingSpace { + if !lastNoSpaceToken { + b.WriteByte(' ') + } + pendingSpace = false + } + b.WriteRune(r) + escaped = false + lastNoSpaceToken = false + continue + } + + if inQuote && r == '\\' { + if pendingSpace { + if !lastNoSpaceToken { + b.WriteByte(' ') + } + pendingSpace = false + } + b.WriteRune(r) + escaped = true + lastNoSpaceToken = false + continue + } + + if r == '"' { + if pendingSpace { + if !lastNoSpaceToken { + b.WriteByte(' ') + } + pendingSpace = false + } + inQuote = !inQuote + b.WriteRune(r) + lastNoSpaceToken = false + continue + } + + if !inQuote && unicode.IsSpace(r) { + pendingSpace = true + continue + } + + if pendingSpace && !lastNoSpaceToken && !isNoSpaceToken(r) { + b.WriteByte(' ') + } + pendingSpace = false + + b.WriteRune(r) + lastNoSpaceToken = !inQuote && isNoSpaceToken(r) + } + + return strings.TrimSpace(b.String()) +} + +func durationMatches(seconds float64, duration *monitoringv1.Duration) bool { + if duration == nil { + return seconds == 0 + } + parsed, err := time.ParseDuration(string(*duration)) + if err != nil { + return false + } + return math.Abs(parsed.Seconds()-seconds) < 0.001 +} + +func stringMapEqual(a, b map[string]string) bool { + if len(a) == 0 && len(b) == 0 { + return true + } + if len(a) != len(b) { + return false + } + for k, v := range a { + if b[k] != v { + return false + } + } + return true +} + +func ruleAlertLabelsMatch(req *k8s.GetRulesRequest, alert *k8s.PrometheusRuleAlert) bool { + for key, value := range req.Labels { + if alertValue, exists := alert.Labels[key]; !exists || alertValue != value { + return false + } + } + + return true +} diff --git a/pkg/management/get_rules_test.go b/pkg/management/get_rules_test.go new file mode 100644 index 000000000..42ccddb39 --- /dev/null +++ b/pkg/management/get_rules_test.go @@ -0,0 +1,421 @@ +package management_test + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/relabel" + "k8s.io/apimachinery/pkg/util/intstr" + + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +var _ = Describe("GetRules", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + ) + + BeforeEach(func() { + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + }) + + Context("when PrometheusAlerts returns rule groups", func() { + BeforeEach(func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetRulesFunc: func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + return []k8s.PrometheusRuleGroup{ + { + Name: "group-a", + Rules: []k8s.PrometheusRule{ + { + Name: "rule-a", + Type: k8s.RuleTypeAlerting, + Alerts: []k8s.PrometheusRuleAlert{ + { + State: "firing", + Labels: map[string]string{ + "alertname": "Alert1", + "severity": "warning", + }, + }, + { + State: "pending", + Labels: map[string]string{ + "alertname": "Alert2", + "severity": "critical", + }, + }, + { + State: "inactive", + Labels: map[string]string{ + "alertname": "Alert3", + "severity": "warning", + }, + }, + }, + }, + }, + }, + }, nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{} + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{ + { + SourceLabels: model.LabelNames{"alertname"}, + Regex: 
relabel.MustNewRegexp("Alert2"), + Action: relabel.Drop, + NameValidationScheme: model.UTF8Validation, + }, + { + SourceLabels: model.LabelNames{"alertname"}, + Regex: relabel.MustNewRegexp("Alert1"), + TargetLabel: "severity", + Replacement: "critical", + Action: relabel.Replace, + NameValidationScheme: model.UTF8Validation, + }, + } + }, + } + } + }) + + It("applies relabel configs to pending/firing alerts only", func() { + groups, err := client.GetRules(ctx, k8s.GetRulesRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(groups).To(HaveLen(1)) + + rules := groups[0].Rules + Expect(rules).To(HaveLen(1)) + + alerts := rules[0].Alerts + Expect(alerts).To(HaveLen(2)) + Expect(alerts[0].Labels["alertname"]).To(Equal("Alert1")) + Expect(alerts[0].Labels["severity"]).To(Equal("critical")) + Expect(alerts[1].Labels["alertname"]).To(Equal("Alert3")) + Expect(alerts[1].Labels["severity"]).To(Equal("warning")) + }) + + It("filters alerts by state and labels", func() { + req := k8s.GetRulesRequest{ + State: "firing", + Labels: map[string]string{"severity": "critical"}, + } + groups, err := client.GetRules(ctx, req) + Expect(err).NotTo(HaveOccurred()) + Expect(groups).To(HaveLen(1)) + + alerts := groups[0].Rules[0].Alerts + Expect(alerts).To(HaveLen(1)) + Expect(alerts[0].Labels["alertname"]).To(Equal("Alert1")) + Expect(alerts[0].Labels["severity"]).To(Equal("critical")) + }) + + It("drops non-matching alerting rules when filters are provided", func() { + req := k8s.GetRulesRequest{ + State: "firing", + Labels: map[string]string{"severity": "does-not-exist"}, + } + groups, err := client.GetRules(ctx, req) + Expect(err).NotTo(HaveOccurred()) + Expect(groups).To(BeEmpty()) + }) + + It("adds managed-by labels from relabeled rules", func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetRulesFunc: func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + return 
[]k8s.PrometheusRuleGroup{ + { + Name: "group-a", + Rules: []k8s.PrometheusRule{ + { + Name: "AlertWithManagedBy", + Type: "alerting", + Query: "up == 0", + Duration: 0, + Labels: map[string]string{"severity": "critical"}, + Annotations: map[string]string{ + "summary": "test alert", + }, + }, + }, + }, + }, nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{ + { + Alert: "AlertWithManagedBy", + Expr: intstr.FromString("up ==\n 0"), + Labels: map[string]string{ + "severity": "critical", + k8s.AlertRuleLabelId: "alert-id-1", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "platform-rule", + managementlabels.RuleManagedByLabel: "operator", + managementlabels.RelabelConfigManagedByLabel: "gitops", + }, + Annotations: map[string]string{ + "summary": "test alert", + }, + }, + } + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + + groups, err := client.GetRules(ctx, k8s.GetRulesRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(groups).To(HaveLen(1)) + + rule := groups[0].Rules[0] + Expect(rule.Labels[k8s.AlertRuleLabelId]).To(Equal("alert-id-1")) + Expect(rule.Labels[k8s.PrometheusRuleLabelNamespace]).To(Equal("openshift-monitoring")) + Expect(rule.Labels[k8s.PrometheusRuleLabelName]).To(Equal("platform-rule")) + Expect(rule.Labels[managementlabels.RuleManagedByLabel]).To(Equal("operator")) + Expect(rule.Labels[managementlabels.RelabelConfigManagedByLabel]).To(Equal("gitops")) + }) + + It("enriches rule labels with id, source, classification, PrometheusRule metadata, and ARC-updated labels", func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetRulesFunc: func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, 
error) { + return []k8s.PrometheusRuleGroup{ + { + Name: "group-a", + Rules: []k8s.PrometheusRule{ + { + Name: "ARCUpdatedRule", + Type: "alerting", + Query: "up == 0", + Duration: 0, + Labels: map[string]string{ + "severity": "warning", + k8s.AlertSourceLabel: k8s.AlertSourcePlatform, + }, + }, + }, + }, + }, nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{ + { + Alert: "ARCUpdatedRule", + Expr: intstr.FromString("up ==\n 0"), + Labels: map[string]string{ + // ARC-updated / relabeled labels + "severity": "critical", + "team": "sre", + + // Required enrichment + k8s.AlertRuleLabelId: "rid-arc-1", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "platform-rule", + + // Classification labels + k8s.AlertRuleClassificationComponentKey: "compute", + k8s.AlertRuleClassificationLayerKey: "cluster", + + // Managed-by labels (GitOps/Operator signals) + managementlabels.RuleManagedByLabel: "operator", + managementlabels.RelabelConfigManagedByLabel: "gitops", + }, + }, + } + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + + groups, err := client.GetRules(ctx, k8s.GetRulesRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(groups).To(HaveLen(1)) + Expect(groups[0].Rules).To(HaveLen(1)) + + rule := groups[0].Rules[0] + + // Source should be preserved from rules API response + Expect(rule.Labels[k8s.AlertSourceLabel]).To(Equal(k8s.AlertSourcePlatform)) + + // Enrichment labels should be present + Expect(rule.Labels[k8s.AlertRuleLabelId]).To(Equal("rid-arc-1")) + Expect(rule.Labels[k8s.PrometheusRuleLabelNamespace]).To(Equal("openshift-monitoring")) + Expect(rule.Labels[k8s.PrometheusRuleLabelName]).To(Equal("platform-rule")) + + // Classification labels should be present + 
Expect(rule.Labels[k8s.AlertRuleClassificationComponentKey]).To(Equal("compute")) + Expect(rule.Labels[k8s.AlertRuleClassificationLayerKey]).To(Equal("cluster")) + + // ARC-updated labels should reflect the relabeled rules view + Expect(rule.Labels["severity"]).To(Equal("critical")) + Expect(rule.Labels["team"]).To(Equal("sre")) + + // Managed-by labels should be present + Expect(rule.Labels[managementlabels.RuleManagedByLabel]).To(Equal("operator")) + Expect(rule.Labels[managementlabels.RelabelConfigManagedByLabel]).To(Equal("gitops")) + }) + + It("enriches rule labels when relabeled rule has alertname label but empty Alert field", func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetRulesFunc: func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + return []k8s.PrometheusRuleGroup{ + { + Name: "group-a", + Rules: []k8s.PrometheusRule{ + { + Name: "EmptyAlertFieldRule", + Type: "alerting", + Query: "up == 0", + Duration: 0, + Labels: map[string]string{ + "severity": "warning", + k8s.AlertSourceLabel: k8s.AlertSourcePlatform, + }, + }, + }, + }, + }, nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{ + { + Alert: "", + Expr: intstr.FromString("up ==\n 0"), + Labels: map[string]string{ + managementlabels.AlertNameLabel: "EmptyAlertFieldRule", + "severity": "critical", + k8s.AlertRuleLabelId: "rid-empty-alert-1", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "platform-rule", + }, + }, + } + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + + groups, err := client.GetRules(ctx, k8s.GetRulesRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(groups).To(HaveLen(1)) + 
Expect(groups[0].Rules).To(HaveLen(1)) + + rule := groups[0].Rules[0] + Expect(rule.Labels[k8s.AlertSourceLabel]).To(Equal(k8s.AlertSourcePlatform)) + Expect(rule.Labels[k8s.AlertRuleLabelId]).To(Equal("rid-empty-alert-1")) + Expect(rule.Labels[k8s.PrometheusRuleLabelNamespace]).To(Equal("openshift-monitoring")) + Expect(rule.Labels[k8s.PrometheusRuleLabelName]).To(Equal("platform-rule")) + Expect(rule.Labels["severity"]).To(Equal("critical")) + }) + + It("does not guess when multiple relabeled candidates match relaxed criteria", func() { + mockK8s.PrometheusAlertsFunc = func() k8s.PrometheusAlertsInterface { + return &testutils.MockPrometheusAlertsInterface{ + GetRulesFunc: func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + return []k8s.PrometheusRuleGroup{ + { + Name: "group-a", + Rules: []k8s.PrometheusRule{ + { + Name: "AmbiguousRule", + Type: "alerting", + Query: "up == 0", + Duration: 0, + Labels: map[string]string{ + "severity": "warning", + k8s.AlertSourceLabel: k8s.AlertSourcePlatform, + }, + }, + }, + }, + }, nil + }, + } + } + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{ + { + Alert: "", + Expr: intstr.FromString("up ==\n 0"), + Labels: map[string]string{ + managementlabels.AlertNameLabel: "AmbiguousRule", + "severity": "critical", + k8s.AlertRuleLabelId: "rid-amb-1", + }, + }, + { + Alert: "", + Expr: intstr.FromString("up==0"), + Labels: map[string]string{ + managementlabels.AlertNameLabel: "AmbiguousRule", + "severity": "critical", + k8s.AlertRuleLabelId: "rid-amb-2", + }, + }, + } + }, + ConfigFunc: func() []*relabel.Config { + return []*relabel.Config{} + }, + } + } + + groups, err := client.GetRules(ctx, k8s.GetRulesRequest{}) + Expect(err).NotTo(HaveOccurred()) + Expect(groups).To(HaveLen(1)) + Expect(groups[0].Rules).To(HaveLen(1)) + + rule := 
groups[0].Rules[0] + Expect(rule.Labels[k8s.AlertSourceLabel]).To(Equal(k8s.AlertSourcePlatform)) + Expect(rule.Labels).NotTo(HaveKey(k8s.AlertRuleLabelId)) + Expect(rule.Labels["severity"]).To(Equal("warning")) + }) + }) +}) diff --git a/pkg/management/label_utils.go b/pkg/management/label_utils.go new file mode 100644 index 000000000..d83b49076 --- /dev/null +++ b/pkg/management/label_utils.go @@ -0,0 +1,29 @@ +package management + +import ( + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +// isProtectedLabel returns true for labels we will not modify via ARC for platform rules. +// These carry provenance or rule identity and must remain intact. +var protectedLabels = map[string]bool{ + managementlabels.AlertNameLabel: true, + k8s.AlertRuleLabelId: true, +} + +func isProtectedLabel(label string) bool { + return protectedLabels[label] +} + +// isValidSeverity validates allowed severity values. +var validSeverities = map[string]bool{ + "critical": true, + "warning": true, + "info": true, + "none": true, +} + +func isValidSeverity(s string) bool { + return validSeverities[s] +} diff --git a/pkg/management/list_rules.go b/pkg/management/list_rules.go new file mode 100644 index 000000000..50c5a0fcb --- /dev/null +++ b/pkg/management/list_rules.go @@ -0,0 +1,66 @@ +package management + +import ( + "context" + + "github.com/openshift/monitoring-plugin/pkg/k8s" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +func (c *client) ListRules(ctx context.Context, prOptions PrometheusRuleOptions, arOptions AlertRuleOptions) ([]monitoringv1.Rule, error) { + if prOptions.Name != "" && prOptions.Namespace == "" { + return nil, &ValidationError{Message: "namespace is required when prometheusRuleName is specified"} + } + + allRules := c.k8sClient.RelabeledRules().List(ctx) + var filteredRules []monitoringv1.Rule + + for _, rule := range allRules { + // Filter by 
PrometheusRule name and namespace if specified + if prOptions.Name != "" && prOptions.Namespace != "" { + namespace := rule.Labels[k8s.PrometheusRuleLabelNamespace] + name := rule.Labels[k8s.PrometheusRuleLabelName] + if namespace != prOptions.Namespace || name != prOptions.Name { + continue + } + } + + // Apply alert rule filters + if !c.matchesAlertRuleFilters(rule, arOptions) { + continue + } + + filteredRules = append(filteredRules, rule) + } + + return filteredRules, nil +} + +func (c *client) matchesAlertRuleFilters(rule monitoringv1.Rule, arOptions AlertRuleOptions) bool { + // Filter by alert name + if arOptions.Name != "" && string(rule.Alert) != arOptions.Name { + return false + } + + // Filter by source (platform) + if arOptions.Source == k8s.AlertSourcePlatform { + source, exists := rule.Labels[k8s.AlertSourceLabel] + if !exists { + return false + } + + return source == k8s.AlertSourcePlatform + } + + // Filter by labels + if len(arOptions.Labels) > 0 { + for key, value := range arOptions.Labels { + ruleValue, exists := rule.Labels[key] + if !exists || ruleValue != value { + return false + } + } + } + + return true +} diff --git a/pkg/management/list_rules_test.go b/pkg/management/list_rules_test.go new file mode 100644 index 000000000..75a4fc2aa --- /dev/null +++ b/pkg/management/list_rules_test.go @@ -0,0 +1,286 @@ +package management_test + +import ( + "context" + "errors" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/apimachinery/pkg/util/intstr" + + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("ListRules", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + ) + + var ( + rule1 = monitoringv1.Rule{ + Alert: "Alert1", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + k8s.PrometheusRuleLabelNamespace: "namespace1", + k8s.PrometheusRuleLabelName: "rule1", + }, + } + + rule2 = monitoringv1.Rule{ + Alert: "Alert2", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "critical", + k8s.PrometheusRuleLabelNamespace: "namespace1", + k8s.PrometheusRuleLabelName: "rule2", + }, + } + + rule3 = monitoringv1.Rule{ + Alert: "Alert3", + Expr: intstr.FromString("down == 1"), + Labels: map[string]string{ + "severity": "warning", + k8s.PrometheusRuleLabelNamespace: "namespace2", + k8s.PrometheusRuleLabelName: "rule3", + }, + } + + platformRule = monitoringv1.Rule{ + Alert: "PlatformAlert", + Expr: intstr.FromString("node_down == 1"), + Labels: map[string]string{ + "severity": "critical", + k8s.AlertSourceLabel: k8s.AlertSourcePlatform, + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "platform-rule", + }, + } + + customLabelRule = monitoringv1.Rule{ + Alert: "CustomLabelAlert", + Expr: intstr.FromString("custom == 1"), + Labels: map[string]string{ + "severity": "info", + "team": "backend", + "env": "production", + k8s.PrometheusRuleLabelNamespace: "namespace1", + k8s.PrometheusRuleLabelName: "rule1", + }, + } + ) + + BeforeEach(func() { + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + + 
mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{rule1, rule2, rule3, platformRule, customLabelRule} + }, + } + } + }) + + Context("when PrometheusRule Name is provided without Namespace", func() { + It("returns a ValidationError", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "rule1", + } + arOptions := management.AlertRuleOptions{} + + _, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).To(HaveOccurred()) + + var ve *management.ValidationError + Expect(errors.As(err, &ve)).To(BeTrue(), "expected error to be a ValidationError") + Expect(err.Error()).To(ContainSubstring("namespace is required when prometheusRuleName is specified")) + }) + }) + + Context("when no filters are provided", func() { + It("returns all rules", func() { + prOptions := management.PrometheusRuleOptions{} + arOptions := management.AlertRuleOptions{} + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(5)) + }) + }) + + Context("when filtering by PrometheusRule Name and Namespace", func() { + It("returns only rules from the specified PrometheusRule", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "rule1", + Namespace: "namespace1", + } + arOptions := management.AlertRuleOptions{} + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(2)) + Expect(rules[0].Alert).To(BeElementOf("Alert1", "CustomLabelAlert")) + Expect(rules[1].Alert).To(BeElementOf("Alert1", "CustomLabelAlert")) + }) + + It("returns empty list when no rules match", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "nonexistent", + Namespace: "namespace1", + } + arOptions := management.AlertRuleOptions{} + + rules, err := client.ListRules(ctx, prOptions, arOptions) + 
Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(0)) + }) + }) + + Context("when filtering by alert name", func() { + It("returns only rules with matching alert name", func() { + prOptions := management.PrometheusRuleOptions{} + arOptions := management.AlertRuleOptions{ + Name: "Alert1", + } + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(1)) + Expect(rules[0].Alert).To(Equal("Alert1")) + }) + + It("returns empty list when alert name doesn't match", func() { + prOptions := management.PrometheusRuleOptions{} + arOptions := management.AlertRuleOptions{ + Name: "NonexistentAlert", + } + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(0)) + }) + }) + + Context("when filtering by source=platform", func() { + It("returns only platform rules", func() { + prOptions := management.PrometheusRuleOptions{} + arOptions := management.AlertRuleOptions{ + Source: k8s.AlertSourcePlatform, + } + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(1)) + Expect(rules[0].Alert).To(Equal("PlatformAlert")) + Expect(rules[0].Labels[k8s.AlertSourceLabel]).To(Equal(k8s.AlertSourcePlatform)) + }) + }) + + Context("when filtering by labels", func() { + It("returns rules matching a single label", func() { + prOptions := management.PrometheusRuleOptions{} + arOptions := management.AlertRuleOptions{ + Labels: map[string]string{ + "severity": "warning", + }, + } + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(2)) + }) + + It("returns rules matching multiple labels", func() { + prOptions := management.PrometheusRuleOptions{} + arOptions := management.AlertRuleOptions{ + Labels: map[string]string{ + "team": "backend", + "env": "production", + }, + } + + rules, err := client.ListRules(ctx, 
prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(1)) + Expect(rules[0].Alert).To(Equal("CustomLabelAlert")) + }) + + It("returns empty list when labels don't match", func() { + prOptions := management.PrometheusRuleOptions{} + arOptions := management.AlertRuleOptions{ + Labels: map[string]string{ + "nonexistent": "value", + }, + } + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(0)) + }) + }) + + Context("when combining multiple filters", func() { + It("returns rules matching all filters", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "rule1", + Namespace: "namespace1", + } + arOptions := management.AlertRuleOptions{ + Labels: map[string]string{ + "severity": "warning", + }, + } + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(1)) + Expect(rules[0].Alert).To(Equal("Alert1")) + }) + + It("returns empty list when some filters don't match", func() { + prOptions := management.PrometheusRuleOptions{ + Name: "rule1", + Namespace: "namespace1", + } + arOptions := management.AlertRuleOptions{ + Labels: map[string]string{ + "severity": "critical", + }, + } + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(0)) + }) + }) + + Context("when RelabeledRules returns empty list", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + ListFunc: func(ctx context.Context) []monitoringv1.Rule { + return []monitoringv1.Rule{} + }, + } + } + }) + + It("returns empty list", func() { + prOptions := management.PrometheusRuleOptions{} + arOptions := management.AlertRuleOptions{} + + rules, err := client.ListRules(ctx, prOptions, arOptions) + Expect(err).NotTo(HaveOccurred()) + Expect(rules).To(HaveLen(0)) + }) + }) +}) 
diff --git a/pkg/management/management.go b/pkg/management/management.go new file mode 100644 index 000000000..cb47521b4 --- /dev/null +++ b/pkg/management/management.go @@ -0,0 +1,16 @@ +package management + +import ( + "k8s.io/apimachinery/pkg/types" + + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +type client struct { + k8sClient k8s.Client + overrideNamespace string +} + +func (c *client) IsPlatformAlertRule(prId types.NamespacedName) bool { + return c.k8sClient.Namespace().IsClusterMonitoringNamespace(prId.Namespace) +} diff --git a/pkg/management/management_suite_test.go b/pkg/management/management_suite_test.go new file mode 100644 index 000000000..b2dd05b63 --- /dev/null +++ b/pkg/management/management_suite_test.go @@ -0,0 +1,19 @@ +package management_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/prometheus/common/model" +) + +var _ = BeforeSuite(func() { + // Set validation scheme globally for all tests that use relabel configs + model.NameValidationScheme = model.LegacyValidation +}) + +func TestManagement(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Management Suite") +} diff --git a/pkg/management/override_namespace.go b/pkg/management/override_namespace.go new file mode 100644 index 000000000..8141b57fb --- /dev/null +++ b/pkg/management/override_namespace.go @@ -0,0 +1,36 @@ +package management + +import ( + "os" + "strings" +) + +const ( + // envMonitoringPluginNamespace allows explicit override in dev/test and in unusual deployments. + envMonitoringPluginNamespace = "MONITORING_PLUGIN_NAMESPACE" + // envPodNamespace is typically injected by Kubernetes (e.g. via the Downward API) and reflects the running pod namespace. + envPodNamespace = "POD_NAMESPACE" +) + +const serviceAccountNamespacePath = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + +// detectOverrideNamespace returns the namespace used to store/read shared override resources (e.g. ConfigMaps). 
+// +// Precedence is: +// - MONITORING_PLUGIN_NAMESPACE: explicit operator/dev override (most intentional) +// - POD_NAMESPACE: injected runtime namespace for the pod (common case) +// - serviceAccount namespace file: fallback when POD_NAMESPACE isn't set +func detectOverrideNamespace() string { + if ns := strings.TrimSpace(os.Getenv(envMonitoringPluginNamespace)); ns != "" { + return ns + } + if ns := strings.TrimSpace(os.Getenv(envPodNamespace)); ns != "" { + return ns + } + if data, err := os.ReadFile(serviceAccountNamespacePath); err == nil { + if ns := strings.TrimSpace(string(data)); ns != "" { + return ns + } + } + return "default" +} diff --git a/pkg/management/testutils/k8s_client_mock.go b/pkg/management/testutils/k8s_client_mock.go new file mode 100644 index 000000000..fcff2d303 --- /dev/null +++ b/pkg/management/testutils/k8s_client_mock.go @@ -0,0 +1,539 @@ +package testutils + +import ( + "context" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/prometheus/model/relabel" + + "github.com/openshift/monitoring-plugin/pkg/k8s" +) + +// MockClient is a mock implementation of k8s.Client interface +type MockClient struct { + TestConnectionFunc func(ctx context.Context) error + AlertingHealthFunc func(ctx context.Context) (k8s.AlertingHealth, error) + PrometheusAlertsFunc func() k8s.PrometheusAlertsInterface + PrometheusRulesFunc func() k8s.PrometheusRuleInterface + AlertRelabelConfigsFunc func() k8s.AlertRelabelConfigInterface + AlertingRulesFunc func() k8s.AlertingRuleInterface + RelabeledRulesFunc func() k8s.RelabeledRulesInterface + NamespaceFunc func() k8s.NamespaceInterface + ConfigMapsFunc func() k8s.ConfigMapInterface +} + +// TestConnection mocks the TestConnection method +func (m *MockClient) TestConnection(ctx context.Context) error { + if m.TestConnectionFunc != 
nil { + return m.TestConnectionFunc(ctx) + } + return nil +} + +// AlertingHealth mocks the AlertingHealth method +func (m *MockClient) AlertingHealth(ctx context.Context) (k8s.AlertingHealth, error) { + if m.AlertingHealthFunc != nil { + return m.AlertingHealthFunc(ctx) + } + return k8s.AlertingHealth{}, nil +} + +// PrometheusAlerts mocks the PrometheusAlerts method +func (m *MockClient) PrometheusAlerts() k8s.PrometheusAlertsInterface { + if m.PrometheusAlertsFunc != nil { + return m.PrometheusAlertsFunc() + } + return &MockPrometheusAlertsInterface{} +} + +// PrometheusRules mocks the PrometheusRules method +func (m *MockClient) PrometheusRules() k8s.PrometheusRuleInterface { + if m.PrometheusRulesFunc != nil { + return m.PrometheusRulesFunc() + } + return &MockPrometheusRuleInterface{} +} + +// AlertRelabelConfigs mocks the AlertRelabelConfigs method +func (m *MockClient) AlertRelabelConfigs() k8s.AlertRelabelConfigInterface { + if m.AlertRelabelConfigsFunc != nil { + return m.AlertRelabelConfigsFunc() + } + return &MockAlertRelabelConfigInterface{} +} + +// AlertingRules mocks the AlertingRules method +func (m *MockClient) AlertingRules() k8s.AlertingRuleInterface { + if m.AlertingRulesFunc != nil { + return m.AlertingRulesFunc() + } + return &MockAlertingRuleInterface{} +} + +// RelabeledRules mocks the RelabeledRules method +func (m *MockClient) RelabeledRules() k8s.RelabeledRulesInterface { + if m.RelabeledRulesFunc != nil { + return m.RelabeledRulesFunc() + } + return &MockRelabeledRulesInterface{} +} + +// Namespace mocks the Namespace method +func (m *MockClient) Namespace() k8s.NamespaceInterface { + if m.NamespaceFunc != nil { + return m.NamespaceFunc() + } + return &MockNamespaceInterface{} +} + +// ConfigMaps mocks the ConfigMaps method +func (m *MockClient) ConfigMaps() k8s.ConfigMapInterface { + if m.ConfigMapsFunc != nil { + return m.ConfigMapsFunc() + } + return &MockConfigMapInterface{} +} + +// MockPrometheusAlertsInterface is a mock 
implementation of k8s.PrometheusAlertsInterface +type MockPrometheusAlertsInterface struct { + GetAlertsFunc func(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) + GetRulesFunc func(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) + + // Storage for test data + ActiveAlerts []k8s.PrometheusAlert + RuleGroups []k8s.PrometheusRuleGroup +} + +func (m *MockPrometheusAlertsInterface) SetActiveAlerts(alerts []k8s.PrometheusAlert) { + m.ActiveAlerts = alerts +} + +func (m *MockPrometheusAlertsInterface) SetRuleGroups(groups []k8s.PrometheusRuleGroup) { + m.RuleGroups = groups +} + +// GetAlerts mocks the GetAlerts method +func (m *MockPrometheusAlertsInterface) GetAlerts(ctx context.Context, req k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) { + if m.GetAlertsFunc != nil { + return m.GetAlertsFunc(ctx, req) + } + + if m.ActiveAlerts != nil { + return m.ActiveAlerts, nil + } + return []k8s.PrometheusAlert{}, nil +} + +// GetRules mocks the GetRules method +func (m *MockPrometheusAlertsInterface) GetRules(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) { + if m.GetRulesFunc != nil { + return m.GetRulesFunc(ctx, req) + } + if m.RuleGroups != nil { + return m.RuleGroups, nil + } + return []k8s.PrometheusRuleGroup{}, nil +} + +// MockPrometheusRuleInterface is a mock implementation of k8s.PrometheusRuleInterface +type MockPrometheusRuleInterface struct { + ListFunc func(ctx context.Context, namespace string) ([]monitoringv1.PrometheusRule, error) + GetFunc func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) + UpdateFunc func(ctx context.Context, pr monitoringv1.PrometheusRule) error + DeleteFunc func(ctx context.Context, namespace string, name string) error + AddRuleFunc func(ctx context.Context, namespacedName types.NamespacedName, groupName string, rule monitoringv1.Rule) error + + // Storage for test data + PrometheusRules 
map[string]*monitoringv1.PrometheusRule +} + +func (m *MockPrometheusRuleInterface) SetPrometheusRules(rules map[string]*monitoringv1.PrometheusRule) { + m.PrometheusRules = rules +} + +// List mocks the List method +func (m *MockPrometheusRuleInterface) List(ctx context.Context, namespace string) ([]monitoringv1.PrometheusRule, error) { + if m.ListFunc != nil { + return m.ListFunc(ctx, namespace) + } + + var rules []monitoringv1.PrometheusRule + if m.PrometheusRules != nil { + for _, rule := range m.PrometheusRules { + if namespace == "" || rule.Namespace == namespace { + rules = append(rules, *rule) + } + } + } + return rules, nil +} + +// Get mocks the Get method +func (m *MockPrometheusRuleInterface) Get(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + if m.GetFunc != nil { + return m.GetFunc(ctx, namespace, name) + } + + key := namespace + "/" + name + if m.PrometheusRules != nil { + if rule, exists := m.PrometheusRules[key]; exists { + return rule, true, nil + } + } + + return nil, false, nil +} + +// Update mocks the Update method +func (m *MockPrometheusRuleInterface) Update(ctx context.Context, pr monitoringv1.PrometheusRule) error { + if m.UpdateFunc != nil { + return m.UpdateFunc(ctx, pr) + } + + key := pr.Namespace + "/" + pr.Name + if m.PrometheusRules == nil { + m.PrometheusRules = make(map[string]*monitoringv1.PrometheusRule) + } + m.PrometheusRules[key] = &pr + return nil +} + +// Delete mocks the Delete method +func (m *MockPrometheusRuleInterface) Delete(ctx context.Context, namespace string, name string) error { + if m.DeleteFunc != nil { + return m.DeleteFunc(ctx, namespace, name) + } + + key := namespace + "/" + name + if m.PrometheusRules != nil { + delete(m.PrometheusRules, key) + } + return nil +} + +// AddRule mocks the AddRule method +func (m *MockPrometheusRuleInterface) AddRule(ctx context.Context, namespacedName types.NamespacedName, groupName string, rule monitoringv1.Rule) error { + 
if m.AddRuleFunc != nil { + return m.AddRuleFunc(ctx, namespacedName, groupName, rule) + } + + key := namespacedName.Namespace + "/" + namespacedName.Name + if m.PrometheusRules == nil { + m.PrometheusRules = make(map[string]*monitoringv1.PrometheusRule) + } + + // Get or create PrometheusRule + pr, exists := m.PrometheusRules[key] + if !exists { + pr = &monitoringv1.PrometheusRule{ + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{}, + }, + } + pr.Name = namespacedName.Name + pr.Namespace = namespacedName.Namespace + m.PrometheusRules[key] = pr + } + + // Find or create the group + var group *monitoringv1.RuleGroup + for i := range pr.Spec.Groups { + if pr.Spec.Groups[i].Name == groupName { + group = &pr.Spec.Groups[i] + break + } + } + if group == nil { + pr.Spec.Groups = append(pr.Spec.Groups, monitoringv1.RuleGroup{ + Name: groupName, + Rules: []monitoringv1.Rule{}, + }) + group = &pr.Spec.Groups[len(pr.Spec.Groups)-1] + } + + // Add the new rule to the group + group.Rules = append(group.Rules, rule) + + return nil +} + +// MockAlertRelabelConfigInterface is a mock implementation of k8s.AlertRelabelConfigInterface +type MockAlertRelabelConfigInterface struct { + ListFunc func(ctx context.Context, namespace string) ([]osmv1.AlertRelabelConfig, error) + GetFunc func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) + CreateFunc func(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) + UpdateFunc func(ctx context.Context, arc osmv1.AlertRelabelConfig) error + DeleteFunc func(ctx context.Context, namespace string, name string) error + + // Storage for test data + AlertRelabelConfigs map[string]*osmv1.AlertRelabelConfig +} + +func (m *MockAlertRelabelConfigInterface) SetAlertRelabelConfigs(configs map[string]*osmv1.AlertRelabelConfig) { + m.AlertRelabelConfigs = configs +} + +// List mocks the List method +func (m *MockAlertRelabelConfigInterface) List(ctx 
context.Context, namespace string) ([]osmv1.AlertRelabelConfig, error) { + if m.ListFunc != nil { + return m.ListFunc(ctx, namespace) + } + + var configs []osmv1.AlertRelabelConfig + if m.AlertRelabelConfigs != nil { + for _, config := range m.AlertRelabelConfigs { + if namespace == "" || config.Namespace == namespace { + configs = append(configs, *config) + } + } + } + return configs, nil +} + +// Get mocks the Get method +func (m *MockAlertRelabelConfigInterface) Get(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + if m.GetFunc != nil { + return m.GetFunc(ctx, namespace, name) + } + + key := namespace + "/" + name + if m.AlertRelabelConfigs != nil { + if config, exists := m.AlertRelabelConfigs[key]; exists { + return config, true, nil + } + } + + return nil, false, nil +} + +// Create mocks the Create method +func (m *MockAlertRelabelConfigInterface) Create(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) { + if m.CreateFunc != nil { + return m.CreateFunc(ctx, arc) + } + + key := arc.Namespace + "/" + arc.Name + if m.AlertRelabelConfigs == nil { + m.AlertRelabelConfigs = make(map[string]*osmv1.AlertRelabelConfig) + } + m.AlertRelabelConfigs[key] = &arc + return &arc, nil +} + +// Update mocks the Update method +func (m *MockAlertRelabelConfigInterface) Update(ctx context.Context, arc osmv1.AlertRelabelConfig) error { + if m.UpdateFunc != nil { + return m.UpdateFunc(ctx, arc) + } + + key := arc.Namespace + "/" + arc.Name + if m.AlertRelabelConfigs == nil { + m.AlertRelabelConfigs = make(map[string]*osmv1.AlertRelabelConfig) + } + m.AlertRelabelConfigs[key] = &arc + return nil +} + +// Delete mocks the Delete method +func (m *MockAlertRelabelConfigInterface) Delete(ctx context.Context, namespace string, name string) error { + if m.DeleteFunc != nil { + return m.DeleteFunc(ctx, namespace, name) + } + + key := namespace + "/" + name + if m.AlertRelabelConfigs != nil { + 
delete(m.AlertRelabelConfigs, key) + } + return nil +} + +// MockAlertingRuleInterface is a mock implementation of k8s.AlertingRuleInterface +type MockAlertingRuleInterface struct { + ListFunc func(ctx context.Context) ([]osmv1.AlertingRule, error) + GetFunc func(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) + CreateFunc func(ctx context.Context, ar osmv1.AlertingRule) (*osmv1.AlertingRule, error) + UpdateFunc func(ctx context.Context, ar osmv1.AlertingRule) error + DeleteFunc func(ctx context.Context, name string) error + + // Storage for test data + AlertingRules map[string]*osmv1.AlertingRule +} + +func (m *MockAlertingRuleInterface) SetAlertingRules(rules map[string]*osmv1.AlertingRule) { + m.AlertingRules = rules +} + +// List mocks the List method +func (m *MockAlertingRuleInterface) List(ctx context.Context) ([]osmv1.AlertingRule, error) { + if m.ListFunc != nil { + return m.ListFunc(ctx) + } + + var rules []osmv1.AlertingRule + if m.AlertingRules != nil { + for _, rule := range m.AlertingRules { + if rule.Namespace == k8s.ClusterMonitoringNamespace { + rules = append(rules, *rule) + } + } + } + return rules, nil +} + +// Get mocks the Get method +func (m *MockAlertingRuleInterface) Get(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + if m.GetFunc != nil { + return m.GetFunc(ctx, name) + } + + key := k8s.ClusterMonitoringNamespace + "/" + name + if m.AlertingRules != nil { + if rule, exists := m.AlertingRules[key]; exists { + return rule, true, nil + } + } + + return nil, false, nil +} + +// Create mocks the Create method +func (m *MockAlertingRuleInterface) Create(ctx context.Context, ar osmv1.AlertingRule) (*osmv1.AlertingRule, error) { + if m.CreateFunc != nil { + return m.CreateFunc(ctx, ar) + } + + key := ar.Namespace + "/" + ar.Name + if m.AlertingRules == nil { + m.AlertingRules = make(map[string]*osmv1.AlertingRule) + } + m.AlertingRules[key] = &ar + return &ar, nil +} + +// Update mocks the Update 
method +func (m *MockAlertingRuleInterface) Update(ctx context.Context, ar osmv1.AlertingRule) error { + if m.UpdateFunc != nil { + return m.UpdateFunc(ctx, ar) + } + + key := ar.Namespace + "/" + ar.Name + if m.AlertingRules == nil { + m.AlertingRules = make(map[string]*osmv1.AlertingRule) + } + m.AlertingRules[key] = &ar + return nil +} + +// Delete mocks the Delete method +func (m *MockAlertingRuleInterface) Delete(ctx context.Context, name string) error { + if m.DeleteFunc != nil { + return m.DeleteFunc(ctx, name) + } + + key := k8s.ClusterMonitoringNamespace + "/" + name + if m.AlertingRules != nil { + delete(m.AlertingRules, key) + } + return nil +} + +// MockRelabeledRulesInterface is a mock implementation of k8s.RelabeledRulesInterface +type MockRelabeledRulesInterface struct { + ListFunc func(ctx context.Context) []monitoringv1.Rule + GetFunc func(ctx context.Context, id string) (monitoringv1.Rule, bool) + ConfigFunc func() []*relabel.Config +} + +func (m *MockRelabeledRulesInterface) List(ctx context.Context) []monitoringv1.Rule { + if m.ListFunc != nil { + return m.ListFunc(ctx) + } + return []monitoringv1.Rule{} +} + +func (m *MockRelabeledRulesInterface) Get(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if m.GetFunc != nil { + return m.GetFunc(ctx, id) + } + return monitoringv1.Rule{}, false +} + +func (m *MockRelabeledRulesInterface) Config() []*relabel.Config { + if m.ConfigFunc != nil { + return m.ConfigFunc() + } + return []*relabel.Config{} +} + +// MockNamespaceInterface is a mock implementation of k8s.NamespaceInterface +type MockNamespaceInterface struct { + IsClusterMonitoringNamespaceFunc func(name string) bool + + // Storage for test data + MonitoringNamespaces map[string]bool +} + +func (m *MockNamespaceInterface) SetMonitoringNamespaces(namespaces map[string]bool) { + m.MonitoringNamespaces = namespaces +} + +// IsClusterMonitoringNamespace mocks the IsClusterMonitoringNamespace method +func (m *MockNamespaceInterface) 
IsClusterMonitoringNamespace(name string) bool {
+ if m.IsClusterMonitoringNamespaceFunc != nil {
+ return m.IsClusterMonitoringNamespaceFunc(name)
+ }
+ return m.MonitoringNamespaces[name]
+}
+
+// MockConfigMapInterface is a mock implementation of k8s.ConfigMapInterface
+type MockConfigMapInterface struct {
+ GetFunc func(ctx context.Context, namespace string, name string) (*corev1.ConfigMap, bool, error)
+ UpdateFunc func(ctx context.Context, cm corev1.ConfigMap) error
+ CreateFunc func(ctx context.Context, cm corev1.ConfigMap) (*corev1.ConfigMap, error)
+
+ // Storage
+ ConfigMaps map[string]*corev1.ConfigMap
+}
+
+func (m *MockConfigMapInterface) Get(ctx context.Context, namespace string, name string) (*corev1.ConfigMap, bool, error) {
+ if m.GetFunc != nil {
+ return m.GetFunc(ctx, namespace, name)
+ }
+ key := namespace + "/" + name
+ if m.ConfigMaps != nil {
+ if cm, ok := m.ConfigMaps[key]; ok {
+ return cm, true, nil
+ }
+ }
+ return nil, false, nil
+}
+
+func (m *MockConfigMapInterface) Update(ctx context.Context, cm corev1.ConfigMap) error {
+ if m.UpdateFunc != nil {
+ return m.UpdateFunc(ctx, cm)
+ }
+ key := cm.Namespace + "/" + cm.Name
+ if m.ConfigMaps == nil {
+ m.ConfigMaps = make(map[string]*corev1.ConfigMap)
+ }
+ copy := cm
+ m.ConfigMaps[key] = &copy
+ return nil
+}
+
+func (m *MockConfigMapInterface) Create(ctx context.Context, cm corev1.ConfigMap) (*corev1.ConfigMap, error) {
+ if m.CreateFunc != nil {
+ return m.CreateFunc(ctx, cm)
+ }
+ key := cm.Namespace + "/" + cm.Name
+ if m.ConfigMaps == nil {
+ m.ConfigMaps = make(map[string]*corev1.ConfigMap)
+ }
+ copy := cm
+ m.ConfigMaps[key] = &copy
+ return &copy, nil
+} diff --git a/pkg/management/types.go b/pkg/management/types.go new file mode 100644 index 000000000..473437e33 --- /dev/null +++ b/pkg/management/types.go @@ -0,0 +1,77 @@ +package management + +import ( + "context" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + + 
"github.com/openshift/monitoring-plugin/pkg/k8s" +) + +// Client is the interface for managing alert rules +type Client interface { + // ListRules lists all alert rules in the specified PrometheusRule resource + ListRules(ctx context.Context, prOptions PrometheusRuleOptions, arOptions AlertRuleOptions) ([]monitoringv1.Rule, error) + + // GetRuleById retrieves a specific alert rule by its ID + GetRuleById(ctx context.Context, alertRuleId string) (monitoringv1.Rule, error) + + // CreateUserDefinedAlertRule creates a new user-defined alert rule + CreateUserDefinedAlertRule(ctx context.Context, alertRule monitoringv1.Rule, prOptions PrometheusRuleOptions) (alertRuleId string, err error) + + // UpdateUserDefinedAlertRule updates an existing user-defined alert rule by its ID + // Returns the new rule ID after the update + UpdateUserDefinedAlertRule(ctx context.Context, alertRuleId string, alertRule monitoringv1.Rule) (newRuleId string, err error) + + // DeleteUserDefinedAlertRuleById deletes a user-defined alert rule by its ID + DeleteUserDefinedAlertRuleById(ctx context.Context, alertRuleId string) error + + // CreatePlatformAlertRule creates a new platform alert rule + CreatePlatformAlertRule(ctx context.Context, alertRule monitoringv1.Rule) (alertRuleId string, err error) + + // UpdatePlatformAlertRule updates an existing platform alert rule by its ID + // Platform alert rules can only have the labels updated through AlertRelabelConfigs + UpdatePlatformAlertRule(ctx context.Context, alertRuleId string, alertRule monitoringv1.Rule) error + + // DropPlatformAlertRule hides a platform alert by adding a scoped Drop relabel entry + DropPlatformAlertRule(ctx context.Context, alertRuleId string) error + + // RestorePlatformAlertRule restores a previously dropped platform alert by removing its Drop relabel entry + RestorePlatformAlertRule(ctx context.Context, alertRuleId string) error + + // GetAlerts retrieves Prometheus alerts + GetAlerts(ctx context.Context, req 
k8s.GetAlertsRequest) ([]k8s.PrometheusAlert, error) + // GetRules retrieves Prometheus alerting rules and active alerts + GetRules(ctx context.Context, req k8s.GetRulesRequest) ([]k8s.PrometheusRuleGroup, error) + + // GetAlertingHealth retrieves alerting health details + GetAlertingHealth(ctx context.Context) (k8s.AlertingHealth, error) + + // UpdateAlertRuleClassification updates component/layer for a single alert rule id + UpdateAlertRuleClassification(ctx context.Context, req UpdateRuleClassificationRequest) error + // BulkUpdateAlertRuleClassification updates classification for multiple rule ids + BulkUpdateAlertRuleClassification(ctx context.Context, items []UpdateRuleClassificationRequest) []error +} + +// PrometheusRuleOptions specifies options for selecting PrometheusRule resources and groups +type PrometheusRuleOptions struct { + // Name of the PrometheusRule resource where the alert rule will be added/listed from + Name string `json:"prometheusRuleName"` + + // Namespace of the PrometheusRule resource where the alert rule will be added/listed from + Namespace string `json:"prometheusRuleNamespace"` + + // GroupName of the RuleGroup within the PrometheusRule resource + GroupName string `json:"groupName"` +} + +type AlertRuleOptions struct { + // Name filters alert rules by alert name + Name string `json:"name,omitempty"` + + // Source filters alert rules by source type (platform or user-defined) + Source string `json:"source,omitempty"` + + // Labels filters alert rules by arbitrary label key-value pairs + Labels map[string]string `json:"labels,omitempty"` +} diff --git a/pkg/management/update_classification.go b/pkg/management/update_classification.go new file mode 100644 index 000000000..b789a7f57 --- /dev/null +++ b/pkg/management/update_classification.go @@ -0,0 +1,183 @@ +package management + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/openshift/monitoring-plugin/pkg/classification" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +// UpdateRuleClassificationRequest represents a single classification update +type UpdateRuleClassificationRequest struct { + RuleId string `json:"ruleId"` + Component *string `json:"openshift_io_alert_rule_component,omitempty"` + ComponentSet bool `json:"-"` + Layer *string `json:"openshift_io_alert_rule_layer,omitempty"` + LayerSet bool `json:"-"` + ComponentFrom *string `json:"openshift_io_alert_rule_component_from,omitempty"` + ComponentFromSet bool `json:"-"` + LayerFrom *string `json:"openshift_io_alert_rule_layer_from,omitempty"` + LayerFromSet bool `json:"-"` +} + +// UpdateAlertRuleClassification updates component/layer for a single alertRuleId +func (c *client) UpdateAlertRuleClassification(ctx context.Context, req UpdateRuleClassificationRequest) error { + if req.RuleId == "" { + return &ValidationError{Message: "ruleId is required"} + } + // Validate inputs if provided + if req.Component != nil && !classification.ValidateComponent(*req.Component) { + return &ValidationError{Message: fmt.Sprintf("invalid component %q", *req.Component)} + } + if req.Layer != nil && !classification.ValidateLayer(*req.Layer) { + return &ValidationError{Message: fmt.Sprintf("invalid layer %q (allowed: cluster, namespace)", *req.Layer)} + } + if req.ComponentFrom != nil { + v := strings.TrimSpace(*req.ComponentFrom) + if v != "" && !classification.ValidatePromLabelName(v) { + return &ValidationError{Message: fmt.Sprintf("invalid openshift_io_alert_rule_component_from %q (must be a valid Prometheus label name)", *req.ComponentFrom)} + } + } + if req.LayerFrom != nil { + v := strings.TrimSpace(*req.LayerFrom) + if v != "" && !classification.ValidatePromLabelName(v) { + return &ValidationError{Message: fmt.Sprintf("invalid openshift_io_alert_rule_layer_from 
%q (must be a valid Prometheus label name)", *req.LayerFrom)} + } + } + + // Find the base rule to locate its PrometheusRule namespace + rule, found := c.k8sClient.RelabeledRules().Get(ctx, req.RuleId) + if !found { + return &NotFoundError{Resource: "AlertRule", Id: req.RuleId} + } + + // Nothing to update. Treat as a no-op and avoid creating/updating ConfigMaps. + if !req.ComponentSet && !req.LayerSet && !req.ComponentFromSet && !req.LayerFromSet { + return nil + } + + ns := rule.Labels[k8s.PrometheusRuleLabelNamespace] + cmName := OverrideConfigMapName(ns) + overrideNamespace := c.overrideNamespace + + for i := 0; i < 3; i++ { + cm, exists, err := c.k8sClient.ConfigMaps().Get(ctx, overrideNamespace, cmName) + if err != nil { + return err + } + if !exists { + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cmName, + Namespace: overrideNamespace, + Labels: map[string]string{ + managementlabels.AlertClassificationOverridesTypeLabelKey: managementlabels.AlertClassificationOverridesTypeLabelValue, + managementlabels.AlertClassificationOverridesManagedByLabelKey: managementlabels.AlertClassificationOverridesManagedByLabelValue, + k8s.PrometheusRuleLabelNamespace: ns, + }, + }, + Data: map[string]string{}, + } + } + + key := classificationOverrideKey(req.RuleId) + var entry alertRuleClassificationOverridePayload + if raw, ok := cm.Data[key]; ok && raw != "" { + _ = json.Unmarshal([]byte(raw), &entry) + } + + if req.ComponentSet { + if req.Component == nil { + entry.Classification.Component = "" + } else { + entry.Classification.Component = *req.Component + } + } + if req.LayerSet { + if req.Layer == nil { + entry.Classification.Layer = "" + } else { + entry.Classification.Layer = strings.ToLower(strings.TrimSpace(*req.Layer)) + } + } + if req.ComponentFromSet { + if req.ComponentFrom == nil { + entry.Classification.ComponentFrom = "" + } else { + entry.Classification.ComponentFrom = strings.TrimSpace(*req.ComponentFrom) + } + } + if req.LayerFromSet { + 
if req.LayerFrom == nil { + entry.Classification.LayerFrom = "" + } else { + entry.Classification.LayerFrom = strings.TrimSpace(*req.LayerFrom) + } + } + + if entry.Classification.Component == "" && + entry.Classification.Layer == "" && + entry.Classification.ComponentFrom == "" && + entry.Classification.LayerFrom == "" { + delete(cm.Data, key) + } else { + entry.AlertName = rule.Alert + entry.RuleName = rule.Labels[k8s.PrometheusRuleLabelName] + entry.RuleNamespace = ns + encoded, err := json.Marshal(entry) + if err != nil { + return fmt.Errorf("failed to marshal updated classification: %w", err) + } + if cm.Data == nil { + cm.Data = make(map[string]string) + } + cm.Data[key] = string(encoded) + } + + if exists { + if cm.Labels == nil { + cm.Labels = map[string]string{} + } + cm.Labels[managementlabels.AlertClassificationOverridesTypeLabelKey] = managementlabels.AlertClassificationOverridesTypeLabelValue + cm.Labels[managementlabels.AlertClassificationOverridesManagedByLabelKey] = managementlabels.AlertClassificationOverridesManagedByLabelValue + cm.Labels[k8s.PrometheusRuleLabelNamespace] = ns + if err := c.k8sClient.ConfigMaps().Update(ctx, *cm); err != nil { + if apierrors.IsConflict(err) { + continue + } + return err + } + return nil + } + + if len(cm.Data) == 0 { + return nil + } + if _, err := c.k8sClient.ConfigMaps().Create(ctx, *cm); err != nil { + if apierrors.IsAlreadyExists(err) { + continue + } + return err + } + return nil + } + + return fmt.Errorf("failed to update %s after retries", cmName) +} + +// BulkUpdateAlertRuleClassification updates multiple entries; returns per-item errors collected by caller +func (c *client) BulkUpdateAlertRuleClassification(ctx context.Context, items []UpdateRuleClassificationRequest) []error { + errs := make([]error, len(items)) + for i := range items { + errs[i] = c.UpdateAlertRuleClassification(ctx, items[i]) + } + return errs +} diff --git a/pkg/management/update_classification_test.go 
b/pkg/management/update_classification_test.go new file mode 100644 index 000000000..d258d2bb3 --- /dev/null +++ b/pkg/management/update_classification_test.go @@ -0,0 +1,339 @@ +package management_test + +import ( + "context" + "encoding/base64" + "encoding/json" + "errors" + "os" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + corev1 "k8s.io/api/core/v1" + + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" +) + +var _ = Describe("UpdateAlertRuleClassification", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + + overrideNamespace = "plugin-test-ns" + ruleNamespace = "openshift-cluster-version" + ruleName = "cluster-version-operator" + ) + + makeRule := func(ruleId string) monitoringv1.Rule { + return monitoringv1.Rule{ + Alert: "CannotRetrieveUpdates", + Labels: map[string]string{ + k8s.AlertRuleLabelId: ruleId, + k8s.PrometheusRuleLabelNamespace: ruleNamespace, + k8s.PrometheusRuleLabelName: ruleName, + }, + } + } + + encodeKey := func(ruleId string) string { + return base64.RawURLEncoding.EncodeToString([]byte(ruleId)) + } + + BeforeEach(func() { + Expect(os.Setenv("MONITORING_PLUGIN_NAMESPACE", overrideNamespace)).To(Succeed()) + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + }) + + AfterEach(func() { + Expect(os.Unsetenv("MONITORING_PLUGIN_NAMESPACE")).To(Succeed()) + }) + + Context("validation", func() { + It("returns ValidationError when ruleId is empty", func() { + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{}) + Expect(err).To(HaveOccurred()) + + var ve *management.ValidationError + Expect(errors.As(err, &ve)).To(BeTrue()) + }) + + It("returns ValidationError on invalid 
layer", func() { + rule := makeRule("rid-1") + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return rule, true + }, + } + } + bad := "invalid" + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{ + RuleId: "rid-1", + Layer: &bad, + LayerSet: true, + Component: nil, + }) + Expect(err).To(HaveOccurred()) + var ve *management.ValidationError + Expect(errors.As(err, &ve)).To(BeTrue()) + }) + + It("returns ValidationError on invalid component", func() { + rule := makeRule("rid-1") + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return rule, true + }, + } + } + empty := "" + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{ + RuleId: "rid-1", + Component: &empty, + ComponentSet: true, + Layer: nil, + LayerSet: false, + }) + Expect(err).To(HaveOccurred()) + var ve *management.ValidationError + Expect(errors.As(err, &ve)).To(BeTrue()) + }) + + It("returns ValidationError on invalid openshift_io_alert_rule_component_from", func() { + rule := makeRule("rid-1") + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return rule, true + }, + } + } + bad := "bad-label" + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{ + RuleId: "rid-1", + ComponentFrom: &bad, + ComponentFromSet: true, + LayerFrom: nil, + LayerFromSet: false, + Component: nil, + ComponentSet: false, + Layer: nil, + LayerSet: false, + }) + Expect(err).To(HaveOccurred()) + var ve *management.ValidationError + Expect(errors.As(err, &ve)).To(BeTrue()) + }) + + 
It("returns ValidationError on invalid openshift_io_alert_rule_layer_from", func() { + rule := makeRule("rid-1") + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return rule, true + }, + } + } + bad := "1layer" + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{ + RuleId: "rid-1", + LayerFrom: &bad, + LayerFromSet: true, + }) + Expect(err).To(HaveOccurred()) + var ve *management.ValidationError + Expect(errors.As(err, &ve)).To(BeTrue()) + }) + }) + + It("returns NotFoundError when the base rule cannot be found", func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return monitoringv1.Rule{}, false + }, + } + } + + val := "cluster" + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{ + RuleId: "missing", + Layer: &val, + LayerSet: true, + }) + Expect(err).To(HaveOccurred()) + + var nf *management.NotFoundError + Expect(errors.As(err, &nf)).To(BeTrue()) + Expect(nf.Resource).To(Equal("AlertRule")) + }) + + It("treats empty payload as a no-op (no ConfigMap calls)", func() { + rule := makeRule("rid-1") + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return rule, true + }, + } + } + + calls := 0 + mockK8s.ConfigMapsFunc = func() k8s.ConfigMapInterface { + return &testutils.MockConfigMapInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*corev1.ConfigMap, bool, error) { + calls++ + return nil, false, nil + }, + UpdateFunc: func(ctx context.Context, cm corev1.ConfigMap) error { + calls++ + return nil + }, + CreateFunc: 
func(ctx context.Context, cm corev1.ConfigMap) (*corev1.ConfigMap, error) { + calls++ + return &cm, nil + }, + } + } + + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{RuleId: "rid-1"}) + Expect(err).NotTo(HaveOccurred()) + Expect(calls).To(Equal(0)) + }) + + It("persists normalized layer and component into the overrides ConfigMap", func() { + ruleId := "rid-1" + rule := makeRule(ruleId) + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + Expect(id).To(Equal(ruleId)) + return rule, true + }, + } + } + + cmStore := &testutils.MockConfigMapInterface{ConfigMaps: map[string]*corev1.ConfigMap{}} + mockK8s.ConfigMapsFunc = func() k8s.ConfigMapInterface { return cmStore } + + component := "team-a" + layer := " NaMeSpAcE " + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{ + RuleId: ruleId, + Component: &component, + ComponentSet: true, + Layer: &layer, + LayerSet: true, + }) + Expect(err).NotTo(HaveOccurred()) + + cmName := management.OverrideConfigMapName(ruleNamespace) + key := overrideNamespace + "/" + cmName + cm, ok := cmStore.ConfigMaps[key] + Expect(ok).To(BeTrue()) + + raw := cm.Data[encodeKey(ruleId)] + Expect(raw).NotTo(BeEmpty()) + + var payload struct { + Classification struct { + Component string `json:"openshift_io_alert_rule_component"` + Layer string `json:"openshift_io_alert_rule_layer"` + } `json:"classification"` + } + Expect(json.Unmarshal([]byte(raw), &payload)).To(Succeed()) + Expect(payload.Classification.Component).To(Equal("team-a")) + Expect(payload.Classification.Layer).To(Equal("namespace")) + }) + + It("persists component_from and layer_from into the overrides ConfigMap", func() { + ruleId := "rid-1" + rule := makeRule(ruleId) + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return 
&testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return rule, true + }, + } + } + + cmStore := &testutils.MockConfigMapInterface{ConfigMaps: map[string]*corev1.ConfigMap{}} + mockK8s.ConfigMapsFunc = func() k8s.ConfigMapInterface { return cmStore } + + componentFrom := "NaMe" + layerFrom := "LaYeR" + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{ + RuleId: ruleId, + ComponentFrom: &componentFrom, + ComponentFromSet: true, + LayerFrom: &layerFrom, + LayerFromSet: true, + }) + Expect(err).NotTo(HaveOccurred()) + + cmName := management.OverrideConfigMapName(ruleNamespace) + key := overrideNamespace + "/" + cmName + cm, ok := cmStore.ConfigMaps[key] + Expect(ok).To(BeTrue()) + + raw := cm.Data[encodeKey(ruleId)] + Expect(raw).NotTo(BeEmpty()) + + var payload struct { + Classification struct { + ComponentFrom string `json:"openshift_io_alert_rule_component_from"` + LayerFrom string `json:"openshift_io_alert_rule_layer_from"` + } `json:"classification"` + } + Expect(json.Unmarshal([]byte(raw), &payload)).To(Succeed()) + Expect(payload.Classification.ComponentFrom).To(Equal("NaMe")) + Expect(payload.Classification.LayerFrom).To(Equal("LaYeR")) + }) + + It("does not create an overrides ConfigMap when clearing a non-existent entry", func() { + ruleId := "rid-1" + rule := makeRule(ruleId) + + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + return rule, true + }, + } + } + + createCalls := 0 + updateCalls := 0 + cmStore := &testutils.MockConfigMapInterface{ + CreateFunc: func(ctx context.Context, cm corev1.ConfigMap) (*corev1.ConfigMap, error) { + createCalls++ + return &cm, nil + }, + UpdateFunc: func(ctx context.Context, cm corev1.ConfigMap) error { + updateCalls++ + return nil + }, + } + mockK8s.ConfigMapsFunc = 
func() k8s.ConfigMapInterface { return cmStore } + + err := client.UpdateAlertRuleClassification(ctx, management.UpdateRuleClassificationRequest{ + RuleId: ruleId, + Component: nil, + ComponentSet: true, + Layer: nil, + LayerSet: true, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(createCalls).To(Equal(0)) + Expect(updateCalls).To(Equal(0)) + }) +}) diff --git a/pkg/management/update_platform_alert_rule.go b/pkg/management/update_platform_alert_rule.go new file mode 100644 index 000000000..3bb0b500e --- /dev/null +++ b/pkg/management/update_platform_alert_rule.go @@ -0,0 +1,727 @@ +package management + +import ( + "context" + "fmt" + "regexp" + "sort" + "strings" + + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +func (c *client) UpdatePlatformAlertRule(ctx context.Context, alertRuleId string, alertRule monitoringv1.Rule) error { + rule, found := c.k8sClient.RelabeledRules().Get(ctx, alertRuleId) + if !found { + return &NotFoundError{Resource: "AlertRule", Id: alertRuleId} + } + + namespace := rule.Labels[k8s.PrometheusRuleLabelNamespace] + name := rule.Labels[k8s.PrometheusRuleLabelName] + + if !c.IsPlatformAlertRule(types.NamespacedName{Namespace: namespace, Name: name}) { + return &NotAllowedError{Message: "cannot update non-platform alert rule from " + namespace + "/" + name} + } + + // Fetch PR to validate metadata constraints as part of preconditions + var prMeta *monitoringv1.PrometheusRule + if pr, found, err := c.k8sClient.PrometheusRules().Get(ctx, namespace, name); err != nil { + return err + } else if found { + prMeta = pr + } + // Early validation on rule/PR (ARC checked later in applyLabelChangesViaAlertRelabelConfig) + if err := 
validatePlatformUpdatePreconditions(rule, prMeta, nil); err != nil { + return err + } + + originalRule, err := getOriginalPlatformRuleFromPR(prMeta, namespace, name, alertRuleId) + if err != nil { + return err + } + + // If alertname is explicitly provided and differs, reject + if v, ok := alertRule.Labels[managementlabels.AlertNameLabel]; ok { + if v != originalRule.Alert { + return &ValidationError{Message: fmt.Sprintf("label %q is immutable for platform alerts", managementlabels.AlertNameLabel)} + } + } + + // AlertRelabelConfigs for platform alerts must live in the central platform namespace + // Choose update strategy based on owning AlertingRule management: + // - GitOps-managed: block + // - Operator-managed: use ARC + // - Unmanaged: update AlertingRule directly + arName := rule.Labels[managementlabels.AlertingRuleLabelName] + if arName == "" { + arName = defaultAlertingRuleName + } + ar, arFound, arErr := c.getAlertingRule(ctx, arName) + if arErr != nil { + return arErr + } + if arFound && ar != nil { + if gitOpsManaged, operatorManaged := k8s.IsExternallyManagedObject(ar); gitOpsManaged { + return &NotAllowedError{Message: "This alert is managed by GitOps; edit it in Git."} + } else if operatorManaged { + // ARC path: update via AlertRelabelConfig + return c.applyLabelChangesViaAlertRelabelConfig(ctx, k8s.ClusterMonitoringNamespace, alertRuleId, *originalRule, alertRule.Labels) + } + // Direct AR path: update labels on the owning AlertingRule + return c.updateAlertingRuleLabels(ctx, ar, originalRule.Alert, alertRuleId, alertRule.Labels, arName) + } + + // No AR found: fall back to ARC path + return c.applyLabelChangesViaAlertRelabelConfig(ctx, k8s.ClusterMonitoringNamespace, alertRuleId, *originalRule, alertRule.Labels) +} + +// filterAndValidatePlatformLabelChanges filters out protected labels and validates platform-specific rules +func filterAndValidatePlatformLabelChanges(labels map[string]string) (map[string]string, error) { + filtered := 
make(map[string]string) + for k, v := range labels { + if !isProtectedLabel(k) { + filtered[k] = v + } + } + for k, v := range filtered { + if k == managementlabels.AlertNameLabel { + continue + } + if k == "severity" { + if v == "" { + return nil, &NotAllowedError{Message: fmt.Sprintf("label %q cannot be dropped for platform alerts", k)} + } + if !isValidSeverity(v) { + return nil, &ValidationError{Message: fmt.Sprintf("invalid severity %q: must be one of critical|warning|info|none", v)} + } + } + } + return filtered, nil +} + +// getAlertingRule wraps AlertingRule fetch with consistent error formatting. +func (c *client) getAlertingRule(ctx context.Context, name string) (*osmv1.AlertingRule, bool, error) { + ar, found, err := c.k8sClient.AlertingRules().Get(ctx, name) + if err != nil { + return nil, false, fmt.Errorf("failed to get AlertingRule %s: %w", name, err) + } + return ar, found, nil +} + +func (c *client) getOriginalPlatformRule(ctx context.Context, namespace string, name string, alertRuleId string) (*monitoringv1.Rule, error) { + pr, found, err := c.k8sClient.PrometheusRules().Get(ctx, namespace, name) + if err != nil { + return nil, fmt.Errorf("failed to get PrometheusRule %s/%s: %w", namespace, name, err) + } + + if !found { + return nil, &NotFoundError{ + Resource: "PrometheusRule", + Id: alertRuleId, + AdditionalInfo: fmt.Sprintf("PrometheusRule %s/%s not found", namespace, name), + } + } + + for groupIdx := range pr.Spec.Groups { + for ruleIdx := range pr.Spec.Groups[groupIdx].Rules { + rule := &pr.Spec.Groups[groupIdx].Rules[ruleIdx] + if ruleMatchesAlertRuleID(*rule, alertRuleId) { + return rule, nil + } + } + } + + return nil, &NotFoundError{ + Resource: "AlertRule", + Id: alertRuleId, + AdditionalInfo: fmt.Sprintf("in PrometheusRule %s/%s", namespace, name), + } +} + +// updateAlertingRuleLabels updates labels for the rule (by alert name) in the given AlertingRule. 
+func (c *client) updateAlertingRuleLabels( + ctx context.Context, + ar *osmv1.AlertingRule, + originalAlertName string, + alertRuleId string, + rawLabels map[string]string, + arName string, +) error { + filteredLabels, err := filterAndValidatePlatformLabelChanges(rawLabels) + if err != nil { + return err + } + target, found := findAlertByNameInAlertingRule(ar, originalAlertName) + if !found || target == nil { + return &NotFoundError{ + Resource: "AlertRule", + Id: alertRuleId, + AdditionalInfo: fmt.Sprintf("alert %q not found in AlertingRule %s", originalAlertName, arName), + } + } + // Apply label updates + if target.Labels == nil { + target.Labels = map[string]string{} + } + for k, v := range filteredLabels { + if v == "" { + delete(target.Labels, k) + } else { + target.Labels[k] = v + } + } + if err := c.k8sClient.AlertingRules().Update(ctx, *ar); err != nil { + return fmt.Errorf("failed to update AlertingRule %s: %w", ar.Name, err) + } + return nil +} + +// findAlertByNameInAlertingRule returns a pointer to the rule with the given alert name within the AlertingRule. 
+func findAlertByNameInAlertingRule(ar *osmv1.AlertingRule, alertName string) (*osmv1.Rule, bool) { + for gi := range ar.Spec.Groups { + for ri := range ar.Spec.Groups[gi].Rules { + r := &ar.Spec.Groups[gi].Rules[ri] + if r.Alert == alertName { + return r, true + } + } + } + return nil, false +} + +// getOriginalPlatformRuleFromPR returns the original rule from a pre-fetched PrometheusRule +func getOriginalPlatformRuleFromPR(pr *monitoringv1.PrometheusRule, namespace string, name string, alertRuleId string) (*monitoringv1.Rule, error) { + if pr == nil { + return nil, &NotFoundError{ + Resource: "PrometheusRule", + Id: alertRuleId, + AdditionalInfo: fmt.Sprintf("PrometheusRule %s/%s not found", namespace, name), + } + } + for groupIdx := range pr.Spec.Groups { + for ruleIdx := range pr.Spec.Groups[groupIdx].Rules { + rule := &pr.Spec.Groups[groupIdx].Rules[ruleIdx] + if ruleMatchesAlertRuleID(*rule, alertRuleId) { + return rule, nil + } + } + } + return nil, &NotFoundError{ + Resource: "AlertRule", + Id: alertRuleId, + AdditionalInfo: fmt.Sprintf("in PrometheusRule %s/%s", namespace, name), + } +} + +type labelChange struct { + action string + sourceLabel string + targetLabel string + value string +} + +func (c *client) applyLabelChangesViaAlertRelabelConfig(ctx context.Context, namespace string, alertRuleId string, originalRule monitoringv1.Rule, rawLabels map[string]string) error { + filtered, err := filterAndValidatePlatformLabelChanges(rawLabels) + if err != nil { + return err + } + // Build human-friendly, short ARC name: arc-- + relabeled, found := c.k8sClient.RelabeledRules().Get(ctx, alertRuleId) + if !found || relabeled.Labels == nil { + return &NotFoundError{ + Resource: "AlertRule", + Id: alertRuleId, + AdditionalInfo: "relabeled rule not found or has no labels", + } + } + prName := relabeled.Labels[k8s.PrometheusRuleLabelName] + arcName := k8s.GetAlertRelabelConfigName(prName, alertRuleId) + + existingArc, found, err := 
c.k8sClient.AlertRelabelConfigs().Get(ctx, namespace, arcName) + if err != nil { + return fmt.Errorf("failed to get AlertRelabelConfig %s/%s: %w", namespace, arcName, err) + } + // If ARC is GitOps-managed, block updates via API (centralized) + if err := validatePlatformUpdatePreconditions(relabeled, nil, relabelConfigIfFound(found, existingArc)); err != nil { + return err + } + + original := copyStringMap(originalRule.Labels) + existingOverrides, existingDrops := collectExistingFromARC(found, existingArc) + existingRuleDrops := getExistingRuleDrops(existingArc, alertRuleId) + effective := computeEffectiveLabels(original, existingOverrides, existingDrops) + + // If no actual label changes leave existing ARC as-is + if len(filtered) == 0 { + return nil + } + + desired := buildDesiredLabels(effective, filtered) + nextChanges := buildNextLabelChanges(original, desired) + + // If no changes remove ARC if it exists + if len(nextChanges) == 0 { + if found { + if err := c.k8sClient.AlertRelabelConfigs().Delete(ctx, namespace, arcName); err != nil { + return fmt.Errorf("failed to delete AlertRelabelConfig %s/%s: %w", namespace, arcName, err) + } + } + return nil + } + + relabelConfigs := c.buildRelabelConfigs(originalRule.Alert, original, alertRuleId, nextChanges) + relabelConfigs = appendPreservedRuleDrops(relabelConfigs, existingRuleDrops) + + if err := c.upsertAlertRelabelConfig(ctx, namespace, arcName, prName, originalRule.Alert, alertRuleId, found, existingArc, relabelConfigs); err != nil { + return err + } + + return nil +} + +// relabelConfigIfFound returns the ARC when found is true; otherwise returns nil. 
+func relabelConfigIfFound(found bool, arc *osmv1.AlertRelabelConfig) *osmv1.AlertRelabelConfig { + if found { + return arc + } + return nil +} + +func copyStringMap(in map[string]string) map[string]string { + out := make(map[string]string, len(in)) + for k, v := range in { + out[k] = v + } + return out +} + +func collectExistingFromARC(found bool, arc *osmv1.AlertRelabelConfig) (map[string]string, map[string]struct{}) { + overrides := map[string]string{} + drops := map[string]struct{}{} + if found && arc != nil { + for _, rc := range arc.Spec.Configs { + switch rc.Action { + case "Replace": + if rc.TargetLabel != "" && rc.Replacement != "" { + overrides[string(rc.TargetLabel)] = rc.Replacement + } + case "LabelDrop": + if rc.Regex != "" { + drops[rc.Regex] = struct{}{} + } + } + } + } + return overrides, drops +} + +func computeEffectiveLabels(original map[string]string, overrides map[string]string, drops map[string]struct{}) map[string]string { + effective := copyStringMap(original) + for k, v := range overrides { + effective[k] = v + } + for dropKey := range drops { + delete(effective, dropKey) + } + return effective +} + +func buildDesiredLabels(effective map[string]string, newLabels map[string]string) map[string]string { + desired := copyStringMap(effective) + for k, v := range newLabels { + if v == "" { + delete(desired, k) + } else { + desired[k] = v + } + } + return desired +} + +func buildNextLabelChanges(original map[string]string, desired map[string]string) []labelChange { + var changes []labelChange + for k, v := range desired { + if k == k8s.AlertRuleLabelId { + continue + } + if ov, ok := original[k]; !ok || ov != v { + changes = append(changes, labelChange{ + action: "Replace", + targetLabel: k, + value: v, + }) + } + } + return changes +} + +func getExistingRuleDrops(arc *osmv1.AlertRelabelConfig, alertRuleId string) []osmv1.RelabelConfig { + if arc == nil { + return nil + } + var out []osmv1.RelabelConfig + escaped := regexp.QuoteMeta(alertRuleId) 
+ for _, rc := range arc.Spec.Configs { + if rc.Action != "Drop" { + continue + } + if len(rc.SourceLabels) == 1 && rc.SourceLabels[0] == k8s.AlertRuleLabelId && + (rc.Regex == alertRuleId || rc.Regex == escaped) { + out = append(out, rc) + } + } + return out +} + +func appendPreservedRuleDrops(configs []osmv1.RelabelConfig, drops []osmv1.RelabelConfig) []osmv1.RelabelConfig { + if len(drops) == 0 { + return configs + } +nextDrop: + for _, d := range drops { + for _, cfg := range configs { + if cfg.Action == "Drop" && cfg.Regex == d.Regex && + len(cfg.SourceLabels) == 1 && cfg.SourceLabels[0] == k8s.AlertRuleLabelId { + continue nextDrop + } + } + configs = append(configs, d) + } + return configs +} + +func (c *client) upsertAlertRelabelConfig( + ctx context.Context, + namespace string, + arcName string, + prName string, + alertName string, + alertRuleId string, + found bool, + existingArc *osmv1.AlertRelabelConfig, + relabelConfigs []osmv1.RelabelConfig, +) error { + if found { + arc := existingArc + arc.Spec = osmv1.AlertRelabelConfigSpec{Configs: relabelConfigs} + if arc.Labels == nil { + arc.Labels = map[string]string{} + } + arc.Labels[managementlabels.ARCLabelPrometheusRuleNameKey] = prName + arc.Labels[managementlabels.ARCLabelAlertNameKey] = alertName + if arc.Annotations == nil { + arc.Annotations = map[string]string{} + } + arc.Annotations[managementlabels.ARCAnnotationAlertRuleIDKey] = alertRuleId + if err := c.k8sClient.AlertRelabelConfigs().Update(ctx, *arc); err != nil { + return fmt.Errorf("failed to update AlertRelabelConfig %s/%s: %w", arc.Namespace, arc.Name, err) + } + return nil + } + + arc := &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: arcName, + Namespace: namespace, + Labels: map[string]string{ + managementlabels.ARCLabelPrometheusRuleNameKey: prName, + managementlabels.ARCLabelAlertNameKey: alertName, + }, + Annotations: map[string]string{ + managementlabels.ARCAnnotationAlertRuleIDKey: alertRuleId, + }, + }, + Spec: 
osmv1.AlertRelabelConfigSpec{Configs: relabelConfigs}, + } + if _, err := c.k8sClient.AlertRelabelConfigs().Create(ctx, *arc); err != nil { + return fmt.Errorf("failed to create AlertRelabelConfig %s/%s: %w", arc.Namespace, arc.Name, err) + } + return nil +} +func (c *client) buildRelabelConfigs(alertName string, originalLabels map[string]string, alertRuleId string, changes []labelChange) []osmv1.RelabelConfig { + var configs []osmv1.RelabelConfig + + // 1) Conditionally stamp the rule id only for the exact rule by matching alertname + original static labels + // Build ordered source labels and exact anchored pattern for conditional Replace (non-dropping) + var keys []string + for k := range originalLabels { + // Do not rely on namespace for scoping; runtime alert namespace may differ from PR or be absent + if k == "namespace" { + continue + } + keys = append(keys, k) + } + sort.Strings(keys) + // Scope by alertname + original static labels only (ARCs apply to platform stack) + source := []osmv1.LabelName{managementlabels.AlertNameLabel} + values := []string{alertName} + for _, k := range keys { + source = append(source, osmv1.LabelName(k)) + values = append(values, originalLabels[k]) + } + pat := "^" + regexp.QuoteMeta(strings.Join(values, ";")) + "$" + configs = append(configs, osmv1.RelabelConfig{ + SourceLabels: source, + Regex: pat, + TargetLabel: k8s.AlertRuleLabelId, + Replacement: alertRuleId, + Action: "Replace", + }) + + for _, change := range changes { + switch change.action { + case "Replace": + config := osmv1.RelabelConfig{ + // Tight match by exact ruleId + SourceLabels: []osmv1.LabelName{k8s.AlertRuleLabelId}, + Regex: regexp.QuoteMeta(alertRuleId), + TargetLabel: change.targetLabel, + Replacement: change.value, + Action: "Replace", + } + configs = append(configs, config) + case "LabelDrop": + // Drop the specific label name, scoped by prior Keep + config := osmv1.RelabelConfig{ + Regex: change.sourceLabel, + Action: "LabelDrop", + } + configs = 
append(configs, config) + } + } + + return configs +} + +func ensureStampAndDrop(next *[]osmv1.RelabelConfig, stamp osmv1.RelabelConfig, dropCfg osmv1.RelabelConfig, alertRuleId string) bool { + stampExists := false + dropExists := false + for _, rc := range *next { + if rc.Action == "Replace" && rc.TargetLabel == k8s.AlertRuleLabelId && + rc.Regex == stamp.Regex && rc.Replacement == alertRuleId { + stampExists = true + } + if rc.Action == "Drop" && rc.Regex == dropCfg.Regex && + len(rc.SourceLabels) == 1 && rc.SourceLabels[0] == k8s.AlertRuleLabelId { + dropExists = true + } + } + changed := false + if !stampExists { + *next = append(*next, stamp) + changed = true + } + if !dropExists { + *next = append(*next, dropCfg) + changed = true + } + return changed +} + +func filterOutDrop(configs []osmv1.RelabelConfig, alertRuleId string) ([]osmv1.RelabelConfig, bool) { + target := regexp.QuoteMeta(alertRuleId) + var out []osmv1.RelabelConfig + removed := false + for _, rc := range configs { + if rc.Action == "Drop" && (rc.Regex == target || rc.Regex == alertRuleId) { + removed = true + continue + } + out = append(out, rc) + } + return out, removed +} + +func isStampOnly(configs []osmv1.RelabelConfig) bool { + if len(configs) == 0 { + return true + } + for _, rc := range configs { + if !(rc.Action == "Replace" && rc.TargetLabel == k8s.AlertRuleLabelId) { + return false + } + } + return true +} + +func (c *client) DropPlatformAlertRule(ctx context.Context, alertRuleId string) error { + relabeled, found := c.k8sClient.RelabeledRules().Get(ctx, alertRuleId) + if !found || relabeled.Labels == nil { + return &NotFoundError{Resource: "AlertRule", Id: alertRuleId} + } + + namespace := relabeled.Labels[k8s.PrometheusRuleLabelNamespace] + name := relabeled.Labels[k8s.PrometheusRuleLabelName] + + if !c.IsPlatformAlertRule(types.NamespacedName{Namespace: namespace, Name: name}) { + return &NotAllowedError{Message: "cannot drop non-platform alert rule from " + namespace + "/" + name} 
+ } + + originalRule, err := c.getOriginalPlatformRule(ctx, namespace, name, alertRuleId) + if err != nil { + return err + } + + prName := relabeled.Labels[k8s.PrometheusRuleLabelName] + arcName := k8s.GetAlertRelabelConfigName(prName, alertRuleId) + + existingArc, arcExists, err := c.k8sClient.AlertRelabelConfigs().Get(ctx, k8s.ClusterMonitoringNamespace, arcName) + if err != nil { + return fmt.Errorf("failed to get AlertRelabelConfig %s/%s: %w", k8s.ClusterMonitoringNamespace, arcName, err) + } + // If ARC is GitOps-managed, block updates via API + if err := validatePlatformUpdatePreconditions(relabeled, nil, relabelConfigIfFound(arcExists, existingArc)); err != nil { + return err + } + + original := map[string]string{} + for k, v := range originalRule.Labels { + original[k] = v + } + stampOnly := c.buildRelabelConfigs(originalRule.Alert, original, alertRuleId, nil) + var stamp osmv1.RelabelConfig + if len(stampOnly) > 0 { + stamp = stampOnly[0] + } + + dropCfg := osmv1.RelabelConfig{ + SourceLabels: []osmv1.LabelName{"openshift_io_alert_rule_id"}, + Regex: regexp.QuoteMeta(alertRuleId), + Action: "Drop", + } + + var next []osmv1.RelabelConfig + if arcExists && existingArc != nil { + next = append(next, existingArc.Spec.Configs...) 
+ } + + changed := ensureStampAndDrop(&next, stamp, dropCfg, alertRuleId) + + if !changed { + return nil + } + + if arcExists { + arc := existingArc + arc.Spec = osmv1.AlertRelabelConfigSpec{Configs: next} + if arc.Labels == nil { + arc.Labels = map[string]string{} + } + arc.Labels[managementlabels.ARCLabelPrometheusRuleNameKey] = prName + arc.Labels[managementlabels.ARCLabelAlertNameKey] = originalRule.Alert + if arc.Annotations == nil { + arc.Annotations = map[string]string{} + } + arc.Annotations[managementlabels.ARCAnnotationAlertRuleIDKey] = alertRuleId + + if err := c.k8sClient.AlertRelabelConfigs().Update(ctx, *arc); err != nil { + return fmt.Errorf("failed to update AlertRelabelConfig %s/%s: %w", arc.Namespace, arc.Name, err) + } + return nil + } + + arc := &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: arcName, + Namespace: k8s.ClusterMonitoringNamespace, + Labels: map[string]string{ + managementlabels.ARCLabelPrometheusRuleNameKey: prName, + managementlabels.ARCLabelAlertNameKey: originalRule.Alert, + }, + Annotations: map[string]string{ + managementlabels.ARCAnnotationAlertRuleIDKey: alertRuleId, + }, + }, + Spec: osmv1.AlertRelabelConfigSpec{ + Configs: next, + }, + } + if _, err := c.k8sClient.AlertRelabelConfigs().Create(ctx, *arc); err != nil { + return fmt.Errorf("failed to create AlertRelabelConfig %s/%s: %w", arc.Namespace, arc.Name, err) + } + return nil +} + +func (c *client) RestorePlatformAlertRule(ctx context.Context, alertRuleId string) error { + relabeled, found := c.k8sClient.RelabeledRules().Get(ctx, alertRuleId) + var existingArc *osmv1.AlertRelabelConfig + var arcName string + var err error + if found && relabeled.Labels != nil { + namespace := relabeled.Labels[k8s.PrometheusRuleLabelNamespace] + name := relabeled.Labels[k8s.PrometheusRuleLabelName] + if !c.IsPlatformAlertRule(types.NamespacedName{Namespace: namespace, Name: name}) { + return &NotAllowedError{Message: "cannot restore non-platform alert rule from " + 
namespace + "/" + name} + } + prName := relabeled.Labels[k8s.PrometheusRuleLabelName] + arcName = k8s.GetAlertRelabelConfigName(prName, alertRuleId) + var arcExists bool + existingArc, arcExists, err = c.k8sClient.AlertRelabelConfigs().Get(ctx, k8s.ClusterMonitoringNamespace, arcName) + if err != nil { + return fmt.Errorf("failed to get AlertRelabelConfig %s/%s: %w", k8s.ClusterMonitoringNamespace, arcName, err) + } + if !arcExists || existingArc == nil { + return nil + } + // If ARC is GitOps-managed, block updates via API + if err := validatePlatformUpdatePreconditions(relabeled, nil, existingArc); err != nil { + return err + } + } else { + arcs, lerr := c.k8sClient.AlertRelabelConfigs().List(ctx, k8s.ClusterMonitoringNamespace) + if lerr != nil { + return fmt.Errorf("failed to list AlertRelabelConfigs: %w", lerr) + } + for i := range arcs { + arc := arcs[i] + if arc.Annotations != nil && arc.Annotations[managementlabels.ARCAnnotationAlertRuleIDKey] == alertRuleId { + arcCopy := arc + existingArc = &arcCopy + arcName = arc.Name + break + } + } + if existingArc == nil { + return nil + } + } + + filtered, removed := filterOutDrop(existingArc.Spec.Configs, alertRuleId) + + if !removed { + return nil + } + + if len(filtered) == 0 { + if err := c.k8sClient.AlertRelabelConfigs().Delete(ctx, k8s.ClusterMonitoringNamespace, arcName); err != nil { + return fmt.Errorf("failed to delete AlertRelabelConfig %s/%s: %w", k8s.ClusterMonitoringNamespace, arcName, err) + } + return nil + } + + // If only the stamp Replace remains, delete the ARC + if isStampOnly(filtered) { + if err := c.k8sClient.AlertRelabelConfigs().Delete(ctx, k8s.ClusterMonitoringNamespace, arcName); err != nil { + return fmt.Errorf("failed to delete AlertRelabelConfig %s/%s: %w", k8s.ClusterMonitoringNamespace, arcName, err) + } + return nil + } + + arc := existingArc + arc.Spec = osmv1.AlertRelabelConfigSpec{Configs: filtered} + if arc.Annotations == nil { + arc.Annotations = map[string]string{} + } + 
arc.Annotations[managementlabels.ARCAnnotationAlertRuleIDKey] = alertRuleId + + if err := c.k8sClient.AlertRelabelConfigs().Update(ctx, *arc); err != nil { + return fmt.Errorf("failed to update AlertRelabelConfig %s/%s: %w", arc.Namespace, arc.Name, err) + } + return nil +} diff --git a/pkg/management/update_platform_alert_rule_test.go b/pkg/management/update_platform_alert_rule_test.go new file mode 100644 index 000000000..998aad069 --- /dev/null +++ b/pkg/management/update_platform_alert_rule_test.go @@ -0,0 +1,1086 @@ +package management_test + +import ( + "context" + "errors" + "strings" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +var _ = Describe("UpdatePlatformAlertRule", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + ) + + var ( + // Original platform rule as stored in PrometheusRule (without k8s labels) + originalPlatformRule = monitoringv1.Rule{ + Alert: "PlatformAlert", + Expr: intstr.FromString("node_down == 1"), + Labels: map[string]string{ + "severity": "critical", + }, + } + originalPlatformRuleId = alertrule.GetAlertingRuleId(&originalPlatformRule) + + // Platform rule as seen by RelabeledRules (with k8s labels added) + platformRule = monitoringv1.Rule{ + Alert: "PlatformAlert", + Expr: intstr.FromString("node_down == 1"), + Labels: map[string]string{ + "severity": "critical", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + 
k8s.PrometheusRuleLabelName: "platform-rule", + k8s.AlertRuleLabelId: originalPlatformRuleId, + }, + } + platformRuleId = alertrule.GetAlertingRuleId(&platformRule) + + userRule = monitoringv1.Rule{ + Alert: "UserAlert", + Labels: map[string]string{ + k8s.PrometheusRuleLabelNamespace: "user-namespace", + k8s.PrometheusRuleLabelName: "user-rule", + }, + } + userRuleId = alertrule.GetAlertingRuleId(&userRule) + ) + + BeforeEach(func() { + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "openshift-monitoring" + }, + } + } + }) + + Context("Operator-managed platform rule with GitOps PR metadata and no ARC", func() { + BeforeEach(func() { + // Relabeled rule marked as operator-managed at rule level + opRule := platformRule + opRule.Labels = make(map[string]string) + for k, v := range platformRule.Labels { + opRule.Labels[k] = v + } + opRule.Labels[managementlabels.RuleManagedByLabel] = managementlabels.ManagedByOperator + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == platformRuleId { + return opRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + // Original PR exists and is GitOps-managed via metadata + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + Annotations: map[string]string{"argocd.argoproj.io/tracking-id": "gitops-track"}, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + 
Groups: []monitoringv1.RuleGroup{ + { + Name: "grp", + Rules: []monitoringv1.Rule{originalPlatformRule}, + }, + }, + }, + }, true, nil + }, + } + } + // No ARC yet + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return nil, false, nil + }, + } + } + }) + + It("blocks platform update due to GitOps PR metadata when managed_by=operator", func() { + updatedRule := originalPlatformRule + err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("managed by GitOps")) + }) + }) + Context("blocks update when ARC is GitOps-managed", func() { + BeforeEach(func() { + // Relabeled rule in platform namespace + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == platformRuleId { + // Mark as operator-managed at rule level; ARC GitOps must still block + opRule := platformRule + if opRule.Labels == nil { + opRule.Labels = map[string]string{} + } + opRule.Labels[managementlabels.RuleManagedByLabel] = managementlabels.ManagedByOperator + return opRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + // Original PR exists and contains the platform rule + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "grp", + Rules: []monitoringv1.Rule{originalPlatformRule}, + }, + 
}, + }, + }, true, nil + }, + } + } + // ARC exists and is GitOps-managed via metadata + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Annotations: map[string]string{"argocd.argoproj.io/tracking-id": "abc"}, + }, + }, true, nil + }, + } + } + }) + + It("blocks platform update when ARC is GitOps-managed", func() { + updatedRule := originalPlatformRule + err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("managed by GitOps")) + }) + }) + + Context("GitOps-managed at rule level (no ARC yet)", func() { + BeforeEach(func() { + // Relabeled rule marked as GitOps-managed at rule level + gitopsRule := platformRule + gitopsRule.Labels = make(map[string]string) + for k, v := range platformRule.Labels { + gitopsRule.Labels[k] = v + } + gitopsRule.Labels[managementlabels.RuleManagedByLabel] = managementlabels.ManagedByGitOps + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == platformRuleId { + return gitopsRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + // Original PR exists with the rule + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: 
"grp",
+ Rules: []monitoringv1.Rule{originalPlatformRule},
+ },
+ },
+ },
+ }, true, nil
+ },
+ }
+ }
+ // No ARC yet
+ mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface {
+ return &testutils.MockAlertRelabelConfigInterface{
+ GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) {
+ return nil, false, nil
+ },
+ }
+ }
+ })
+
+ It("blocks platform update early when rule managed_by=gitops and ARC missing", func() {
+ updatedRule := originalPlatformRule
+ err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule)
+ Expect(err).To(HaveOccurred())
+ Expect(err.Error()).To(ContainSubstring("managed by GitOps"))
+ })
+ })
+ Context("when rule is not found", func() {
+ BeforeEach(func() {
+ mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface {
+ return &testutils.MockRelabeledRulesInterface{
+ GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) {
+ return monitoringv1.Rule{}, false
+ },
+ }
+ }
+ })
+
+ It("returns NotFoundError", func() {
+ updatedRule := platformRule
+ err := client.UpdatePlatformAlertRule(ctx, "nonexistent-id", updatedRule)
+ Expect(err).To(HaveOccurred())
+
+ var notFoundErr *management.NotFoundError
+ Expect(errors.As(err, &notFoundErr)).To(BeTrue())
+ Expect(notFoundErr.Resource).To(Equal("AlertRule"))
+ })
+ })
+
+ Context("when trying to update a non-platform rule", func() {
+ BeforeEach(func() {
+ mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface {
+ return &testutils.MockRelabeledRulesInterface{
+ GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) {
+ if id == userRuleId {
+ return userRule, true
+ }
+ return monitoringv1.Rule{}, false
+ },
+ }
+ }
+ })
+
+ It("returns an error", func() {
+ updatedRule := userRule
+ err := client.UpdatePlatformAlertRule(ctx, userRuleId, updatedRule)
+ Expect(err).To(HaveOccurred())
+ Expect(err.Error()).To(ContainSubstring("cannot update non-platform alert 
rule"))
+ })
+ })
+
+ Context("when PrometheusRule is not found", func() {
+ BeforeEach(func() {
+ mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface {
+ return &testutils.MockRelabeledRulesInterface{
+ GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) {
+ if id == platformRuleId {
+ return platformRule, true
+ }
+ return monitoringv1.Rule{}, false
+ },
+ }
+ }
+
+ mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface {
+ return &testutils.MockPrometheusRuleInterface{
+ GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) {
+ return nil, false, nil
+ },
+ }
+ }
+ })
+
+ It("returns NotFoundError", func() {
+ updatedRule := platformRule
+ err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule)
+ Expect(err).To(HaveOccurred())
+
+ var notFoundErr *management.NotFoundError
+ Expect(errors.As(err, &notFoundErr)).To(BeTrue())
+ Expect(notFoundErr.Resource).To(Equal("PrometheusRule"))
+ })
+ })
+
+ Context("when PrometheusRule Get returns an error", func() {
+ BeforeEach(func() {
+ mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface {
+ return &testutils.MockRelabeledRulesInterface{
+ GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) {
+ if id == platformRuleId {
+ return platformRule, true
+ }
+ return monitoringv1.Rule{}, false
+ },
+ }
+ }
+
+ mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface {
+ return &testutils.MockPrometheusRuleInterface{
+ GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) {
+ return nil, false, errors.New("failed to get PrometheusRule")
+ },
+ }
+ }
+ })
+
+ It("returns the error", func() {
+ updatedRule := platformRule
+ err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule)
+ Expect(err).To(HaveOccurred())
+ Expect(err.Error()).To(ContainSubstring("failed to get PrometheusRule"))
+ })
+ })
+
+ Context("when 
no label changes are detected", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == platformRuleId { + return platformRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{originalPlatformRule}, + }, + }, + }, + }, true, nil + }, + } + } + }) + + It("deletes existing ARC when reverting to original", func() { + // Simulate an existing ARC present + deleted := false + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: osmv1.AlertRelabelConfigSpec{Configs: []osmv1.RelabelConfig{}}, + }, true, nil + }, + DeleteFunc: func(ctx context.Context, namespace string, name string) error { + deleted = true + return nil + }, + } + } + + updatedRule := originalPlatformRule // revert to original + err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + Expect(deleted).To(BeTrue()) + }) + }) + + Context("when updating platform rule labels", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return 
&testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == platformRuleId { + return platformRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{originalPlatformRule}, + }, + }, + }, + }, true, nil + }, + } + } + }) + + Context("when creating new AlertRelabelConfig", func() { + BeforeEach(func() { + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return nil, false, nil + }, + CreateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) { + return &arc, nil + }, + } + } + }) + + It("creates AlertRelabelConfig for label changes", func() { + var createdARC *osmv1.AlertRelabelConfig + + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return nil, false, nil + }, + CreateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) { + createdARC = &arc + return &arc, nil + }, + } + } + + updatedRule := originalPlatformRule + updatedRule.Labels = map[string]string{ + "severity": "warning", + "new_label": "new_value", + } + + err := client.UpdatePlatformAlertRule(ctx, 
platformRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + Expect(createdARC).NotTo(BeNil()) + Expect(createdARC.Namespace).To(Equal("openshift-monitoring")) + Expect(strings.HasPrefix(createdARC.Name, "arc-")).To(BeTrue()) + Expect(createdARC.Spec.Configs).NotTo(BeEmpty()) + }) + + It("scopes id stamp by alertname + all original static labels (excluding namespace)", func() { + var createdARC *osmv1.AlertRelabelConfig + + // Override PR getter to return a rule with extra stable labels + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + orig := originalPlatformRule + orig.Labels = map[string]string{ + "severity": "critical", + "component": "kube", + "team": "sre", + } + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{orig}, + }, + }, + }, + }, true, nil + }, + } + } + + // Compute the id for the PR's original rule (with extra stable labels) + origWithExtras := originalPlatformRule + origWithExtras.Labels = map[string]string{ + "severity": "critical", + "component": "kube", + "team": "sre", + } + idForExtras := alertrule.GetAlertingRuleId(&origWithExtras) + + // RelabeledRules should resolve using the same id + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == idForExtras { + return monitoringv1.Rule{ + Alert: "PlatformAlert", + Expr: intstr.FromString("node_down == 1"), + Labels: map[string]string{ + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "platform-rule", + k8s.AlertRuleLabelId: 
idForExtras, + "severity": "critical", + }, + }, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return nil, false, nil + }, + CreateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) { + createdARC = &arc + return &arc, nil + }, + } + } + + updatedRule := originalPlatformRule + updatedRule.Labels = map[string]string{ + "severity": "info", + } + + err := client.UpdatePlatformAlertRule(ctx, idForExtras, updatedRule) + Expect(err).NotTo(HaveOccurred()) + Expect(createdARC).NotTo(BeNil()) + // Expect two entries: id-stamp Replace, then severity Replace + Expect(createdARC.Spec.Configs).To(HaveLen(2)) + + idCfg := createdARC.Spec.Configs[0] + Expect(string(idCfg.Action)).To(Equal("Replace")) + Expect(string(idCfg.TargetLabel)).To(Equal("openshift_io_alert_rule_id")) + // SourceLabels must include alertname and all original static labels + var sl []string + for _, s := range idCfg.SourceLabels { + sl = append(sl, string(s)) + } + Expect(sl).To(ContainElements("alertname", "component", "severity", "team")) + Expect(sl).NotTo(ContainElement("namespace")) + // Regex must be anchored and include alertname; then values for component,severity,team in sorted key order + Expect(strings.HasPrefix(idCfg.Regex, "^")).To(BeTrue()) + Expect(strings.HasSuffix(idCfg.Regex, "$")).To(BeTrue()) + // sorted(keys: component, severity, team) => values after alertname: kube;critical;sre + Expect(idCfg.Regex).To(ContainSubstring("^PlatformAlert;kube;critical;sre$")) + }) + + It("emits id setter then a single Replace for simple severity change", func() { + var createdARC *osmv1.AlertRelabelConfig + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return 
&testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return nil, false, nil + }, + CreateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) { + createdARC = &arc + return &arc, nil + }, + } + } + + updatedRule := originalPlatformRule + updatedRule.Labels = map[string]string{ + "severity": "info", + } + + err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + Expect(createdARC).NotTo(BeNil()) + // Expect two entries: id setter Replace, then severity Replace + Expect(createdARC.Spec.Configs).To(HaveLen(2)) + cfg0 := createdARC.Spec.Configs[0] + Expect(string(cfg0.Action)).To(Equal("Replace")) + Expect(string(cfg0.TargetLabel)).To(Equal("openshift_io_alert_rule_id")) + Expect(cfg0.Replacement).To(Equal(platformRuleId)) + cfg1 := createdARC.Spec.Configs[1] + Expect(string(cfg1.Action)).To(Equal("Replace")) + Expect(string(cfg1.TargetLabel)).To(Equal("severity")) + Expect(cfg1.Replacement).To(Equal("info")) + }) + }) + + Context("when updating existing AlertRelabelConfig", func() { + BeforeEach(func() { + expectedArcName := k8s.GetAlertRelabelConfigName("platform-rule", platformRuleId) + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + existingARC := &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: expectedArcName, + Namespace: "openshift-monitoring", + }, + Spec: osmv1.AlertRelabelConfigSpec{ + Configs: []osmv1.RelabelConfig{ + { + TargetLabel: "testing2", + Replacement: "newlabel2", + Action: "Replace", + }, + }, + }, + } + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + if name == expectedArcName { + return existingARC, true, nil + } + return nil, false, nil + }, + UpdateFunc: func(ctx context.Context, arc 
osmv1.AlertRelabelConfig) error { + return nil + }, + } + } + }) + + It("updates existing AlertRelabelConfig", func() { + var updatedARC *osmv1.AlertRelabelConfig + expectedArcName := k8s.GetAlertRelabelConfigName("platform-rule", platformRuleId) + + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + existingARC := &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: expectedArcName, + Namespace: "openshift-monitoring", + }, + Spec: osmv1.AlertRelabelConfigSpec{ + Configs: []osmv1.RelabelConfig{ + { + TargetLabel: "testing2", + Replacement: "newlabel2", + Action: "Replace", + }, + }, + }, + } + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + if name == expectedArcName { + return existingARC, true, nil + } + return nil, false, nil + }, + UpdateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) error { + updatedARC = &arc + return nil + }, + } + } + + updatedRule := originalPlatformRule + updatedRule.Labels = map[string]string{ + "severity": "info", + } + + err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + Expect(updatedARC).NotTo(BeNil()) + Expect(updatedARC.Spec.Configs).NotTo(BeEmpty()) + }) + + It("removes override-only label (explicit delete) and deletes ARC when no other overrides remain", func() { + var updatedARC *osmv1.AlertRelabelConfig + deleted := false + expectedArcName := k8s.GetAlertRelabelConfigName("platform-rule", platformRuleId) + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + existingARC := &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: expectedArcName, + Namespace: "openshift-monitoring", + }, + Spec: osmv1.AlertRelabelConfigSpec{ + Configs: []osmv1.RelabelConfig{ + { + TargetLabel: "testing2", + Replacement: "newlabel2", + Action: "Replace", + }, + }, + }, + } + return 
&testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + if name == expectedArcName { + return existingARC, true, nil + } + return nil, false, nil + }, + UpdateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) error { + updatedARC = &arc + return nil + }, + DeleteFunc: func(ctx context.Context, namespace string, name string) error { + deleted = true + return nil + }, + } + } + + // Explicitly drop testing2; keep severity unchanged (no override) + updatedRule := originalPlatformRule + updatedRule.Labels = map[string]string{ + "severity": "critical", + "testing2": "", + } + + err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + // No more overrides remain (severity unchanged), ARC should be deleted + Expect(updatedARC).To(BeNil()) + Expect(deleted).To(BeTrue()) + }) + }) + + Context("when dropping labels", func() { + It("rejects dropping severity label", func() { + updatedRule := originalPlatformRule + // Attempt to drop severity explicitly (K8s-style) + updatedRule.Labels = map[string]string{"severity": ""} + + err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("label \"severity\" cannot be dropped")) + }) + }) + + Context("when attempting to modify protected labels", func() { + It("ignores provenance/identity labels merged from relabeled state", func() { + updatedRule := originalPlatformRule + updatedRule.Labels = map[string]string{ + "severity": "critical", + "openshift_io_alert_rule_id": "fake", + } + err := client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + }) + + It("rejects changing alertname via labels", func() { + updatedRule := originalPlatformRule + updatedRule.Labels = map[string]string{ + "alertname": "NewName", + } + err := 
client.UpdatePlatformAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("immutable")) + }) + }) + }) +}) + +var _ = Describe("Drop/Restore Platform Alert Rule", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + ) + + var ( + drOriginalPlatformRule = monitoringv1.Rule{ + Alert: "PlatformAlertDrop", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + "team": "sre", + }, + } + drOriginalPlatformRuleId = alertrule.GetAlertingRuleId(&drOriginalPlatformRule) + + // Platform rule as seen by RelabeledRules (with k8s labels added) + drPlatformRule = monitoringv1.Rule{ + Alert: "PlatformAlertDrop", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + "team": "sre", + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "platform-rule-drop", + k8s.AlertRuleLabelId: drOriginalPlatformRuleId, + }, + } + drPlatformRuleId = alertrule.GetAlertingRuleId(&drPlatformRule) + ) + + BeforeEach(func() { + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "openshift-monitoring" + }, + } + } + + // Relabeled rule lookup by id + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == drPlatformRuleId { + return drPlatformRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + + // Original PR with the original rule + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, 
namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "grp", + Rules: []monitoringv1.Rule{drOriginalPlatformRule}, + }, + }, + }, + }, true, nil + }, + } + } + }) + + It("creates ARC with id-stamp Replace and scoped Drop, preserving existing entries", func() { + var createdOrUpdated *osmv1.AlertRelabelConfig + + existingARC := &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "arc-platform-rule-drop-xxxx", + Namespace: "openshift-monitoring", + }, + Spec: osmv1.AlertRelabelConfigSpec{ + Configs: []osmv1.RelabelConfig{ + { + TargetLabel: "component", + Replacement: "kube-apiserver", + Action: "Replace", + }, + }, + }, + } + + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + if namespace == "openshift-monitoring" && strings.HasPrefix(name, "arc-") { + return existingARC, true, nil + } + return nil, false, nil + }, + UpdateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) error { + createdOrUpdated = &arc + return nil + }, + CreateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) { + createdOrUpdated = &arc + return &arc, nil + }, + } + } + + err := client.DropPlatformAlertRule(ctx, drPlatformRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(createdOrUpdated).NotTo(BeNil()) + Expect(createdOrUpdated.Namespace).To(Equal("openshift-monitoring")) + Expect(strings.HasPrefix(createdOrUpdated.Name, "arc-")).To(BeTrue()) + + var hasPriorReplace, hasIdStamp, hasDrop bool + for _, rc := range createdOrUpdated.Spec.Configs { + switch string(rc.Action) { + case "Replace": + if 
string(rc.TargetLabel) == "component" && rc.Replacement == "kube-apiserver" { + hasPriorReplace = true + } + if string(rc.TargetLabel) == "openshift_io_alert_rule_id" && rc.Replacement == drPlatformRuleId { + hasIdStamp = true + } + case "Drop": + if len(rc.SourceLabels) == 1 && + string(rc.SourceLabels[0]) == "openshift_io_alert_rule_id" && + rc.Regex == drPlatformRuleId { + hasDrop = true + } + } + } + Expect(hasPriorReplace).To(BeTrue()) + Expect(hasIdStamp).To(BeTrue()) + Expect(hasDrop).To(BeTrue()) + }) + + It("is idempotent when dropping twice", func() { + var last *osmv1.AlertRelabelConfig + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + var stored *osmv1.AlertRelabelConfig + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + if stored == nil { + return nil, false, nil + } + return stored, true, nil + }, + CreateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) (*osmv1.AlertRelabelConfig, error) { + stored = &arc + last = &arc + return &arc, nil + }, + UpdateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) error { + last = &arc + stored = &arc + return nil + }, + } + } + + err := client.DropPlatformAlertRule(ctx, drPlatformRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(last).NotTo(BeNil()) + cfgCount := len(last.Spec.Configs) + + // Drop again; expect same number of configs + err = client.DropPlatformAlertRule(ctx, drPlatformRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(last.Spec.Configs).To(HaveLen(cfgCount)) + }) + + It("restores by removing only the Drop entry, preserving others; deletes ARC when becomes empty", func() { + deleted := false + var updated *osmv1.AlertRelabelConfig + + // Case A: existing ARC has only Drop -> restore should delete ARC + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + onlyDrop := &osmv1.AlertRelabelConfig{ + 
ObjectMeta: metav1.ObjectMeta{ + Name: "arc-to-delete", + Namespace: "openshift-monitoring", + }, + Spec: osmv1.AlertRelabelConfigSpec{ + Configs: []osmv1.RelabelConfig{ + { + SourceLabels: []osmv1.LabelName{"openshift_io_alert_rule_id"}, + Regex: drPlatformRuleId, + Action: "Drop", + }, + }, + }, + } + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return onlyDrop, true, nil + }, + DeleteFunc: func(ctx context.Context, namespace string, name string) error { + deleted = true + return nil + }, + UpdateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) error { + updated = &arc + return nil + }, + } + } + + err := client.RestorePlatformAlertRule(ctx, drPlatformRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(deleted).To(BeTrue()) + Expect(updated).To(BeNil()) + + // Case B: existing ARC has other Replace; restore should keep it and only remove Drop + deleted = false + mockK8s.AlertRelabelConfigsFunc = func() k8s.AlertRelabelConfigInterface { + withOthers := &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "arc-keep", + Namespace: "openshift-monitoring", + }, + Spec: osmv1.AlertRelabelConfigSpec{ + Configs: []osmv1.RelabelConfig{ + { + TargetLabel: "component", + Replacement: "kube-apiserver", + Action: "Replace", + }, + { + SourceLabels: []osmv1.LabelName{"openshift_io_alert_rule_id"}, + Regex: drPlatformRuleId, + Action: "Drop", + }, + }, + }, + } + return &testutils.MockAlertRelabelConfigInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*osmv1.AlertRelabelConfig, bool, error) { + return withOthers, true, nil + }, + DeleteFunc: func(ctx context.Context, namespace string, name string) error { + deleted = true + return nil + }, + UpdateFunc: func(ctx context.Context, arc osmv1.AlertRelabelConfig) error { + updated = &arc + return nil + }, + } + } + + err = 
client.RestorePlatformAlertRule(ctx, drPlatformRuleId) + Expect(err).NotTo(HaveOccurred()) + Expect(deleted).To(BeFalse()) + Expect(updated).NotTo(BeNil()) + // Ensure Drop removed, other Replace preserved + var hasDrop, hasReplace bool + for _, rc := range updated.Spec.Configs { + if string(rc.Action) == "Drop" { + hasDrop = true + } + if string(rc.Action) == "Replace" && string(rc.TargetLabel) == "component" && rc.Replacement == "kube-apiserver" { + hasReplace = true + } + } + Expect(hasDrop).To(BeFalse()) + Expect(hasReplace).To(BeTrue()) + }) +}) diff --git a/pkg/management/update_user_defined_alert_rule.go b/pkg/management/update_user_defined_alert_rule.go new file mode 100644 index 000000000..13c310a17 --- /dev/null +++ b/pkg/management/update_user_defined_alert_rule.go @@ -0,0 +1,192 @@ +package management + +import ( + "context" + "encoding/json" + "fmt" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" +) + +func (c *client) UpdateUserDefinedAlertRule(ctx context.Context, alertRuleId string, alertRule monitoringv1.Rule) (string, error) { + rule, found := c.k8sClient.RelabeledRules().Get(ctx, alertRuleId) + if !found { + return "", &NotFoundError{Resource: "AlertRule", Id: alertRuleId} + } + + namespace := rule.Labels[k8s.PrometheusRuleLabelNamespace] + name := rule.Labels[k8s.PrometheusRuleLabelName] + + // Common preconditions on relabeled rule (labels-based) + if err := validateUserUpdatePreconditions(rule, nil); err != nil { + return "", err + } + + if c.IsPlatformAlertRule(types.NamespacedName{Namespace: namespace, Name: name}) { + return "", &NotAllowedError{Message: "cannot update alert rule in a platform-managed PrometheusRule"} + } + + pr, 
found, err := c.k8sClient.PrometheusRules().Get(ctx, namespace, name) + if err != nil { + return "", err + } + + if !found { + return "", &NotFoundError{ + Resource: "PrometheusRule", + Id: alertRuleId, + AdditionalInfo: fmt.Sprintf("PrometheusRule %s/%s not found", namespace, name), + } + } + + // After fetching the PR, block edits for operator-managed PrometheusRules (they will be reconciled) + if err := validateUserUpdatePreconditions(rule, pr); err != nil { + return "", err + } + + // Locate the target rule once and update it after validation + var foundGroupIdx, foundRuleIdx int + ruleFound := false + for groupIdx := range pr.Spec.Groups { + for ruleIdx := range pr.Spec.Groups[groupIdx].Rules { + rule := &pr.Spec.Groups[groupIdx].Rules[ruleIdx] + if ruleMatchesAlertRuleID(*rule, alertRuleId) { + foundGroupIdx = groupIdx + foundRuleIdx = ruleIdx + ruleFound = true + break + } + } + if ruleFound { + break + } + } + + if !ruleFound { + return "", &NotFoundError{ + Resource: "AlertRule", + Id: alertRuleId, + AdditionalInfo: fmt.Sprintf("in PrometheusRule %s/%s", namespace, name), + } + } + + // Validate severity if present + if sev, ok := alertRule.Labels["severity"]; ok && sev != "" { + if !isValidSeverity(sev) { + return "", &ValidationError{Message: fmt.Sprintf("invalid severity %q: must be one of critical|warning|info|none", sev)} + } + } + + computedId := alertrule.GetAlertingRuleId(&alertRule) + + // Treat "true clones" (spec-identical rules that compute to the same id) as unsupported. + // If the updated rule would collide with some other existing rule, reject the update. + if computedId != "" && computedId != alertRuleId { + // Check within the same PrometheusRule first (authoritative). 
+ for groupIdx := range pr.Spec.Groups { + for ruleIdx := range pr.Spec.Groups[groupIdx].Rules { + if groupIdx == foundGroupIdx && ruleIdx == foundRuleIdx { + continue + } + existing := pr.Spec.Groups[groupIdx].Rules[ruleIdx] + // Treat "true clones" as unsupported: identical definitions compute to the same id. + if existing.Alert != "" && alertrule.GetAlertingRuleId(&existing) == computedId { + return "", &ConflictError{Message: "alert rule with exact config already exists"} + } + } + } + + _, found := c.k8sClient.RelabeledRules().Get(ctx, computedId) + if found { + return "", &ConflictError{Message: "alert rule with exact config already exists"} + } + } + + if alertRule.Labels == nil { + alertRule.Labels = map[string]string{} + } + alertRule.Labels[k8s.AlertRuleLabelId] = computedId + + // Perform the update in-place exactly once + pr.Spec.Groups[foundGroupIdx].Rules[foundRuleIdx] = alertRule + + err = c.k8sClient.PrometheusRules().Update(ctx, *pr) + if err != nil { + return "", fmt.Errorf("failed to update PrometheusRule %s/%s: %w", pr.Namespace, pr.Name, err) + } + + if err := c.migrateClassificationOverrideIfRuleIDChanged(ctx, namespace, name, alertRuleId, computedId, alertRule.Alert); err != nil { + return "", err + } + + return computedId, nil +} + +func (c *client) migrateClassificationOverrideIfRuleIDChanged( + ctx context.Context, + ruleNamespace string, + prometheusRuleName string, + oldRuleId string, + newRuleId string, + alertName string, +) error { + if oldRuleId == "" || newRuleId == "" || oldRuleId == newRuleId { + return nil + } + + overrideNamespace := c.overrideNamespace + cmName := OverrideConfigMapName(ruleNamespace) + oldKey := classificationOverrideKey(oldRuleId) + newKey := classificationOverrideKey(newRuleId) + + for i := 0; i < 3; i++ { + cm, exists, err := c.k8sClient.ConfigMaps().Get(ctx, overrideNamespace, cmName) + if err != nil { + return err + } + if !exists || cm == nil || cm.Data == nil { + return nil + } + + raw, ok := 
cm.Data[oldKey] + if !ok || raw == "" { + return nil + } + + if _, already := cm.Data[newKey]; !already { + var entry alertRuleClassificationOverridePayload + if err := json.Unmarshal([]byte(raw), &entry); err == nil { + entry.AlertName = alertName + entry.RuleName = prometheusRuleName + entry.RuleNamespace = ruleNamespace + if encoded, err := json.Marshal(entry); err == nil { + raw = string(encoded) + } + } + cm.Data[newKey] = raw + } + delete(cm.Data, oldKey) + + if cm.Labels == nil { + cm.Labels = map[string]string{} + } + cm.Labels[managementlabels.AlertClassificationOverridesTypeLabelKey] = managementlabels.AlertClassificationOverridesTypeLabelValue + cm.Labels[managementlabels.AlertClassificationOverridesManagedByLabelKey] = managementlabels.AlertClassificationOverridesManagedByLabelValue + cm.Labels[k8s.PrometheusRuleLabelNamespace] = ruleNamespace + + if err := c.k8sClient.ConfigMaps().Update(ctx, *cm); err != nil { + if apierrors.IsConflict(err) { + continue + } + return err + } + return nil + } + + return fmt.Errorf("failed to migrate classification override after retries") +} diff --git a/pkg/management/update_user_defined_alert_rule_test.go b/pkg/management/update_user_defined_alert_rule_test.go new file mode 100644 index 000000000..72ebf6fe1 --- /dev/null +++ b/pkg/management/update_user_defined_alert_rule_test.go @@ -0,0 +1,617 @@ +package management_test + +import ( + "context" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "os" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + + alertrule "github.com/openshift/monitoring-plugin/pkg/alert_rule" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/management/testutils" + "github.com/openshift/monitoring-plugin/pkg/managementlabels" +) + +var _ = Describe("UpdateUserDefinedAlertRule", func() { + var ( + ctx context.Context + mockK8s *testutils.MockClient + client management.Client + ) + + var ( + // Original user rule as stored in PrometheusRule (without k8s labels) + originalUserRule = monitoringv1.Rule{ + Alert: "UserAlert", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + }, + } + originalUserRuleId = alertrule.GetAlertingRuleId(&originalUserRule) + + // User rule as seen by RelabeledRules (with k8s labels added) + userRule = monitoringv1.Rule{ + Alert: "UserAlert", + Expr: intstr.FromString("up == 0"), + Labels: map[string]string{ + "severity": "warning", + k8s.PrometheusRuleLabelNamespace: "user-namespace", + k8s.PrometheusRuleLabelName: "user-rule", + }, + } + userRuleId = originalUserRuleId + + platformRule = monitoringv1.Rule{ + Alert: "PlatformAlert", + Labels: map[string]string{ + k8s.PrometheusRuleLabelNamespace: "openshift-monitoring", + k8s.PrometheusRuleLabelName: "platform-rule", + }, + } + platformRuleId = alertrule.GetAlertingRuleId(&platformRule) + ) + + BeforeEach(func() { + ctx = context.Background() + mockK8s = &testutils.MockClient{} + client = management.New(ctx, mockK8s) + + mockK8s.NamespaceFunc = func() k8s.NamespaceInterface { + return &testutils.MockNamespaceInterface{ + IsClusterMonitoringNamespaceFunc: func(name string) bool { + return name == "openshift-monitoring" + }, + } + } + }) + 
+ Context("managed-by enforcement", func() { + It("blocks update when rule is GitOps-managed", func() { + gitopsRule := userRule + // Deep copy labels to avoid mutating shared map across tests + gitopsRule.Labels = make(map[string]string) + for k, v := range userRule.Labels { + gitopsRule.Labels[k] = v + } + gitopsRule.Labels[managementlabels.RuleManagedByLabel] = managementlabels.ManagedByGitOps + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRuleId { + return gitopsRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + updated := userRule + _, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updated) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("managed by GitOps")) + }) + + It("blocks update when rule is operator-managed", func() { + opRule := userRule + // Deep copy labels to avoid mutating shared map across tests + opRule.Labels = make(map[string]string) + for k, v := range userRule.Labels { + opRule.Labels[k] = v + } + opRule.Labels[managementlabels.RuleManagedByLabel] = managementlabels.ManagedByOperator + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRuleId { + return opRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + updated := userRule + _, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updated) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("managed by an operator")) + }) + }) + Context("when rule is not found", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + 
return monitoringv1.Rule{}, false + }, + } + } + }) + + It("returns NotFoundError", func() { + updatedRule := userRule + _, err := client.UpdateUserDefinedAlertRule(ctx, "nonexistent-id", updatedRule) + Expect(err).To(HaveOccurred()) + + var notFoundErr *management.NotFoundError + Expect(errors.As(err, ¬FoundErr)).To(BeTrue()) + Expect(notFoundErr.Resource).To(Equal("AlertRule")) + }) + }) + + Context("when trying to update a platform rule", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == platformRuleId { + return platformRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + }) + + It("returns an error", func() { + updatedRule := platformRule + _, err := client.UpdateUserDefinedAlertRule(ctx, platformRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("cannot update alert rule in a platform-managed PrometheusRule")) + }) + }) + + Context("when PrometheusRule is not found", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRuleId { + return userRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return nil, false, nil + }, + } + } + }) + + It("returns NotFoundError", func() { + updatedRule := userRule + _, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + + var notFoundErr *management.NotFoundError + Expect(errors.As(err, 
¬FoundErr)).To(BeTrue()) + Expect(notFoundErr.Resource).To(Equal("PrometheusRule")) + }) + }) + + Context("when PrometheusRule Get returns an error", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRuleId { + return userRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return nil, false, errors.New("failed to get PrometheusRule") + }, + } + } + }) + + It("returns the error", func() { + updatedRule := userRule + _, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to get PrometheusRule")) + }) + }) + + Context("when rule is not found in PrometheusRule", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRuleId { + return userRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + // Return PrometheusRule but without the rule we're looking for + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{}, + }, + }, + }, + }, 
true, nil + }, + } + } + }) + + It("returns an error", func() { + updatedRule := userRule + _, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(fmt.Sprintf("AlertRule with id %s not found", userRuleId))) + }) + }) + + Context("when PrometheusRule Update fails", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRuleId { + return userRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{originalUserRule}, + }, + }, + }, + }, true, nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + return errors.New("failed to update PrometheusRule") + }, + } + } + }) + + It("returns the error", func() { + updatedRule := originalUserRule + _, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to update PrometheusRule")) + }) + }) + + Context("when successfully updating a rule", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRuleId { + return userRule, true + } + return monitoringv1.Rule{}, 
false + }, + } + } + }) + + It("updates the rule in the PrometheusRule", func() { + var updatedPR *monitoringv1.PrometheusRule + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{originalUserRule}, + }, + }, + }, + }, true, nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + updatedPR = &pr + return nil + }, + } + } + + updatedRule := originalUserRule + // Create a deep copy of the Labels map to avoid modifying the original + updatedRule.Labels = make(map[string]string) + for k, v := range originalUserRule.Labels { + updatedRule.Labels[k] = v + } + updatedRule.Labels["severity"] = "critical" + updatedRule.Expr = intstr.FromString("up == 1") + + expectedNewRuleId := alertrule.GetAlertingRuleId(&updatedRule) + newRuleId, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + Expect(newRuleId).To(Equal(expectedNewRuleId)) + Expect(updatedPR).NotTo(BeNil()) + Expect(updatedPR.Spec.Groups[0].Rules[0].Labels["severity"]).To(Equal("critical")) + Expect(updatedPR.Spec.Groups[0].Rules[0].Expr.String()).To(Equal("up == 1")) + }) + + It("migrates classification override when rule id changes", func() { + Expect(os.Setenv("MONITORING_PLUGIN_NAMESPACE", "plugin-ns")).To(Succeed()) + DeferCleanup(func() { + _ = os.Unsetenv("MONITORING_PLUGIN_NAMESPACE") + }) + client = management.New(ctx, mockK8s) + + updatedRule := originalUserRule + updatedRule.Labels = make(map[string]string) + for k, v := range originalUserRule.Labels { + updatedRule.Labels[k] = v + } + 
updatedRule.Labels["severity"] = "critical" + updatedRule.Expr = intstr.FromString("up == 1") + + expectedNewRuleId := alertrule.GetAlertingRuleId(&updatedRule) + + cmName := management.OverrideConfigMapName("user-namespace") + oldKey := base64.RawURLEncoding.EncodeToString([]byte(userRuleId)) + overrideJSON, err := json.Marshal(map[string]any{ + "classification": map[string]any{ + "openshift_io_alert_rule_component": "api", + "openshift_io_alert_rule_layer": "cluster", + }, + }) + Expect(err).NotTo(HaveOccurred()) + + mockCM := &testutils.MockConfigMapInterface{ + ConfigMaps: map[string]*corev1.ConfigMap{ + "plugin-ns/" + cmName: { + ObjectMeta: metav1.ObjectMeta{ + Namespace: "plugin-ns", + Name: cmName, + Labels: map[string]string{ + managementlabels.AlertClassificationOverridesTypeLabelKey: managementlabels.AlertClassificationOverridesTypeLabelValue, + managementlabels.AlertClassificationOverridesManagedByLabelKey: managementlabels.AlertClassificationOverridesManagedByLabelValue, + k8s.PrometheusRuleLabelNamespace: "user-namespace", + }, + }, + Data: map[string]string{ + oldKey: string(overrideJSON), + }, + }, + }, + } + mockK8s.ConfigMapsFunc = func() k8s.ConfigMapInterface { return mockCM } + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{originalUserRule}, + }, + }, + }, + }, true, nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + return nil + }, + } + } + + newRuleId, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + 
Expect(newRuleId).To(Equal(expectedNewRuleId)) + + newKey := base64.RawURLEncoding.EncodeToString([]byte(expectedNewRuleId)) + cm := mockCM.ConfigMaps["plugin-ns/"+cmName] + Expect(cm).NotTo(BeNil()) + Expect(cm.Data).NotTo(HaveKey(oldKey)) + Expect(cm.Data).To(HaveKey(newKey)) + }) + + It("updates only the matching rule when multiple rules exist", func() { + anotherRule := monitoringv1.Rule{ + Alert: "AnotherAlert", + Expr: intstr.FromString("down == 1"), + } + + var updatedPR *monitoringv1.PrometheusRule + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{originalUserRule, anotherRule}, + }, + }, + }, + }, true, nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + updatedPR = &pr + return nil + }, + } + } + + updatedRule := originalUserRule + // Create a deep copy of the Labels map to avoid modifying the original + updatedRule.Labels = make(map[string]string) + for k, v := range originalUserRule.Labels { + updatedRule.Labels[k] = v + } + updatedRule.Labels["severity"] = "info" + + expectedNewRuleId := alertrule.GetAlertingRuleId(&updatedRule) + newRuleId, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + Expect(newRuleId).To(Equal(expectedNewRuleId)) + Expect(updatedPR).NotTo(BeNil()) + Expect(updatedPR.Spec.Groups[0].Rules).To(HaveLen(2)) + Expect(updatedPR.Spec.Groups[0].Rules[0].Labels["severity"]).To(Equal("info")) + Expect(updatedPR.Spec.Groups[0].Rules[1].Alert).To(Equal("AnotherAlert")) + }) + + It("updates rule in the correct group when 
multiple groups exist", func() { + var updatedPR *monitoringv1.PrometheusRule + + mockK8s.PrometheusRulesFunc = func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "group1", + Rules: []monitoringv1.Rule{}, + }, + { + Name: "group2", + Rules: []monitoringv1.Rule{originalUserRule}, + }, + }, + }, + }, true, nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + updatedPR = &pr + return nil + }, + } + } + + updatedRule := originalUserRule + // Create a deep copy of the Labels map to avoid modifying the original + updatedRule.Labels = make(map[string]string) + for k, v := range originalUserRule.Labels { + updatedRule.Labels[k] = v + } + updatedRule.Labels["new_label"] = "new_value" + + expectedNewRuleId := alertrule.GetAlertingRuleId(&updatedRule) + newRuleId, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updatedRule) + Expect(err).NotTo(HaveOccurred()) + Expect(newRuleId).To(Equal(expectedNewRuleId)) + Expect(updatedPR).NotTo(BeNil()) + Expect(updatedPR.Spec.Groups).To(HaveLen(2)) + Expect(updatedPR.Spec.Groups[0].Rules).To(HaveLen(0)) + Expect(updatedPR.Spec.Groups[1].Rules).To(HaveLen(1)) + Expect(updatedPR.Spec.Groups[1].Rules[0].Labels["new_label"]).To(Equal("new_value")) + }) + }) + + Context("severity validation", func() { + BeforeEach(func() { + mockK8s.RelabeledRulesFunc = func() k8s.RelabeledRulesInterface { + return &testutils.MockRelabeledRulesInterface{ + GetFunc: func(ctx context.Context, id string) (monitoringv1.Rule, bool) { + if id == userRuleId { + return userRule, true + } + return monitoringv1.Rule{}, false + }, + } + } + mockK8s.PrometheusRulesFunc = 
func() k8s.PrometheusRuleInterface { + return &testutils.MockPrometheusRuleInterface{ + GetFunc: func(ctx context.Context, namespace string, name string) (*monitoringv1.PrometheusRule, bool, error) { + return &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: name, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{originalUserRule}, + }, + }, + }, + }, true, nil + }, + UpdateFunc: func(ctx context.Context, pr monitoringv1.PrometheusRule) error { + return nil + }, + } + } + }) + + It("rejects invalid severity", func() { + updatedRule := originalUserRule + updatedRule.Labels = map[string]string{ + "severity": "urgent", + } + _, err := client.UpdateUserDefinedAlertRule(ctx, userRuleId, updatedRule) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("invalid severity")) + }) + }) +}) diff --git a/pkg/managementlabels/management_labels.go b/pkg/managementlabels/management_labels.go new file mode 100644 index 000000000..cd704ab22 --- /dev/null +++ b/pkg/managementlabels/management_labels.go @@ -0,0 +1,30 @@ +package managementlabels + +const ( + // Label keys + RuleManagedByLabel = "openshift_io_rule_managed_by" + RelabelConfigManagedByLabel = "openshift_io_relabel_config_managed_by" + AlertNameLabel = "alertname" + AlertingRuleLabelName = "openshift_io_alerting_rule_name" + + // label values + ManagedByOperator = "operator" + ManagedByGitOps = "gitops" +) + +// ARC-related label and annotation keys +const ( + ARCLabelPrometheusRuleNameKey = "monitoring.openshift.io/prometheusrule-name" + ARCLabelAlertNameKey = "monitoring.openshift.io/alertname" + ARCAnnotationAlertRuleIDKey = "monitoring.openshift.io/alertRuleId" +) + +// Alert classification overrides ConfigMap metadata +const ( + AlertClassificationOverridesConfigMapName = "alert-classification-overrides" + + AlertClassificationOverridesTypeLabelKey = 
"monitoring.openshift.io/type" + AlertClassificationOverridesTypeLabelValue = "alert-classification-overrides" + AlertClassificationOverridesManagedByLabelKey = "app.kubernetes.io/managed-by" + AlertClassificationOverridesManagedByLabelValue = "openshift-console" +) diff --git a/pkg/server.go b/pkg/server.go index 653fca843..129d800e3 100644 --- a/pkg/server.go +++ b/pkg/server.go @@ -12,7 +12,6 @@ import ( "github.com/gorilla/handlers" "github.com/gorilla/mux" - "github.com/openshift/monitoring-plugin/pkg/proxy" "github.com/sirupsen/logrus" "gopkg.in/yaml.v2" v1 "k8s.io/api/core/v1" @@ -21,6 +20,12 @@ import ( "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + "github.com/openshift/monitoring-plugin/pkg/management" + "github.com/openshift/monitoring-plugin/pkg/proxy" + + "github.com/openshift/monitoring-plugin/pkg/k8s" ) var log = logrus.WithField("module", "server") @@ -56,10 +61,11 @@ type PluginConfig struct { type Feature string const ( - AcmAlerting Feature = "acm-alerting" - Incidents Feature = "incidents" - DevConfig Feature = "dev-config" - PersesDashboards Feature = "perses-dashboards" + AcmAlerting Feature = "acm-alerting" + Incidents Feature = "incidents" + DevConfig Feature = "dev-config" + PersesDashboards Feature = "perses-dashboards" + AlertManagementAPI Feature = "alert-management-api" ) func (pluginConfig *PluginConfig) MarshalJSON() ([]byte, error) { @@ -103,6 +109,8 @@ func (s *PluginServer) Shutdown(ctx context.Context) error { func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { acmMode := cfg.Features[AcmAlerting] + alertManagementAPIMode := cfg.Features[AlertManagementAPI] + acmLocationsLength := len(cfg.AlertmanagerUrl) + len(cfg.ThanosQuerierUrl) if acmLocationsLength > 0 && !acmMode { @@ -116,15 +124,19 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { return nil, 
fmt.Errorf("cannot set default port to reserved port %d", cfg.Port) } + var k8sconfig *rest.Config + var err error + // Uncomment the following line for local development: - // k8sconfig, err := clientcmd.BuildConfigFromFlags("", "$HOME/.kube/config") + // k8sconfig, err = clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG")) + // if err != nil { + // return nil, fmt.Errorf("cannot get kubeconfig from file: %w", err) + // } // Comment the following line for local development: var k8sclient *dynamic.DynamicClient - if acmMode { - - k8sconfig, err := rest.InClusterConfig() - + if acmMode || alertManagementAPIMode { + k8sconfig, err = rest.InClusterConfig() if err != nil { return nil, fmt.Errorf("cannot get in cluster config: %w", err) } @@ -137,7 +149,23 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { k8sclient = nil } - router, pluginConfig := setupRoutes(cfg) + // Initialize management client if management API feature is enabled + var managementClient management.Client + if alertManagementAPIMode { + k8sClient, err := k8s.NewClient(ctx, k8sconfig) + if err != nil { + return nil, fmt.Errorf("failed to create k8s client for alert management API: %w", err) + } + + if err := k8sClient.TestConnection(ctx); err != nil { + return nil, fmt.Errorf("failed to connect to kubernetes cluster for alert management API: %w", err) + } + + managementClient = management.New(ctx, k8sClient) + log.Info("alert management API enabled") + } + + router, pluginConfig := setupRoutes(cfg, managementClient) router.Use(corsHeaderMiddleware()) tlsConfig := &tls.Config{} @@ -222,7 +250,7 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { return httpServer, nil } -func setupRoutes(cfg *Config) (*mux.Router, *PluginConfig) { +func setupRoutes(cfg *Config, managementClient management.Client) (*mux.Router, *PluginConfig) { configHandlerFunc, pluginConfig := configHandler(cfg) router := mux.NewRouter() @@ -233,6 +261,12 @@ func 
setupRoutes(cfg *Config) (*mux.Router, *PluginConfig) { router.PathPrefix("/features").HandlerFunc(featuresHandler(cfg)) router.PathPrefix("/config").HandlerFunc(configHandlerFunc) + + if managementClient != nil { + managementRouter := managementrouter.New(managementClient) + router.PathPrefix("/api/v1/alerting").Handler(managementRouter) + } + router.PathPrefix("/").Handler(filesHandler(http.Dir(cfg.StaticPath))) return router, pluginConfig diff --git a/test/e2e/alert_management_api_test.go b/test/e2e/alert_management_api_test.go new file mode 100644 index 000000000..cbfe56402 --- /dev/null +++ b/test/e2e/alert_management_api_test.go @@ -0,0 +1,334 @@ +package e2e + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "testing" + "time" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/openshift/monitoring-plugin/internal/managementrouter" + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/test/e2e/framework" +) + +func listRulesForAlertMgmt(ctx context.Context, pluginURL string) ([]monitoringv1.Rule, error) { + client := &http.Client{Timeout: 10 * time.Second} + req, err := http.NewRequestWithContext(ctx, http.MethodGet, pluginURL+"/api/v1/alerting/rules", nil) + if err != nil { + return nil, err + } + + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode) + } + + var listResp struct { + Data struct { + Rules []monitoringv1.Rule `json:"rules"` + } `json:"data"` + Status string `json:"status"` + } + if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil { + return nil, err + } + + return listResp.Data.Rules, nil +} + +func 
TestBulkDeleteUserDefinedAlertRules(t *testing.T) { + f, err := framework.New() + if err != nil { + t.Fatalf("Failed to create framework: %v", err) + } + + ctx := context.Background() + + testNamespace, cleanup, err := f.CreateNamespace(ctx, "test-bulk-delete", false) + if err != nil { + t.Fatalf("Failed to create test namespace: %v", err) + } + defer cleanup() + + forDuration := monitoringv1.Duration("5m") + + testRule1 := monitoringv1.Rule{ + Alert: "TestBulkDeleteAlert1", + Expr: intstr.FromString("up == 0"), + For: &forDuration, + Labels: map[string]string{ + "severity": "warning", + }, + Annotations: map[string]string{ + "description": "Test alert 1 for bulk delete testing", + }, + } + + testRule2 := monitoringv1.Rule{ + Alert: "TestBulkDeleteAlert2", + Expr: intstr.FromString("up == 1"), + For: &forDuration, + Labels: map[string]string{ + "severity": "info", + }, + Annotations: map[string]string{ + "description": "Test alert 2 for bulk delete testing", + }, + } + + testRule3 := monitoringv1.Rule{ + Alert: "TestBulkDeleteAlert3", + Expr: intstr.FromString("up == 2"), + For: &forDuration, + Labels: map[string]string{ + "severity": "critical", + }, + Annotations: map[string]string{ + "description": "Test alert 3 for bulk delete testing", + }, + } + + _, err = createPrometheusRule(ctx, f, testNamespace, testRule1, testRule2, testRule3) + if err != nil { + t.Fatalf("Failed to create PrometheusRule: %v", err) + } + + var ruleIdsToDelete []string + err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { + rules, err := listRulesForAlertMgmt(ctx, f.PluginURL) + if err != nil { + t.Logf("Failed to list rules: %v", err) + return false, nil + } + + foundRuleIds := []string{} + for _, rule := range rules { + if rule.Alert == "TestBulkDeleteAlert1" || rule.Alert == "TestBulkDeleteAlert2" { + ruleId := rule.Labels[k8s.AlertRuleLabelId] + if ruleId != "" { + foundRuleIds = append(foundRuleIds, ruleId) + } + } + } 
+ + if len(foundRuleIds) == 2 { + ruleIdsToDelete = foundRuleIds + t.Logf("Found rule IDs to delete: %v", ruleIdsToDelete) + return true, nil + } + + t.Logf("Found %d/2 test alerts in memory", len(foundRuleIds)) + return false, nil + }) + + if err != nil { + t.Fatalf("Timeout waiting for alerts to appear in memory: %v", err) + } + + reqBody := managementrouter.BulkDeleteUserDefinedAlertRulesRequest{ + RuleIds: ruleIdsToDelete, + } + + reqJSON, err := json.Marshal(reqBody) + if err != nil { + t.Fatalf("Failed to marshal request body: %v", err) + } + + bulkDeleteURL := fmt.Sprintf("%s/api/v1/alerting/rules", f.PluginURL) + req, err := http.NewRequestWithContext(ctx, http.MethodDelete, bulkDeleteURL, bytes.NewBuffer(reqJSON)) + if err != nil { + t.Fatalf("Failed to create HTTP request: %v", err) + } + req.Header.Set("Content-Type", "application/json") + + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Do(req) + if err != nil { + t.Fatalf("Failed to make bulk delete request: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + t.Fatalf("Expected status code %d, got %d. 
Response body: %s", http.StatusOK, resp.StatusCode, string(body)) + } + + var bulkDeleteResp managementrouter.BulkDeleteUserDefinedAlertRulesResponse + if err := json.NewDecoder(resp.Body).Decode(&bulkDeleteResp); err != nil { + t.Fatalf("Failed to decode response: %v", err) + } + + if len(bulkDeleteResp.Rules) != 2 { + t.Fatalf("Expected 2 rules in response, got %d", len(bulkDeleteResp.Rules)) + } + + for _, result := range bulkDeleteResp.Rules { + if result.StatusCode != http.StatusNoContent { + t.Errorf("Rule %s deletion failed with status %d: %s", result.Id, result.StatusCode, result.Message) + } else { + t.Logf("Rule %s deleted successfully", result.Id) + } + } + + promRule, err := f.Monitoringv1clientset.MonitoringV1().PrometheusRules(testNamespace).Get( + ctx, + "test-prometheus-rule", + metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("Failed to get PrometheusRule after deletion: %v", err) + } + + if len(promRule.Spec.Groups) != 1 { + t.Fatalf("Expected 1 rule group, got %d", len(promRule.Spec.Groups)) + } + + ruleGroup := promRule.Spec.Groups[0] + if len(ruleGroup.Rules) != 1 { + t.Fatalf("Expected 1 rule remaining, got %d: %+v", len(ruleGroup.Rules), ruleGroup.Rules) + } + + remainingRule := ruleGroup.Rules[0] + if remainingRule.Alert != "TestBulkDeleteAlert3" { + t.Errorf("Expected remaining rule to be TestBulkDeleteAlert3, got %s", remainingRule.Alert) + } + + if remainingRule.Labels["severity"] != "critical" { + t.Errorf("Expected severity=critical, got %s", remainingRule.Labels["severity"]) + } + + t.Log("Bulk delete test completed successfully - only TestBulkDeleteAlert3 remains") +} + +func TestDeleteUserDefinedAlertRuleById(t *testing.T) { + f, err := framework.New() + if err != nil { + t.Fatalf("Failed to create framework: %v", err) + } + + ctx := context.Background() + + testNamespace, cleanup, err := f.CreateNamespace(ctx, "test-delete-by-id", false) + if err != nil { + t.Fatalf("Failed to create test namespace: %v", err) + } + defer 
cleanup() + + forDuration := monitoringv1.Duration("5m") + + testRule1 := monitoringv1.Rule{ + Alert: "TestDeleteByIdAlert1", + Expr: intstr.FromString("up == 0"), + For: &forDuration, + Labels: map[string]string{ + "severity": "warning", + }, + Annotations: map[string]string{ + "description": "Test alert 1 for delete by id testing", + }, + } + + testRule2 := monitoringv1.Rule{ + Alert: "TestDeleteByIdAlert2", + Expr: intstr.FromString("up == 1"), + For: &forDuration, + Labels: map[string]string{ + "severity": "info", + }, + Annotations: map[string]string{ + "description": "Test alert 2 for delete by id testing", + }, + } + + _, err = createPrometheusRule(ctx, f, testNamespace, testRule1, testRule2) + if err != nil { + t.Fatalf("Failed to create PrometheusRule: %v", err) + } + + var ruleIdToDelete string + err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { + rules, err := listRulesForAlertMgmt(ctx, f.PluginURL) + if err != nil { + t.Logf("Failed to list rules: %v", err) + return false, nil + } + + for _, rule := range rules { + if rule.Alert == "TestDeleteByIdAlert1" { + ruleIdToDelete = rule.Labels[k8s.AlertRuleLabelId] + t.Logf("Found rule ID to delete: %s", ruleIdToDelete) + return true, nil + } + } + + t.Logf("Test alert not found yet in memory") + return false, nil + }) + + if err != nil { + t.Fatalf("Timeout waiting for alerts to appear in memory: %v", err) + } + + deleteURL := fmt.Sprintf("%s/api/v1/alerting/rules/%s", f.PluginURL, ruleIdToDelete) + req, err := http.NewRequestWithContext(ctx, http.MethodDelete, deleteURL, nil) + if err != nil { + t.Fatalf("Failed to create HTTP request: %v", err) + } + + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Do(req) + if err != nil { + t.Fatalf("Failed to make delete request: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusNoContent { + body, _ := io.ReadAll(resp.Body) + t.Fatalf("Expected status 
code %d, got %d. Response body: %s", http.StatusNoContent, resp.StatusCode, string(body)) + } + + t.Logf("Rule %s deleted successfully", ruleIdToDelete) + + promRule, err := f.Monitoringv1clientset.MonitoringV1().PrometheusRules(testNamespace).Get( + ctx, + "test-prometheus-rule", + metav1.GetOptions{}, + ) + if err != nil { + t.Fatalf("Failed to get PrometheusRule after deletion: %v", err) + } + + if len(promRule.Spec.Groups) != 1 { + t.Fatalf("Expected 1 rule group, got %d", len(promRule.Spec.Groups)) + } + + ruleGroup := promRule.Spec.Groups[0] + if len(ruleGroup.Rules) != 1 { + t.Fatalf("Expected 1 rule remaining, got %d: %+v", len(ruleGroup.Rules), ruleGroup.Rules) + } + + remainingRule := ruleGroup.Rules[0] + if remainingRule.Alert != "TestDeleteByIdAlert2" { + t.Errorf("Expected remaining rule to be TestDeleteByIdAlert2, got %s", remainingRule.Alert) + } + + if remainingRule.Labels["severity"] != "info" { + t.Errorf("Expected severity=info, got %s", remainingRule.Labels["severity"]) + } + + t.Log("Delete by ID test completed successfully - only TestDeleteByIdAlert2 remains") +} diff --git a/test/e2e/framework/framework.go b/test/e2e/framework/framework.go new file mode 100644 index 000000000..1adb98742 --- /dev/null +++ b/test/e2e/framework/framework.go @@ -0,0 +1,95 @@ +package framework + +import ( + "context" + "fmt" + "os" + "strconv" + "time" + + osmv1client "github.com/openshift/client-go/monitoring/clientset/versioned" + "github.com/openshift/monitoring-plugin/pkg/k8s" + monitoringv1client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) + +var f *Framework + +type Framework struct { + Clientset *kubernetes.Clientset + Monitoringv1clientset *monitoringv1client.Clientset + Osmv1clientset *osmv1client.Clientset + + PluginURL string +} + +type CleanupFunc func() error + +func New() 
(*Framework, error) { + if f != nil { + return f, nil + } + + kubeConfigPath := os.Getenv("KUBECONFIG") + if kubeConfigPath == "" { + return nil, fmt.Errorf("KUBECONFIG environment variable not set") + } + + pluginURL := os.Getenv("PLUGIN_URL") + if pluginURL == "" { + return nil, fmt.Errorf("PLUGIN_URL environment variable not set, skipping management API e2e test") + } + + config, err := clientcmd.BuildConfigFromFlags("", kubeConfigPath) + if err != nil { + return nil, fmt.Errorf("failed to build config: %w", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to create clientset: %w", err) + } + + monitoringv1clientset, err := monitoringv1client.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to create monitoringv1 clientset: %w", err) + } + + osmv1clientset, err := osmv1client.NewForConfig(config) + if err != nil { + return nil, fmt.Errorf("failed to create osmv1 clientset: %w", err) + } + + f = &Framework{ + Clientset: clientset, + Monitoringv1clientset: monitoringv1clientset, + Osmv1clientset: osmv1clientset, + PluginURL: pluginURL, + } + + return f, nil +} + +func (f *Framework) CreateNamespace(ctx context.Context, name string, isClusterMonitoringNamespace bool) (string, CleanupFunc, error) { + testNamespace := fmt.Sprintf("%s-%d", name, time.Now().Unix()) + namespace := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNamespace, + Labels: map[string]string{ + k8s.ClusterMonitoringLabel: strconv.FormatBool(isClusterMonitoringNamespace), + }, + }, + } + + _, err := f.Clientset.CoreV1().Namespaces().Create(ctx, namespace, metav1.CreateOptions{}) + if err != nil { + return "", nil, fmt.Errorf("failed to create test namespace: %w", err) + } + + return testNamespace, func() error { + return f.Clientset.CoreV1().Namespaces().Delete(ctx, testNamespace, metav1.DeleteOptions{}) + }, nil +} diff --git a/test/e2e/relabeled_rules_test.go 
b/test/e2e/relabeled_rules_test.go new file mode 100644 index 000000000..3d114e179 --- /dev/null +++ b/test/e2e/relabeled_rules_test.go @@ -0,0 +1,291 @@ +package e2e + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "testing" + "time" + + osmv1 "github.com/openshift/api/monitoring/v1" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/openshift/monitoring-plugin/pkg/k8s" + "github.com/openshift/monitoring-plugin/test/e2e/framework" +) + +type listRulesResponse struct { + Data listRulesResponseData `json:"data"` + Status string `json:"status"` +} + +type listRulesResponseData struct { + Rules []monitoringv1.Rule `json:"rules"` +} + +func listRules(ctx context.Context, pluginURL string) ([]monitoringv1.Rule, error) { + client := &http.Client{Timeout: 10 * time.Second} + req, err := http.NewRequestWithContext(ctx, http.MethodGet, pluginURL+"/api/v1/alerting/rules", nil) + if err != nil { + return nil, err + } + + resp, err := client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode) + } + + var listResp listRulesResponse + if err := json.NewDecoder(resp.Body).Decode(&listResp); err != nil { + return nil, err + } + + return listResp.Data.Rules, nil +} + +func TestPrometheusRuleAppearsInMemory(t *testing.T) { + f, err := framework.New() + if err != nil { + t.Fatalf("Failed to create framework: %v", err) + } + + ctx := context.Background() + + testNamespace, cleanup, err := f.CreateNamespace(ctx, "test-prometheus-rule", false) + if err != nil { + t.Fatalf("Failed to create test namespace: %v", err) + } + defer cleanup() + + testAlertName := "TestAlert" + forDuration := monitoringv1.Duration("5m") + testRule := monitoringv1.Rule{ + Alert: 
testAlertName, + Expr: intstr.FromString("up == 0"), + For: &forDuration, + Labels: map[string]string{ + "severity": "warning", + }, + Annotations: map[string]string{ + "description": "Test alert for e2e testing", + "summary": "Test alert", + }, + } + + _, err = createPrometheusRule(ctx, f, testNamespace, testRule) + if err != nil { + t.Fatalf("Failed to create PrometheusRule: %v", err) + } + + err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { + rules, err := listRules(ctx, f.PluginURL) + if err != nil { + t.Logf("Failed to list rules: %v", err) + return false, nil + } + + for _, rule := range rules { + if rule.Alert == testAlertName { + expectedLabels := map[string]string{ + k8s.PrometheusRuleLabelNamespace: testNamespace, + k8s.PrometheusRuleLabelName: "test-prometheus-rule", + } + + if err := compareRuleLabels(t, testAlertName, rule.Labels, expectedLabels); err != nil { + return false, err + } + + if _, ok := rule.Labels[k8s.AlertRuleLabelId]; !ok { + t.Errorf("Alert %s missing openshift_io_alert_rule_id label", testAlertName) + return false, fmt.Errorf("alert missing openshift_io_alert_rule_id label") + } + + t.Logf("Found alert %s in memory with all expected labels", testAlertName) + return true, nil + } + } + + t.Logf("Alert %s not found in memory yet (found %d rules)", testAlertName, len(rules)) + return false, nil + }) + + if err != nil { + t.Fatalf("Timeout waiting for alert to appear in memory: %v", err) + } +} + +func TestRelabelAlert(t *testing.T) { + f, err := framework.New() + if err != nil { + t.Fatalf("Failed to create framework: %v", err) + } + + ctx := context.Background() + + testNamespace, cleanup, err := f.CreateNamespace(ctx, "test-relabel-alert", true) + if err != nil { + t.Fatalf("Failed to create test namespace: %v", err) + } + defer cleanup() + + forDuration := monitoringv1.Duration("5m") + + criticalRule := monitoringv1.Rule{ + Alert: "TestRelabelAlert", + Expr: 
intstr.FromString("up == 0"), + For: &forDuration, + Labels: map[string]string{ + "severity": "critical", + "team": "web", + }, + Annotations: map[string]string{ + "description": "Critical alert for relabel testing", + "summary": "Critical test alert", + }, + } + + warningRule := monitoringv1.Rule{ + Alert: "TestRelabelAlert", + Expr: intstr.FromString("up == 1"), + For: &forDuration, + Labels: map[string]string{ + "severity": "warning", + "team": "web", + }, + Annotations: map[string]string{ + "description": "Warning alert for relabel testing", + "summary": "Warning test alert", + }, + } + + _, err = createPrometheusRule(ctx, f, testNamespace, criticalRule, warningRule) + if err != nil { + t.Fatalf("Failed to create PrometheusRule: %v", err) + } + + relabelConfigName := "change-critical-team" + arc := &osmv1.AlertRelabelConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: relabelConfigName, + Namespace: k8s.ClusterMonitoringNamespace, + }, + Spec: osmv1.AlertRelabelConfigSpec{ + Configs: []osmv1.RelabelConfig{ + { + SourceLabels: []osmv1.LabelName{"alertname", "severity"}, + Regex: "TestRelabelAlert;critical", + Separator: ";", + TargetLabel: "team", + Replacement: "ops", + Action: "Replace", + }, + }, + }, + } + + _, err = f.Osmv1clientset.MonitoringV1().AlertRelabelConfigs(k8s.ClusterMonitoringNamespace).Create( + ctx, + arc, + metav1.CreateOptions{}, + ) + if err != nil { + t.Fatalf("Failed to create AlertRelabelConfig: %v", err) + } + defer func() { + err = f.Osmv1clientset.MonitoringV1().AlertRelabelConfigs(k8s.ClusterMonitoringNamespace).Delete(ctx, relabelConfigName, metav1.DeleteOptions{}) + if err != nil { + t.Fatalf("Failed to delete AlertRelabelConfig: %v", err) + } + }() + + err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { + rules, err := listRules(ctx, f.PluginURL) + if err != nil { + t.Logf("Failed to list rules: %v", err) + return false, nil + } + + foundCriticalWithOps := false + 
foundWarningWithWeb := false + + for _, rule := range rules { + if rule.Alert == "TestRelabelAlert" { + if rule.Labels["team"] == "ops" && rule.Labels["severity"] == "critical" { + t.Logf("Found critical alert with team=ops (relabeling successful)") + foundCriticalWithOps = true + } + + if rule.Labels["team"] == "web" && rule.Labels["severity"] == "warning" { + t.Logf("Found warning alert with team=web") + foundWarningWithWeb = true + } + } + } + + if foundCriticalWithOps { + t.Logf("Relabeling verified: critical alert has team=ops, warning alert has team=web") + return true, nil + } + + t.Logf("Waiting for relabeling to take effect (critical with ops=%v, warning with web=%v)", foundCriticalWithOps, foundWarningWithWeb) + return false, nil + }) + + if err != nil { + t.Fatalf("Timeout waiting for relabeling to take effect: %v", err) + } +} + +func createPrometheusRule(ctx context.Context, f *framework.Framework, namespace string, rules ...monitoringv1.Rule) (*monitoringv1.PrometheusRule, error) { + interval := monitoringv1.Duration("30s") + prometheusRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-prometheus-rule", + Namespace: namespace, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Interval: &interval, + Rules: rules, + }, + }, + }, + } + + return f.Monitoringv1clientset.MonitoringV1().PrometheusRules(namespace).Create( + ctx, + prometheusRule, + metav1.CreateOptions{}, + ) +} + +func compareRuleLabels(t *testing.T, alertName string, foundLabels map[string]string, wantedLabels map[string]string) error { + if foundLabels == nil { + t.Errorf("Alert %s has no labels", alertName) + return fmt.Errorf("alert has no labels") + } + + for key, wantValue := range wantedLabels { + if gotValue, ok := foundLabels[key]; !ok { + t.Errorf("Alert %s missing %s label", alertName, key) + return fmt.Errorf("alert missing %s label", key) + } else if gotValue != wantValue { + 
t.Errorf("Alert %s has wrong %s label. Expected %s, got %s", + alertName, key, wantValue, gotValue) + return fmt.Errorf("alert has wrong %s label", key) + } + } + + return nil +}