Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,16 +153,21 @@ Examples:
| total=2 firing=1 pending=0 inactive=1

Flags:
--exclude-alert stringArray Alerts to ignore. Can be used multiple times and supports regex.
-h, --help help for alert
-n, --name strings The name of one or more specific alerts to check.
This parameter can be repeated e.g.: '--name alert1 --name alert2'
If no name is given, all alerts will be evaluated
-g, --group strings The name of one or more specific groups to check.
This parameter can be repeated e.g.: '--group group1 --group group2'
If no group is given, all groups will be scanned for alerts
-T, --no-alerts-state string State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set this defaults to OK (default "OK")
-P, --problems Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed
--exclude-alert stringArray Alerts to ignore. Can be used multiple times and supports regex.
--exclude-label stringArray The label of one or more specific alerts to exclude.
This parameter can be repeated e.g.: '--exclude-label prio=high --exclude-label another=example'
-g, --group strings The name of one or more specific groups to check for alerts.
This parameter can be repeated e.g.: '--group group1 --group group2'
If no group is given, all groups will be scanned for alerts
-h, --help help for alert
--include-label stringArray The label of one or more specific alerts to include.
This parameter can be repeated e.g.: '--include-label prio=high --include-label another=example'
Note that repeated --include-label are combined using a union.
-n, --name strings The name of one or more specific alerts to check.
This parameter can be repeated e.g.: '--name alert1 --name alert2'
If no name is given, all alerts will be evaluated
-T, --no-alerts-state string State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set this defaults to OK (default "OK")
-P, --problems Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed
```

#### Checking all defined alerts
Expand Down
71 changes: 53 additions & 18 deletions cmd/alert.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,29 @@ import (
"errors"
"fmt"
"regexp"
"slices"
"strings"

"github.com/NETWAYS/check_prometheus/internal/alert"
"github.com/NETWAYS/go-check"
"github.com/NETWAYS/go-check/perfdata"
"github.com/NETWAYS/go-check/result"
"github.com/prometheus/common/model"
"github.com/spf13/cobra"
)

// AlertConfig collects the command line options of the "alert" subcommand.
type AlertConfig struct {
// AlertName is bound to --name/-n: names of specific alerts to check.
AlertName []string
// Group is bound to --group/-g: names of specific groups to check for alerts.
Group []string
// ExcludeAlerts is bound to --exclude-alert: regular expressions matched
// against the alert name; matching alerts are skipped.
ExcludeAlerts []string
// ExcludeLabels is bound to --exclude-label: "key=value" labels; alerts
// carrying one of them are skipped.
ExcludeLabels []string
// IncludeLabels is bound to --include-label: "key=value" labels; when set,
// only alerts carrying one of them are evaluated.
IncludeLabels []string
// ProblemsOnly is bound to --problems/-P: skip alerting rules that have no
// active alerts.
ProblemsOnly bool
// NoAlertsState is bound to --no-alerts-state/-T: state to assign when no
// alerts are found (default "OK").
NoAlertsState string
}

// cliAlertConfig holds the flag values of the alert subcommand; its fields are
// registered as flag targets in init().
var cliAlertConfig AlertConfig

// contains reports whether s is an element of list.
//
// It is a thin wrapper around slices.Contains, kept so existing callers keep
// their (needle, haystack) argument order. A nil or empty list never
// contains anything.
func contains(s string, list []string) bool {
	return slices.Contains(list, s)
}

var alertCmd = &cobra.Command{
Use: "alert",
Short: "Checks the status of a Prometheus alert",
Expand Down Expand Up @@ -112,30 +105,43 @@ inactive = 0`,
var overall result.Overall

for _, rl := range rules {

// If it's not the Alert we're looking for, Skip!
if cliAlertConfig.AlertName != nil {
if !contains(rl.AlertingRule.Name, cliAlertConfig.AlertName) {
if !slices.Contains(cliAlertConfig.AlertName, rl.AlertingRule.Name) {
continue
}
}

labelsMatchedInclude := matchesLabel(rl.AlertingRule.Labels, cliAlertConfig.IncludeLabels)

if len(cliAlertConfig.IncludeLabels) > 0 && !labelsMatchedInclude {
// If the alert labels don't match here we can skip it.
continue
}

// Skip inactive alerts if flag is set
if len(rl.AlertingRule.Alerts) == 0 && cliAlertConfig.ProblemsOnly {
continue
}

alertMatched, regexErr := matches(rl.AlertingRule.Name, cliAlertConfig.ExcludeAlerts)
alertMatchedExclude, regexErr := matches(rl.AlertingRule.Name, cliAlertConfig.ExcludeAlerts)

if regexErr != nil {
check.ExitRaw(check.Unknown, "Invalid regular expression provided:", regexErr.Error())
}

if alertMatched {
if alertMatchedExclude {
// If the alert matches a regex from the list we can skip it.
continue
}

labelsMatchedExclude := matchesLabel(rl.AlertingRule.Labels, cliAlertConfig.ExcludeLabels)

if len(cliAlertConfig.ExcludeLabels) > 0 && labelsMatchedExclude {
// If the alert labels match here we can skip it.
continue
}

// Handle Inactive Alerts
if len(rl.AlertingRule.Alerts) == 0 {
// Counting states for perfdata
Expand Down Expand Up @@ -218,18 +224,28 @@ func init() {

fs.StringVarP(&cliAlertConfig.NoAlertsState, "no-alerts-state", "T", "OK", "State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set this defaults to OK")

fs.StringArrayVar(&cliAlertConfig.ExcludeAlerts, "exclude-alert", []string{}, "Alerts to ignore. Can be used multiple times and supports regex.")
fs.StringArrayVar(&cliAlertConfig.ExcludeAlerts, "exclude-alert", []string{},
"Alerts to ignore. Can be used multiple times and supports regex.")

fs.StringSliceVarP(&cliAlertConfig.AlertName, "name", "n", nil,
"The name of one or more specific alerts to check."+
"\nThis parameter can be repeated e.G.: '--name alert1 --name alert2'"+
"\nThis parameter can be repeated e.g.: '--name alert1 --name alert2'"+
"\nIf no name is given, all alerts will be evaluated")

fs.StringSliceVarP(&cliAlertConfig.Group, "group", "g", nil,
"The name of one or more specific groups to check for alerts."+
"\nThis parameter can be repeated e.G.: '--group group1 --group group2'"+
"\nThis parameter can be repeated e.g.: '--group group1 --group group2'"+
"\nIf no group is given, all groups will be scanned for alerts")

fs.StringArrayVar(&cliAlertConfig.IncludeLabels, "include-label", []string{},
"The label of one or more specific alerts to include. "+
"\nThis parameter can be repeated e.g.: '--include-label prio=high --include-label another=example'"+
"\nNote that repeated --include-label are combined using a union.")

fs.StringArrayVar(&cliAlertConfig.ExcludeLabels, "exclude-label", []string{},
"The label of one or more specific alerts to exclude."+
"\nThis parameter can be repeated e.g.: '--exclude-label prio=high --exclude-label another=example'")

fs.BoolVarP(&cliAlertConfig.ProblemsOnly, "problems", "P", false,
"Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed")
}
Expand Down Expand Up @@ -267,3 +283,22 @@ func matches(input string, regexToExclude []string) (bool, error) {

return false, nil
}

// matchesLabel reports whether labels carries at least one of the labels
// listed in labelsToMatch.
//
// Every entry of labelsToMatch is expected in "key=value" form. Only the first
// "=" separates key from value, so the value itself may contain "=". Entries
// without any "=" are ignored. An entry matches when the key is present in
// labels and its value is exactly equal. The function returns on the first
// match, and returns false when nothing matches or labelsToMatch is empty.
func matchesLabel(labels model.LabelSet, labelsToMatch []string) bool {
	for _, selector := range labelsToMatch {
		name, want, found := strings.Cut(selector, "=")
		if !found {
			// Not in key=value form, so there is nothing to compare against.
			continue
		}

		got, present := labels[model.LabelName(name)]
		if present && got == model.LabelValue(want) {
			return true
		}
	}

	return false
}
70 changes: 70 additions & 0 deletions cmd/alert_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,76 @@ exit status 2
args: []string{"run", "../main.go", "alert", "--name", "InactiveAlert"},
expected: "[OK] - 1 Alerts: 0 Firing - 0 Pending - 1 Inactive\n\\_ [OK] [InactiveAlert] is inactive\n|total=1 firing=0 pending=0 inactive=1\n\n",
},
{
name: "alert-include-label",
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write(loadTestdata(alertTestDataSet1))
})),
args: []string{"run", "../main.go", "alert", "--include-label", "severity=critical"},
expected: `[CRITICAL] - 2 Alerts: 1 Firing - 0 Pending - 1 Inactive
\_ [OK] [HostOutOfMemory] is inactive
\_ [CRITICAL] [BlackboxTLS] - Job: [blackbox] on Instance: [https://localhost:443] is firing - value: -6065338.00 - {"alertname":"TLS","instance":"https://localhost:443","job":"blackbox","severity":"critical"}
|total=2 firing=1 pending=0 inactive=1

exit status 2
`,
},
{
name: "alert-exclude-label",
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write(loadTestdata(alertTestDataSet1))
})),
args: []string{"run", "../main.go", "alert", "--exclude-label", "severity=critical"},
expected: `[WARNING] - 1 Alerts: 0 Firing - 1 Pending - 0 Inactive
\_ [WARNING] [SqlAccessDeniedRate] - Job: [mysql] on Instance: [localhost] is pending - value: 0.40 - {"alertname":"SqlAccessDeniedRate","instance":"localhost","job":"mysql","severity":"warning"}
|total=1 firing=0 pending=1 inactive=0

exit status 1
`,
},
{
name: "alert-include-label-multiple",
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write(loadTestdata(alertTestDataSet1))
})),
args: []string{"run", "../main.go", "alert", "--include-label", "team=database", "--include-label", "severity=critical"},
expected: `[CRITICAL] - 3 Alerts: 1 Firing - 1 Pending - 1 Inactive
\_ [OK] [HostOutOfMemory] is inactive
\_ [WARNING] [SqlAccessDeniedRate] - Job: [mysql] on Instance: [localhost] is pending - value: 0.40 - {"alertname":"SqlAccessDeniedRate","instance":"localhost","job":"mysql","severity":"warning"}
\_ [CRITICAL] [BlackboxTLS] - Job: [blackbox] on Instance: [https://localhost:443] is firing - value: -6065338.00 - {"alertname":"TLS","instance":"https://localhost:443","job":"blackbox","severity":"critical"}
|total=3 firing=1 pending=1 inactive=1

exit status 2
`,
},
{
name: "alert-include-label-multiple-similar",
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write(loadTestdata(alertTestDataSet1))
})),
args: []string{"run", "../main.go", "alert", "--include-label", "severity=warning", "--include-label", "severity=critical"},
expected: `[CRITICAL] - 3 Alerts: 1 Firing - 1 Pending - 1 Inactive
\_ [OK] [HostOutOfMemory] is inactive
\_ [WARNING] [SqlAccessDeniedRate] - Job: [mysql] on Instance: [localhost] is pending - value: 0.40 - {"alertname":"SqlAccessDeniedRate","instance":"localhost","job":"mysql","severity":"warning"}
\_ [CRITICAL] [BlackboxTLS] - Job: [blackbox] on Instance: [https://localhost:443] is firing - value: -6065338.00 - {"alertname":"TLS","instance":"https://localhost:443","job":"blackbox","severity":"critical"}
|total=3 firing=1 pending=1 inactive=1

exit status 2
`,
},
{
name: "alert-exclude-label-multiple",
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write(loadTestdata(alertTestDataSet1))
})),
args: []string{"run", "../main.go", "alert", "--exclude-label", "team=database", "--exclude-label", "severity=critical"},
expected: "[OK] - 0 Alerts: 0 Firing - 0 Pending - 0 Inactive\n\\_ [OK] No alerts retrieved\n|total=0 firing=0 pending=0 inactive=0\n\n",
},
}

for _, test := range tests {
Expand Down
4 changes: 2 additions & 2 deletions testdata/alertmanager/alert.rules
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ groups:
expr: absent(up{job="alertmanager"})
for: 0m
labels:
severity: warning
severity: low
annotations:
summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
Expand All @@ -33,7 +33,7 @@ groups:
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
for: 0m
labels:
severity: warning
severity: extreme
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
9 changes: 6 additions & 3 deletions testdata/unittest/alertDataset1.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
"query": "up",
"duration": 120,
"labels": {
"severity": "critical"
"severity": "critical",
"team": "network"
},
"annotations": {
"description": "Foo",
Expand Down Expand Up @@ -40,7 +41,8 @@
"query": "mysql",
"duration": 17280000,
"labels": {
"severity": "warning"
"severity": "warning",
"team": "database"
},
"annotations": {
"description": "MySQL",
Expand Down Expand Up @@ -84,7 +86,8 @@
"query": "SSL",
"duration": 0,
"labels": {
"severity": "critical"
"severity": "critical",
"team": "network"
},
"annotations": {
"description": "TLS",
Expand Down