diff --git a/grafana/README.md b/grafana/README.md index 50ccb1c..3a7fb24 100644 --- a/grafana/README.md +++ b/grafana/README.md @@ -1,4 +1,72 @@ # Modules for Grafana alerts and dashboards -## How to use +## Alerting + +Please check the documentation about Grafana alerting [here](https://itdoc.schwarz/x/X11nf) and the [official documentation](https://grafana.com/docs/grafana/latest/alerting/) for a deeper look. + +The Terraform modules are separated per resource type; check the README in each module directory for specific examples. +Below is an example for alerts using the „**Prometheus/Thanos**“ datasource and sending notifications to „**Google Chat**“. + + + +```hcl title="main.tf" + # Datasource +module "datasource" { + source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/datasource?ref=main" + datasource_name = "Thanos - Myteam" + datasource_url = var.datasource_url + datasource_username = var.datasource_username + datasource_password = var.datasource_password +} + +# Alert Receiver / Contact Point +module "google-chat-contact-point" { + source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/contact-point-gchat?ref=main" + google-chat-url = var.google_chat_url + contact-point-name = "gchat" +} + + +# Alert Rule Folders +module "alert-folder" { + source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alert-folder?ref=main" + alert-folder = "Alerts" +} + + +# Template for messages +module "message-templates" { + source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/message-template?ref=main" + templates_dir = "templates" + disable_provenance = true +} + +# Notification policies +module "notification-policy" { + source = 
"git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/notification-policy?ref=main" + + default_contact_point_uid = module.google-chat-contact-point.contact_name + group_by = ["alertname"] + + folder_policies = { + "Alerts" = module.google-chat-contact-point.contact_name + } +} + +# Alert definition +module "alerting" { + source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alerts?ref=main" + + alerts_dir = "alerts" + default_datasource_uid = module.datasource.datasource_uid + default_receiver = module.google-chat-contact-point.contact_name + default_folder_uid = module.alert-folder.folder_uid + default_interval_seconds = 60 + disable_provenance = true +} +``` +With this configuration you need to place your notification templates into the `templates` folder and your alert definitions in YAML format into the `alerts` folder, in the same directory where `main.tf` is located. +You can find examples for both in the [examples](./examples/) folder. 
+ +## Dashboard TODO diff --git a/grafana/examples/alerts/alerts.yaml b/grafana/examples/alerts/alerts.yaml new file mode 100644 index 0000000..95844dd --- /dev/null +++ b/grafana/examples/alerts/alerts.yaml @@ -0,0 +1,339 @@ +apiVersion: 1 # Example alert rule group; each Prometheus rule below pairs a query (refId A) with a threshold expression (refId C). +groups: + - orgId: 1 + name: "alerts" + interval: 1m + rules: + - uid: KubernetesPodCrashLooping # Fires for containers whose restart counter increased by more than 3 within 1h (see expr of query A). + title: Kubernetes Pod CrashLooping + condition: C # refId of the data entry that decides the alert state (the threshold expression below). + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + model: + editorMode: code + expr: increase(kube_pod_container_status_restarts_total{}[1h]) > 3 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C # Threshold on A ("gt 0"): query A already filters with "> 3", so any series it returns satisfies the condition. + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: OK + execErrState: Error + for: 5m # How long the condition must hold before the alert fires (per Grafana's alert rule semantics). + annotations: + description: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too frequently + summary: Pod {{ $labels.pod }} is crash looping + isPaused: false + + - uid: KubernetesPodPending # Fires for pods reported in phase "Pending" (expr of query A), held for 10m via "for". + title: Kubernetes Pod Pending + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + model: + editorMode: code + expr: kube_pod_status_phase{phase="Pending"} > 0 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: OK + execErrState: Error + for: 10m + isPaused: false + 
annotations: + description: Pod {{ $labels.pod }} is in Pending state + summary: Pod {{ $labels.pod }} is pending + + - uid: ContainerCPUUsageHigh # Fires when a container's 5m CPU usage rate is above 0.9. NOTE(review): 0.9 is CPU-seconds per second (about 0.9 core), not 90% of the container's CPU limit as the description suggests — confirm intent. + title: Container CPU Usage High + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + model: + editorMode: code + expr: rate(container_cpu_usage_seconds_total{image!=""}[5m]) > 0.9 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: OK + execErrState: Error + for: 5m + isPaused: false + annotations: + summary: "High CPU usage for container {{ $labels.container }}" + description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% CPU for 5 minutes." 
+ + - uid: ContainerMemoryUsageHigh # Fires when memory usage divided by the memory limit is above 0.9. NOTE(review): containers without a memory limit may report a limit of 0, making the ratio +Inf — confirm whether such series should be filtered out. + title: Container Memory Usage High + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + model: + editorMode: code + expr: container_memory_usage_bytes{} + / + container_spec_memory_limit_bytes{} + > 0.9 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C # Threshold on A ("gt 0"): query A already filters with "> 0.9", so any series it returns satisfies the condition. + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: OK + execErrState: Error + for: 5m + isPaused: false + annotations: + summary: "High memory usage for container {{ $labels.container }}" + description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% of its memory limit." + + - uid: PVCStorageAlmostFull # Fires when a volume's used bytes exceed 90% of its capacity (kubelet volume stats). + title: PVC Storage Almost Full + condition: C + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + model: + editorMode: code + expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.9 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: OK + execErrState: Error + for: 5m + isPaused: false + annotations: + summary: "PVC almost full" + description: "PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is >90% full." 
+ +# Loki Alerts: counts log lines of job "controlling" containing "error while querying DB" over 10m, reduces the result to its last value and fires when that value is above 0. + - uid: loki-alert + title: Loki-Alert + condition: C + data: + - refId: A + queryType: range + relativeTimeRange: + from: 600 + to: 0 + #datasourceUid: berm4y85oiwaoc + #model: NOTE(review): with this key commented out, the query fields below (editorMode, expr, ...) are not nested under "model" as in the other rules, and no datasourceUid is set for this query — confirm the alerts module handles this. + # datasource: + # type: loki + # uid: berm4y85oiwaoc + editorMode: code + expr: count_over_time({job="controlling"} |= "error while querying DB" [10m]) + hide: false + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + queryType: range + refId: A + - refId: reducer + queryType: expression + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: [] + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: reducer + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: reducer + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + noDataState: OK + execErrState: Error + for: 5m + isPaused: false diff --git a/grafana/examples/templates/google-chat-body-template.tmpl b/grafana/examples/templates/google-chat-body-template.tmpl new file mode 100644 index 0000000..9c02a8f --- /dev/null +++ b/grafana/examples/templates/google-chat-body-template.tmpl @@ -0,0 +1,26 @@ +{{ define "google-chat-body-template" -}}{{/* Message body: one entry per alert with its name and status, the pod/container/stage/cluster labels when present, and the summary/description annotations ("n/a" when missing). */}} +{{- $alerts := .Alerts }} +{{- if not $alerts }}{{ $alerts = . }}{{ end }} + +{{- range $alerts }} +🚨 *{{ index .Labels "alertname" }}* ({{ .Status }}) + +{{- with index .Labels "pod" }} +🛢️ Pod: `{{ . }}` +{{- end }} +{{- with index .Labels "container" }} +📦 Container: `{{ . }}` +{{- end }} +{{- with index .Labels "stage" }} +🧪 Stage: `{{ . 
}}` +{{- end }} +{{- with index .Labels "cluster" }} +🌐 K8s cluster: `{{ . }}` +{{- end }} + +📝 {{ with index .Annotations "summary" }}{{ . }}{{ else }}n/a{{ end }} +📄 {{ with index .Annotations "description" }}{{ . }}{{ else }}n/a{{ end }} + +{{ end -}} +{{ end }} + diff --git a/grafana/examples/templates/google-chat-title-template.tmpl b/grafana/examples/templates/google-chat-title-template.tmpl new file mode 100644 index 0000000..5bbe8bc --- /dev/null +++ b/grafana/examples/templates/google-chat-title-template.tmpl @@ -0,0 +1,36 @@ +{{ define "google-chat-title-template" -}}{{/* Message title: a status prefix followed by the distinct alert names, comma-separated. */}} +{{- if eq .Status "firing" -}} +🔥 Firing: +{{- else if eq .Status "resolved" -}} +✅ Resolved: +{{- else -}} +⚠️ Alert Status: {{ .Status }}: +{{- end }} + +{{- $alerts := .Alerts }} +{{- if not $alerts }}{{ $alerts = . }}{{ end }} + +{{- $a1 := "" }}{{ $a2 := "" }}{{ $a3 := "" }}{{ $a4 := "" }}{{ $a5 := "" }}{{/* $a1..$a10 remember the alert names already printed so each is listed once; names beyond the tenth distinct one are not remembered and may repeat. */}} +{{- $a6 := "" }}{{ $a7 := "" }}{{ $a8 := "" }}{{ $a9 := "" }}{{ $a10 := "" }} +{{- $sep := " " }} + +{{- range $alerts }} + {{- $name := index .Labels "alertname" }} + {{- if and (ne $name $a1) (ne $name $a2) (ne $name $a3) (ne $name $a4) (ne $name $a5) + (ne $name $a6) (ne $name $a7) (ne $name $a8) (ne $name $a9) (ne $name $a10) }} + {{- printf "%s%s" $sep $name }} + {{- $sep = ", " }} + {{- if eq $a1 "" }}{{ $a1 = $name }} + {{- else if eq $a2 "" }}{{ $a2 = $name }} + {{- else if eq $a3 "" }}{{ $a3 = $name }} + {{- else if eq $a4 "" }}{{ $a4 = $name }} + {{- else if eq $a5 "" }}{{ $a5 = $name }} + {{- else if eq $a6 "" }}{{ $a6 = $name }} + {{- else if eq $a7 "" }}{{ $a7 = $name }} + {{- else if eq $a8 "" }}{{ $a8 = $name }} + {{- else if eq $a9 "" }}{{ $a9 = $name }} + {{- else if eq $a10 "" }}{{ $a10 = $name }} + {{- end }} + {{- end }} +{{- end }} +{{- end }}