This commit is contained in:
Stanislav_Kopp 2025-07-16 08:50:43 +02:00
parent 8d05d7e5ad
commit f5ca31d62c
4 changed files with 470 additions and 1 deletions

View file

@ -1,4 +1,72 @@
# Modules for Grafana alerts and dashboards
## How to use
## Alerting
Please check the documentation about Grafana alerting [here](https://itdoc.schwarz/x/X11nf) and the [official documentation](https://grafana.com/docs/grafana/latest/alerting/) for a deeper look.
The Terraform modules are separated per resource type; check the README in each module directory for specific examples.
Below is an example for alerts using the „**Prometheus/Thanos**“ datasource and sending notifications to „**Google Chat**“.
```hcl title="main.tf"
# Datasource: named "Thanos - Myteam"; URL and credentials are supplied through Terraform variables.
module "datasource" {
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/datasource?ref=main"
datasource_name = "Thanos - Myteam"
datasource_url = var.datasource_url
datasource_username = var.datasource_username
datasource_password = var.datasource_password
}
# Alert receiver / contact point: Google Chat contact point named "gchat"; the chat URL comes from var.google_chat_url.
module "google-chat-contact-point" {
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/contact-point-gchat?ref=main"
google-chat-url = var.google_chat_url
contact-point-name = "gchat"
}
# Alert rule folder: the folder "Alerts"; its UID is referenced by the alerting module below.
module "alert-folder" {
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alert-folder?ref=main"
alert-folder = "Alerts"
}
# Message templates: notification templates are taken from the local "templates" directory.
module "message-templates" {
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/message-template?ref=main"
templates_dir = "templates"
disable_provenance = true
}
# Notification policies: the Google Chat contact point is both the default receiver and the
# receiver for the "Alerts" folder; notifications are grouped by the "alertname" label.
module "notification-policy" {
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/notification-policy?ref=main"
default_contact_point_uid = module.google-chat-contact-point.contact_name
group_by = ["alertname"]
folder_policies = {
"Alerts" = module.google-chat-contact-point.contact_name
}
}
# Alert definitions: rules are taken from the local "alerts" directory; the defaults for
# datasource, receiver and folder are wired to the modules declared above.
module "alerting" {
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alerts?ref=main"
alerts_dir = "alerts"
default_datasource_uid = module.datasource.datasource_uid
default_receiver = module.google-chat-contact-point.contact_name
default_folder_uid = module.alert-folder.folder_uid
default_interval_seconds = 60
disable_provenance = true
}
```
With this configuration, place your notification templates in the `templates` folder and your alert definitions (in YAML format) in the `alerts` folder, both in the same directory where `main.tf` is located.
You can find examples for both in the [examples](./examples/) folder.
## Dashboard
TODO

View file

@ -0,0 +1,339 @@
# Grafana alert-rule provisioning file containing a single rule group named "alerts".
apiVersion: 1
groups:
- orgId: 1
name: "alerts"
# Interval of the rule group (one minute).
interval: 1m
# Alert rule definitions follow, one list entry per rule.
rules:
# Alerts when a container's restart counter increased by more than 3 within
# one hour and the condition has been pending for 5 minutes.
- uid: KubernetesPodCrashLooping
  title: 'Kubernetes Pod CrashLooping'
  condition: C
  for: 5m
  noDataState: 'OK'
  execErrState: 'Error'
  isPaused: false
  annotations:
    summary: 'Pod {{ $labels.pod }} is crash looping'
    description: 'Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too frequently'
  data:
    # Query A: instant query; only series above the restart limit are returned.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'increase(kube_pod_container_status_restarts_total{}[1h]) > 3'
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # Expression C: threshold on A, true for every returned value greater than 0.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        intervalMs: 1000
        maxDataPoints: 43200
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
# Alerts when a pod is reported in the Pending phase and the condition has
# been pending for 10 minutes.
- uid: KubernetesPodPending
  title: 'Kubernetes Pod Pending'
  condition: C
  for: 10m
  noDataState: 'OK'
  execErrState: 'Error'
  isPaused: false
  annotations:
    summary: 'Pod {{ $labels.pod }} is pending'
    description: 'Pod {{ $labels.pod }} is in Pending state'
  data:
    # Query A: instant query; only pods whose Pending phase series is > 0 are returned.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'kube_pod_status_phase{phase="Pending"} > 0'
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # Expression C: threshold on A, true for every returned value greater than 0.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        intervalMs: 1000
        maxDataPoints: 43200
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
# Alerts when the 5-minute CPU usage rate of a container exceeds 0.9 and the
# condition has been pending for 5 minutes.
# NOTE(review): the expression compares against 0.9 CPU seconds per second,
# while the description speaks of ">90% CPU" - confirm which one is intended.
- uid: ContainerCPUUsageHigh
  title: 'Container CPU Usage High'
  condition: C
  for: 5m
  noDataState: 'OK'
  execErrState: 'Error'
  isPaused: false
  annotations:
    summary: 'High CPU usage for container {{ $labels.container }}'
    description: 'Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% CPU for 5 minutes.'
  data:
    # Query A: instant query; only series above the CPU limit are returned.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'rate(container_cpu_usage_seconds_total{image!=""}[5m]) > 0.9'
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # Expression C: threshold on A, true for every returned value greater than 0.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        intervalMs: 1000
        maxDataPoints: 43200
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
# Alerts when a container uses more than 90% of its memory limit and the
# condition has been pending for 5 minutes.
- uid: ContainerMemoryUsageHigh
  title: Container Memory Usage High
  condition: C
  data:
    # Query A: instant query returning usage/limit ratios above 0.9.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        editorMode: code
        # Containers without a memory limit report a limit of 0; dividing by
        # it yields +Inf, which would match "> 0.9" and fire for every
        # unlimited container. Keep only series with a real (> 0) limit.
        expr: >-
          container_memory_usage_bytes{}
          /
          (container_spec_memory_limit_bytes{} > 0)
          > 0.9
        instant: true
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: false
        refId: A
    # Expression C: threshold on A, true for every returned value greater than 0.
    - refId: C
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - C
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
  noDataState: OK
  execErrState: Error
  for: 5m
  isPaused: false
  annotations:
    summary: "High memory usage for container {{ $labels.container }}"
    description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% of its memory limit."
# Alerts when a volume's used bytes exceed 90% of its capacity and the
# condition has been pending for 5 minutes.
- uid: PVCStorageAlmostFull
  title: 'PVC Storage Almost Full'
  condition: C
  for: 5m
  noDataState: 'OK'
  execErrState: 'Error'
  isPaused: false
  annotations:
    summary: 'PVC almost full'
    description: 'PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is >90% full.'
  data:
    # Query A: instant query; only volumes above the fill ratio are returned.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.9'
        instant: true
        range: false
        intervalMs: 1000
        maxDataPoints: 43200
        legendFormat: __auto
    # Expression C: threshold on A, true for every returned value greater than 0.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        intervalMs: 1000
        maxDataPoints: 43200
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
# Loki alerts
# Alerts when log lines of job "controlling" contain "error while querying DB"
# within the last 10 minutes and the condition has been pending for 5 minutes.
- uid: loki-alert
  title: Loki-Alert
  condition: C
  data:
    # Query A: LogQL count of matching log lines.
    - refId: A
      queryType: range
      relativeTimeRange:
        from: 600
        to: 0
      # Optional explicit datasource; uncomment and adapt the UID to target a
      # specific Loki datasource.
      # NOTE(review): without it the consuming module presumably falls back to
      # its default datasource - verify that this is a Loki datasource.
      # datasourceUid: berm4y85oiwaoc
      # "model:" must stay active: the query fields below belong to it. With
      # the key commented out they are orphaned and refId/queryType end up
      # duplicated on the query entry itself.
      model:
        # datasource:
        #   type: loki
        #   uid: berm4y85oiwaoc
        editorMode: code
        expr: 'count_over_time({job="controlling"} |= "error while querying DB" [10m])'
        hide: false
        # NOTE(review): "instant: true" contradicts "queryType: range" - confirm
        # which query type is intended.
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
        queryType: range
        refId: A
    # Expression "reducer": reduces A to its last value.
    - refId: reducer
      queryType: expression
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params: []
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: A
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: reducer
        type: reduce
    # Expression C: threshold on the reduced value, true when greater than 0.
    - refId: C
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - C
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: reducer
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
  noDataState: OK
  execErrState: Error
  for: 5m
  isPaused: false
  # Annotations added for consistency with the other rules in this group, so
  # notifications carry a summary and description instead of "n/a".
  annotations:
    summary: 'Errors while querying DB in job controlling'
    description: 'Log lines containing "error while querying DB" were found for job controlling within the last 10 minutes.'

View file

@ -0,0 +1,26 @@
{{ define "google-chat-body-template" -}}
{{- /*
  Message body for Google Chat notifications.
  Iterates over .Alerts (falling back to the context itself when .Alerts is
  empty) and renders one section per alert: alert name and status, the
  optional "pod", "container", "stage" and "cluster" labels when present,
  and the "summary" and "description" annotations ("n/a" when missing).
*/ -}}
{{- $alerts := .Alerts }}
{{- if not $alerts }}{{ $alerts = . }}{{ end }}
{{- range $alerts }}
🚨 *{{ index .Labels "alertname" }}* ({{ .Status }})
{{- with index .Labels "pod" }}
🛢️ Pod: `{{ . }}`
{{- end }}
{{- with index .Labels "container" }}
📦 Container: `{{ . }}`
{{- end }}
{{- with index .Labels "stage" }}
🧪 Stage: `{{ . }}`
{{- end }}
{{- with index .Labels "cluster" }}
🌐 K8s cluster: `{{ . }}`
{{- end }}
📝 {{ with index .Annotations "summary" }}{{ . }}{{ else }}n/a{{ end }}
📄 {{ with index .Annotations "description" }}{{ . }}{{ else }}n/a{{ end }}
{{ end -}}
{{ end }}

View file

@ -0,0 +1,36 @@
{{ define "google-chat-title-template" -}}
{{- /*
  Message title for Google Chat notifications.
  Prints a status prefix derived from .Status ("firing", "resolved" or any
  other value), followed by a comma-separated list of the distinct
  "alertname" labels of the alerts.
*/ -}}
{{- if eq .Status "firing" -}}
🔥 Firing:
{{- else if eq .Status "resolved" -}}
✅ Resolved:
{{- else -}}
⚠️ Alert Status: {{ .Status }}:
{{- end }}
{{- $alerts := .Alerts }}
{{- if not $alerts }}{{ $alerts = . }}{{ end }}
{{- /*
  $a1..$a10 remember the alert names already printed so duplicates are
  skipped. Only ten names can be remembered: once all slots are filled,
  further new names are still printed but not stored, so they may repeat.
*/ -}}
{{- $a1 := "" }}{{ $a2 := "" }}{{ $a3 := "" }}{{ $a4 := "" }}{{ $a5 := "" }}
{{- $a6 := "" }}{{ $a7 := "" }}{{ $a8 := "" }}{{ $a9 := "" }}{{ $a10 := "" }}
{{- $sep := " " }}
{{- range $alerts }}
{{- $name := index .Labels "alertname" }}
{{- if and (ne $name $a1) (ne $name $a2) (ne $name $a3) (ne $name $a4) (ne $name $a5)
(ne $name $a6) (ne $name $a7) (ne $name $a8) (ne $name $a9) (ne $name $a10) }}
{{- printf "%s%s" $sep $name }}
{{- $sep = ", " }}
{{- if eq $a1 "" }}{{ $a1 = $name }}
{{- else if eq $a2 "" }}{{ $a2 = $name }}
{{- else if eq $a3 "" }}{{ $a3 = $name }}
{{- else if eq $a4 "" }}{{ $a4 = $name }}
{{- else if eq $a5 "" }}{{ $a5 = $name }}
{{- else if eq $a6 "" }}{{ $a6 = $name }}
{{- else if eq $a7 "" }}{{ $a7 = $name }}
{{- else if eq $a8 "" }}{{ $a8 = $name }}
{{- else if eq $a9 "" }}{{ $a9 = $name }}
{{- else if eq $a10 "" }}{{ $a10 = $name }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}