Readme
This commit is contained in:
parent
8d05d7e5ad
commit
f5ca31d62c
4 changed files with 470 additions and 1 deletions
|
|
@ -1,4 +1,72 @@
|
||||||
# Modules for Grafana alerts and dashboards
|
# Modules for Grafana alerts and dashboards
|
||||||
|
|
||||||
## How to use
|
## Alerting
|
||||||
|
|
||||||
|
Please check the documentation about Grafana alerting [here](https://itdoc.schwarz/x/X11nf) and the [official documentation](https://grafana.com/docs/grafana/latest/alerting/) for a deeper look.
|
||||||
|
|
||||||
|
The Terraform modules are separated per resource type; check the README in each module directory for specific examples.
|
||||||
|
Below is an example for alerts using a "**Prometheus/Thanos**" datasource and sending notifications to "**Google Chat**".
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
```hcl title="main.tf"
|
||||||
|
# Datasource
|
||||||
|
module "datasource" {
|
||||||
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/datasource?ref=main"
|
||||||
|
datasource_name = "Thanos - Myteam"
|
||||||
|
datasource_url = var.datasource_url
|
||||||
|
datasource_username = var.datasource_username
|
||||||
|
datasource_password = var.datasource_password
|
||||||
|
}
|
||||||
|
|
||||||
|
# Alert Receiver / Contact Point
|
||||||
|
module "google-chat-contact-point" {
|
||||||
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/contact-point-gchat?ref=main"
|
||||||
|
google-chat-url = var.google_chat_url
|
||||||
|
contact-point-name = "gchat"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Alert Rule Folders
|
||||||
|
module "alert-folder" {
|
||||||
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alert-folder?ref=main"
|
||||||
|
alert-folder = "Alerts"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Template for messages
|
||||||
|
module "message-templates" {
|
||||||
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/message-template?ref=main"
|
||||||
|
templates_dir = "templates"
|
||||||
|
disable_provenance = true
|
||||||
|
}
|
||||||
|
|
||||||
|
# Notification policies
|
||||||
|
module "notification-policy" {
|
||||||
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/notification-policy?ref=main"
|
||||||
|
|
||||||
|
default_contact_point_uid = module.google-chat-contact-point.contact_name
|
||||||
|
group_by = ["alertname"]
|
||||||
|
|
||||||
|
folder_policies = {
|
||||||
|
"Alerts" = module.google-chat-contact-point.contact_name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Alert definition
|
||||||
|
module "alerting" {
|
||||||
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alerts?ref=main"
|
||||||
|
|
||||||
|
alerts_dir = "alerts"
|
||||||
|
default_datasource_uid = module.datasource.datasource_uid
|
||||||
|
default_receiver = module.google-chat-contact-point.contact_name
|
||||||
|
default_folder_uid = module.alert-folder.folder_uid
|
||||||
|
default_interval_seconds = 60
|
||||||
|
disable_provenance = true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
With this configuration you need to place your notification templates into the `templates` folder and your alert definitions (in YAML format) into the `alerts` folder, both in the same directory where `main.tf` is located.
|
||||||
|
You can find examples of both in the [examples](./examples/) folder.
|
||||||
|
|
||||||
|
## Dashboard
|
||||||
TODO
|
TODO
|
||||||
|
|
|
||||||
339
grafana/examples/alerts/alerts.yaml
Normal file
339
grafana/examples/alerts/alerts.yaml
Normal file
|
|
@ -0,0 +1,339 @@
|
||||||
|
apiVersion: 1
|
||||||
|
groups:
|
||||||
|
- orgId: 1
|
||||||
|
name: "alerts"
|
||||||
|
interval: 1m
|
||||||
|
rules:
|
||||||
|
- uid: KubernetesPodCrashLooping
|
||||||
|
title: Kubernetes Pod CrashLooping
|
||||||
|
condition: C
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
model:
|
||||||
|
editorMode: code
|
||||||
|
expr: increase(kube_pod_container_status_restarts_total{}[1h]) > 3
|
||||||
|
instant: true
|
||||||
|
intervalMs: 1000
|
||||||
|
legendFormat: __auto
|
||||||
|
maxDataPoints: 43200
|
||||||
|
range: false
|
||||||
|
refId: A
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 0
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params:
|
||||||
|
- C
|
||||||
|
reducer:
|
||||||
|
params: []
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
refId: C
|
||||||
|
type: threshold
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
for: 5m
|
||||||
|
annotations:
|
||||||
|
description: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too frequently
|
||||||
|
summary: Pod {{ $labels.pod }} is crash looping
|
||||||
|
isPaused: false
|
||||||
|
|
||||||
|
- uid: KubernetesPodPending
|
||||||
|
title: Kubernetes Pod Pending
|
||||||
|
condition: C
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
model:
|
||||||
|
editorMode: code
|
||||||
|
expr: kube_pod_status_phase{phase="Pending"} > 0
|
||||||
|
instant: true
|
||||||
|
intervalMs: 1000
|
||||||
|
legendFormat: __auto
|
||||||
|
maxDataPoints: 43200
|
||||||
|
range: false
|
||||||
|
refId: A
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 0
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params:
|
||||||
|
- C
|
||||||
|
reducer:
|
||||||
|
params: []
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
refId: C
|
||||||
|
type: threshold
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
for: 10m
|
||||||
|
isPaused: false
|
||||||
|
annotations:
|
||||||
|
description: Pod {{ $labels.pod }} is in Pending state
|
||||||
|
summary: Pod {{ $labels.pod }} is pending
|
||||||
|
|
||||||
|
- uid: ContainerCPUUsageHigh
|
||||||
|
title: Container CPU Usage High
|
||||||
|
condition: C
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
model:
|
||||||
|
editorMode: code
|
||||||
|
expr: rate(container_cpu_usage_seconds_total{image!=""}[5m]) > 0.9
|
||||||
|
instant: true
|
||||||
|
intervalMs: 1000
|
||||||
|
legendFormat: __auto
|
||||||
|
maxDataPoints: 43200
|
||||||
|
range: false
|
||||||
|
refId: A
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 0
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params:
|
||||||
|
- C
|
||||||
|
reducer:
|
||||||
|
params: []
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
refId: C
|
||||||
|
type: threshold
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
for: 5m
|
||||||
|
isPaused: false
|
||||||
|
annotations:
|
||||||
|
summary: "High CPU usage for container {{ $labels.container }}"
|
||||||
|
description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% CPU for 5 minutes."
|
||||||
|
|
||||||
|
- uid: ContainerMemoryUsageHigh
|
||||||
|
title: Container Memory Usage High
|
||||||
|
condition: C
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
model:
|
||||||
|
editorMode: code
|
||||||
|
expr: container_memory_usage_bytes{}
|
||||||
|
/
|
||||||
|
container_spec_memory_limit_bytes{}
|
||||||
|
> 0.9
|
||||||
|
instant: true
|
||||||
|
intervalMs: 1000
|
||||||
|
legendFormat: __auto
|
||||||
|
maxDataPoints: 43200
|
||||||
|
range: false
|
||||||
|
refId: A
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 0
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params:
|
||||||
|
- C
|
||||||
|
reducer:
|
||||||
|
params: []
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
refId: C
|
||||||
|
type: threshold
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
for: 5m
|
||||||
|
isPaused: false
|
||||||
|
annotations:
|
||||||
|
summary: "High memory usage for container {{ $labels.container }}"
|
||||||
|
description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% of its memory limit."
|
||||||
|
|
||||||
|
- uid: PVCStorageAlmostFull
|
||||||
|
title: PVC Storage Almost Full
|
||||||
|
condition: C
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
model:
|
||||||
|
editorMode: code
|
||||||
|
expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.9
|
||||||
|
instant: true
|
||||||
|
intervalMs: 1000
|
||||||
|
legendFormat: __auto
|
||||||
|
maxDataPoints: 43200
|
||||||
|
range: false
|
||||||
|
refId: A
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 0
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params:
|
||||||
|
- C
|
||||||
|
reducer:
|
||||||
|
params: []
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
refId: C
|
||||||
|
type: threshold
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
for: 5m
|
||||||
|
isPaused: false
|
||||||
|
annotations:
|
||||||
|
summary: "PVC almost full"
|
||||||
|
description: "PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is >90% full."
|
||||||
|
|
||||||
|
# Loki Alerts
|
||||||
|
- uid: loki-alert
|
||||||
|
title: Loki-Alert
|
||||||
|
condition: C
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
queryType: range
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
#datasourceUid: berm4y85oiwaoc
|
||||||
|
#model:
|
||||||
|
# datasource:
|
||||||
|
# type: loki
|
||||||
|
# uid: berm4y85oiwaoc
|
||||||
|
editorMode: code
|
||||||
|
expr: count_over_time({job="controlling"} |= "error while querying DB" [10m])
|
||||||
|
hide: false
|
||||||
|
instant: true
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
queryType: range
|
||||||
|
refId: A
|
||||||
|
- refId: reducer
|
||||||
|
queryType: expression
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 0
|
||||||
|
- 0
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params: []
|
||||||
|
reducer:
|
||||||
|
params: []
|
||||||
|
type: avg
|
||||||
|
type: query
|
||||||
|
datasource:
|
||||||
|
name: Expression
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
reducer: last
|
||||||
|
refId: reducer
|
||||||
|
type: reduce
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 0
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params:
|
||||||
|
- C
|
||||||
|
reducer:
|
||||||
|
params: []
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: reducer
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
refId: C
|
||||||
|
type: threshold
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
for: 5m
|
||||||
|
isPaused: false
|
||||||
26
grafana/examples/templates/google-chat-body-template.tmpl
Normal file
26
grafana/examples/templates/google-chat-body-template.tmpl
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
{{ define "google-chat-body-template" -}}
|
||||||
|
{{- $alerts := .Alerts }}
|
||||||
|
{{- if not $alerts }}{{ $alerts = . }}{{ end }}
|
||||||
|
|
||||||
|
{{- range $alerts }}
|
||||||
|
🚨 *{{ index .Labels "alertname" }}* ({{ .Status }})
|
||||||
|
|
||||||
|
{{- with index .Labels "pod" }}
|
||||||
|
🛢️ Pod: `{{ . }}`
|
||||||
|
{{- end }}
|
||||||
|
{{- with index .Labels "container" }}
|
||||||
|
📦 Container: `{{ . }}`
|
||||||
|
{{- end }}
|
||||||
|
{{- with index .Labels "stage" }}
|
||||||
|
🧪 Stage: `{{ . }}`
|
||||||
|
{{- end }}
|
||||||
|
{{- with index .Labels "cluster" }}
|
||||||
|
🌐 K8s cluster: `{{ . }}`
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
📝 {{ with index .Annotations "summary" }}{{ . }}{{ else }}n/a{{ end }}
|
||||||
|
📄 {{ with index .Annotations "description" }}{{ . }}{{ else }}n/a{{ end }}
|
||||||
|
|
||||||
|
{{ end -}}
|
||||||
|
{{ end }}
|
||||||
|
|
||||||
36
grafana/examples/templates/google-chat-title-template.tmpl
Normal file
36
grafana/examples/templates/google-chat-title-template.tmpl
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
{{ define "google-chat-title-template" -}}
|
||||||
|
{{- if eq .Status "firing" -}}
|
||||||
|
🔥 Firing:
|
||||||
|
{{- else if eq .Status "resolved" -}}
|
||||||
|
✅ Resolved:
|
||||||
|
{{- else -}}
|
||||||
|
⚠️ Alert Status: {{ .Status }}:
|
||||||
|
{{- end }}
|
||||||
|
|
||||||
|
{{- $alerts := .Alerts }}
|
||||||
|
{{- if not $alerts }}{{ $alerts = . }}{{ end }}
|
||||||
|
|
||||||
|
{{- $a1 := "" }}{{ $a2 := "" }}{{ $a3 := "" }}{{ $a4 := "" }}{{ $a5 := "" }}
|
||||||
|
{{- $a6 := "" }}{{ $a7 := "" }}{{ $a8 := "" }}{{ $a9 := "" }}{{ $a10 := "" }}
|
||||||
|
{{- $sep := " " }}
|
||||||
|
|
||||||
|
{{- range $alerts }}
|
||||||
|
{{- $name := index .Labels "alertname" }}
|
||||||
|
{{- if and (ne $name $a1) (ne $name $a2) (ne $name $a3) (ne $name $a4) (ne $name $a5)
|
||||||
|
(ne $name $a6) (ne $name $a7) (ne $name $a8) (ne $name $a9) (ne $name $a10) }}
|
||||||
|
{{- printf "%s%s" $sep $name }}
|
||||||
|
{{- $sep = ", " }}
|
||||||
|
{{- if eq $a1 "" }}{{ $a1 = $name }}
|
||||||
|
{{- else if eq $a2 "" }}{{ $a2 = $name }}
|
||||||
|
{{- else if eq $a3 "" }}{{ $a3 = $name }}
|
||||||
|
{{- else if eq $a4 "" }}{{ $a4 = $name }}
|
||||||
|
{{- else if eq $a5 "" }}{{ $a5 = $name }}
|
||||||
|
{{- else if eq $a6 "" }}{{ $a6 = $name }}
|
||||||
|
{{- else if eq $a7 "" }}{{ $a7 = $name }}
|
||||||
|
{{- else if eq $a8 "" }}{{ $a8 = $name }}
|
||||||
|
{{- else if eq $a9 "" }}{{ $a9 = $name }}
|
||||||
|
{{- else if eq $a10 "" }}{{ $a10 = $name }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
Loading…
Reference in a new issue