Readme
This commit is contained in:
parent
8d05d7e5ad
commit
f5ca31d62c
4 changed files with 470 additions and 1 deletions
|
|
@ -1,4 +1,72 @@
|
|||
# Modules for Grafana alerts and dashboards
|
||||
|
||||
## How to use
|
||||
## Alerting
|
||||
|
||||
Please check the documentation about Grafana alerting [here](https://itdoc.schwarz/x/X11nf) and the [official documentation](https://grafana.com/docs/grafana/latest/alerting/) for a deeper look.
|
||||
|
||||
The Terraform modules are separated per resource type; check the README in each module directory for specific examples.
|
||||
Below is an example for alerts using the „**Prometheus/Thanos**“ datasource and sending notifications to „**Google Chat**“.
|
||||
|
||||
|
||||
|
||||
```hcl title="main.tf"
|
||||
# Datasource
|
||||
module "datasource" {
|
||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/datasource?ref=main"
|
||||
datasource_name = "Thanos - Myteam"
|
||||
datasource_url = var.datasource_url
|
||||
datasource_username = var.datasource_username
|
||||
datasource_password = var.datasource_password
|
||||
}
|
||||
|
||||
# Alert Receiver / Contact Point
|
||||
module "google-chat-contact-point" {
|
||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/contact-point-gchat?ref=main"
|
||||
google-chat-url = var.google_chat_url
|
||||
contact-point-name = "gchat"
|
||||
}
|
||||
|
||||
|
||||
# Alert Rule Folders
|
||||
module "alert-folder" {
|
||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alert-folder?ref=main"
|
||||
alert-folder = "Alerts"
|
||||
}
|
||||
|
||||
|
||||
# Template for messages
|
||||
module "message-templates" {
|
||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/message-template?ref=main"
|
||||
templates_dir = "templates"
|
||||
disable_provenance = true
|
||||
}
|
||||
|
||||
# Notification policies
|
||||
module "notification-policy" {
|
||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/notification-policy?ref=main"
|
||||
|
||||
default_contact_point_uid = module.google-chat-contact-point.contact_name
|
||||
group_by = ["alertname"]
|
||||
|
||||
folder_policies = {
|
||||
"Alerts" = module.google-chat-contact-point.contact_name
|
||||
}
|
||||
}
|
||||
|
||||
# Alert definition
|
||||
module "alerting" {
|
||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alerts?ref=main"
|
||||
|
||||
alerts_dir = "alerts"
|
||||
default_datasource_uid = module.datasource.datasource_uid
|
||||
default_receiver = module.google-chat-contact-point.contact_name
|
||||
default_folder_uid = module.alert-folder.folder_uid
|
||||
default_interval_seconds = 60
|
||||
disable_provenance = true
|
||||
}
|
||||
```
|
||||
With this configuration you need to place your notification templates into the `templates` folder and your alert definitions in YAML format into the `alerts` folder, both in the same directory where `main.tf` is located.
|
||||
You can find examples for both in the [examples](./examples/) folder.
|
||||
|
||||
## Dashboard
|
||||
TODO
|
||||
|
|
|
|||
339
grafana/examples/alerts/alerts.yaml
Normal file
339
grafana/examples/alerts/alerts.yaml
Normal file
|
|
@ -0,0 +1,339 @@
|
|||
apiVersion: 1
|
||||
groups:
|
||||
- orgId: 1
|
||||
name: "alerts"
|
||||
interval: 1m
|
||||
rules:
|
||||
- uid: KubernetesPodCrashLooping
|
||||
title: Kubernetes Pod CrashLooping
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
model:
|
||||
editorMode: code
|
||||
expr: increase(kube_pod_container_status_restarts_total{}[1h]) > 3
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
annotations:
|
||||
description: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too frequently
|
||||
summary: Pod {{ $labels.pod }} is crash looping
|
||||
isPaused: false
|
||||
|
||||
- uid: KubernetesPodPending
|
||||
title: Kubernetes Pod Pending
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
model:
|
||||
editorMode: code
|
||||
expr: kube_pod_status_phase{phase="Pending"} > 0
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
for: 10m
|
||||
isPaused: false
|
||||
annotations:
|
||||
description: Pod {{ $labels.pod }} is in Pending state
|
||||
summary: Pod {{ $labels.pod }} is pending
|
||||
|
||||
- uid: ContainerCPUUsageHigh
|
||||
title: Container CPU Usage High
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
model:
|
||||
editorMode: code
|
||||
expr: rate(container_cpu_usage_seconds_total{image!=""}[5m]) > 0.9
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
isPaused: false
|
||||
annotations:
|
||||
summary: "High CPU usage for container {{ $labels.container }}"
|
||||
description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% CPU for 5 minutes."
|
||||
|
||||
- uid: ContainerMemoryUsageHigh
|
||||
title: Container Memory Usage High
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
model:
|
||||
editorMode: code
|
||||
expr: container_memory_usage_bytes{}
|
||||
/
|
||||
container_spec_memory_limit_bytes{}
|
||||
> 0.9
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
isPaused: false
|
||||
annotations:
|
||||
summary: "High memory usage for container {{ $labels.container }}"
|
||||
description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% of its memory limit."
|
||||
|
||||
- uid: PVCStorageAlmostFull
|
||||
title: PVC Storage Almost Full
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
model:
|
||||
editorMode: code
|
||||
expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.9
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: false
|
||||
refId: A
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
isPaused: false
|
||||
annotations:
|
||||
summary: "PVC almost full"
|
||||
description: "PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is >90% full."
|
||||
|
||||
# Loki Alerts
|
||||
- uid: loki-alert
|
||||
title: Loki-Alert
|
||||
condition: C
|
||||
data:
|
||||
- refId: A
|
||||
queryType: range
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
#datasourceUid: berm4y85oiwaoc
|
||||
#model:
|
||||
# datasource:
|
||||
# type: loki
|
||||
# uid: berm4y85oiwaoc
|
||||
editorMode: code
|
||||
expr: count_over_time({job="controlling"} |= "error while querying DB" [10m])
|
||||
hide: false
|
||||
instant: true
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
queryType: range
|
||||
refId: A
|
||||
- refId: reducer
|
||||
queryType: expression
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params: []
|
||||
reducer:
|
||||
params: []
|
||||
type: avg
|
||||
type: query
|
||||
datasource:
|
||||
name: Expression
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
refId: reducer
|
||||
type: reduce
|
||||
- refId: C
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- C
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: reducer
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: C
|
||||
type: threshold
|
||||
noDataState: OK
|
||||
execErrState: Error
|
||||
for: 5m
|
||||
isPaused: false
|
||||
26
grafana/examples/templates/google-chat-body-template.tmpl
Normal file
26
grafana/examples/templates/google-chat-body-template.tmpl
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
{{ define "google-chat-body-template" -}}
|
||||
{{- $alerts := .Alerts }}
|
||||
{{- if not $alerts }}{{ $alerts = . }}{{ end }}
|
||||
|
||||
{{- range $alerts }}
|
||||
🚨 *{{ index .Labels "alertname" }}* ({{ .Status }})
|
||||
|
||||
{{- with index .Labels "pod" }}
|
||||
🛢️ Pod: `{{ . }}`
|
||||
{{- end }}
|
||||
{{- with index .Labels "container" }}
|
||||
📦 Container: `{{ . }}`
|
||||
{{- end }}
|
||||
{{- with index .Labels "stage" }}
|
||||
🧪 Stage: `{{ . }}`
|
||||
{{- end }}
|
||||
{{- with index .Labels "cluster" }}
|
||||
🌐 K8s cluster: `{{ . }}`
|
||||
{{- end }}
|
||||
|
||||
📝 {{ with index .Annotations "summary" }}{{ . }}{{ else }}n/a{{ end }}
|
||||
📄 {{ with index .Annotations "description" }}{{ . }}{{ else }}n/a{{ end }}
|
||||
|
||||
{{ end -}}
|
||||
{{ end }}
|
||||
|
||||
36
grafana/examples/templates/google-chat-title-template.tmpl
Normal file
36
grafana/examples/templates/google-chat-title-template.tmpl
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
{{ define "google-chat-title-template" -}}
|
||||
{{- if eq .Status "firing" -}}
|
||||
🔥 Firing:
|
||||
{{- else if eq .Status "resolved" -}}
|
||||
✅ Resolved:
|
||||
{{- else -}}
|
||||
⚠️ Alert Status: {{ .Status }}:
|
||||
{{- end }}
|
||||
|
||||
{{- $alerts := .Alerts }}
|
||||
{{- if not $alerts }}{{ $alerts = . }}{{ end }}
|
||||
|
||||
{{- $a1 := "" }}{{ $a2 := "" }}{{ $a3 := "" }}{{ $a4 := "" }}{{ $a5 := "" }}
|
||||
{{- $a6 := "" }}{{ $a7 := "" }}{{ $a8 := "" }}{{ $a9 := "" }}{{ $a10 := "" }}
|
||||
{{- $sep := " " }}
|
||||
|
||||
{{- range $alerts }}
|
||||
{{- $name := index .Labels "alertname" }}
|
||||
{{- if and (ne $name $a1) (ne $name $a2) (ne $name $a3) (ne $name $a4) (ne $name $a5)
|
||||
(ne $name $a6) (ne $name $a7) (ne $name $a8) (ne $name $a9) (ne $name $a10) }}
|
||||
{{- printf "%s%s" $sep $name }}
|
||||
{{- $sep = ", " }}
|
||||
{{- if eq $a1 "" }}{{ $a1 = $name }}
|
||||
{{- else if eq $a2 "" }}{{ $a2 = $name }}
|
||||
{{- else if eq $a3 "" }}{{ $a3 = $name }}
|
||||
{{- else if eq $a4 "" }}{{ $a4 = $name }}
|
||||
{{- else if eq $a5 "" }}{{ $a5 = $name }}
|
||||
{{- else if eq $a6 "" }}{{ $a6 = $name }}
|
||||
{{- else if eq $a7 "" }}{{ $a7 = $name }}
|
||||
{{- else if eq $a8 "" }}{{ $a8 = $name }}
|
||||
{{- else if eq $a9 "" }}{{ $a9 = $name }}
|
||||
{{- else if eq $a10 "" }}{{ $a10 = $name }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
Loading…
Reference in a new issue