terraform-modules/grafana/examples/alerts/alerts.yaml
2025-07-16 08:50:43 +02:00

339 lines
9.8 KiB
YAML

# Grafana alerting provisioning document (schema version 1).
apiVersion: 1
# A single rule group named "alerts", owned by org 1, with a 1m interval.
groups:
- orgId: 1
name: "alerts"
interval: 1m
# The alert rules of this group follow, one list item per rule.
rules:
# Fires when a container's restart counter grew by more than 3 within one hour.
- uid: KubernetesPodCrashLooping
  title: Kubernetes Pod CrashLooping
  condition: C
  for: 5m
  isPaused: false
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'Pod {{ $labels.pod }} is crash looping'
    description: 'Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too frequently'
  data:
    # A: instant query; the "> 3" comparison is part of the expression itself.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'increase(kube_pod_container_status_restarts_total{}[1h]) > 3'
        instant: true
        range: false
        legendFormat: __auto
        intervalMs: 1000
        maxDataPoints: 43200
    # C: threshold expression on A; this is the rule condition ("gt 0").
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
# Fires when a pod reports the "Pending" phase; must hold for 10m.
- uid: KubernetesPodPending
  title: Kubernetes Pod Pending
  condition: C
  for: 10m
  isPaused: false
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'Pod {{ $labels.pod }} is pending'
    description: 'Pod {{ $labels.pod }} is in Pending state'
  data:
    # A: instant query; the "> 0" comparison is part of the expression itself.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'kube_pod_status_phase{phase="Pending"} > 0'
        instant: true
        range: false
        legendFormat: __auto
        intervalMs: 1000
        maxDataPoints: 43200
    # C: threshold expression on A; this is the rule condition ("gt 0").
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
# Fires when a container's 5m CPU usage rate exceeds 0.9; must hold for 5m.
- uid: ContainerCPUUsageHigh
  title: Container CPU Usage High
  condition: C
  for: 5m
  isPaused: false
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'High CPU usage for container {{ $labels.container }}'
    description: 'Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% CPU for 5 minutes.'
  data:
    # A: instant query; the "> 0.9" comparison is part of the expression itself.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'rate(container_cpu_usage_seconds_total{image!=""}[5m]) > 0.9'
        instant: true
        range: false
        legendFormat: __auto
        intervalMs: 1000
        maxDataPoints: 43200
    # C: threshold expression on A; this is the rule condition ("gt 0").
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
# Fires when a container uses more than 90% of its memory limit for 5m.
- uid: ContainerMemoryUsageHigh
  title: Container Memory Usage High
  condition: C
  for: 5m
  isPaused: false
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'High memory usage for container {{ $labels.container }}'
    description: 'Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% of its memory limit.'
  data:
    # A: instant query computing usage / limit and keeping ratios above 0.9.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        # The "> 0" filter drops series whose limit is 0 (containers without a
        # memory limit). Without it the division yields +Inf, which always
        # passes "> 0.9" and raises a false alert for every unlimited container.
        # Folded block scalar: the lines below are joined into one expression.
        expr: >-
          container_memory_usage_bytes{}
          /
          (container_spec_memory_limit_bytes{} > 0)
          > 0.9
        instant: true
        range: false
        legendFormat: __auto
        intervalMs: 1000
        maxDataPoints: 43200
    # C: threshold expression on A; this is the rule condition ("gt 0").
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
# Fires when a volume's used/capacity ratio exceeds 0.9; must hold for 5m.
- uid: PVCStorageAlmostFull
  title: PVC Storage Almost Full
  condition: C
  for: 5m
  isPaused: false
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'PVC almost full'
    description: 'PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is >90% full.'
  data:
    # A: instant query; the "> 0.9" comparison is part of the expression itself.
    - refId: A
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        editorMode: code
        expr: 'kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.9'
        instant: true
        range: false
        legendFormat: __auto
        intervalMs: 1000
        maxDataPoints: 43200
    # C: threshold expression on A; this is the rule condition ("gt 0").
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: A
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
# Loki Alerts
# Fires when log lines of job "controlling" contain "error while querying DB".
# Pipeline: A (LogQL count) -> reducer (last value) -> C (threshold "gt 0").
- uid: loki-alert
  title: Loki-Alert
  condition: C
  for: 5m
  isPaused: false
  noDataState: OK
  execErrState: Error
  annotations:
    summary: 'Database query errors in the logs of job "controlling"'
    description: 'Log lines containing "error while querying DB" were found for job "controlling" within the last 10 minutes.'
  data:
    # A: LogQL query counting matching log lines over a 10m window.
    - refId: A
      queryType: range
      relativeTimeRange:
        from: 600
        to: 0
      # NOTE(review): the datasource binding was commented out in the original
      # and is kept that way; presumably it is supplied elsewhere - confirm.
      # datasourceUid: berm4y85oiwaoc
      # The query fields must sit under "model", as in every other rule here;
      # with "model:" commented out they had no parent and "queryType"
      # collided with the item-level key of the same name.
      model:
        # datasource:
        #   type: loki
        #   uid: berm4y85oiwaoc
        refId: A
        queryType: range
        editorMode: code
        expr: 'count_over_time({job="controlling"} |= "error while querying DB" [10m])'
        hide: false
        instant: true
        intervalMs: 1000
        maxDataPoints: 43200
    # reducer: collapses the series returned by A to its last value.
    - refId: reducer
      queryType: expression
      datasourceUid: __expr__
      model:
        refId: reducer
        type: reduce
        reducer: last
        expression: A
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0, 0]
            operator:
              type: and
            query:
              params: []
            reducer:
              type: avg
              params: []
        intervalMs: 1000
        maxDataPoints: 43200
    # C: threshold expression on the reduced value; this is the rule condition.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        expression: reducer
        datasource:
          type: __expr__
          uid: __expr__
        conditions:
          - type: query
            evaluator:
              type: gt
              params: [0]
            operator:
              type: and
            query:
              params: [C]
            reducer:
              type: last
              params: []
        intervalMs: 1000
        maxDataPoints: 43200