---
# Grafana alert-rule provisioning file.
#
# Defines one rule group ("alerts", evaluated every minute) holding five
# Prometheus-style Kubernetes rules and one Loki log rule. In every rule,
# query A fetches the data and expression C (a "greater than 0" threshold)
# is the alert condition.
#
# NOTE(review): the group has no `folder` key and the refId A queries have no
# `datasourceUid`. Grafana's file-provisioning format documents both as
# required - confirm whether they are injected elsewhere or must be added.
apiVersion: 1
groups:
  - orgId: 1
    name: "alerts"
    interval: 1m
    rules:
      # Fires when a container restarted more than 3 times within the last hour.
      - uid: KubernetesPodCrashLooping
        title: Kubernetes Pod CrashLooping
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              expr: 'increase(kube_pod_container_status_restarts_total{}[1h]) > 3'
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        annotations:
          description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too frequently"
          summary: "Pod {{ $labels.pod }} is crash looping"
        isPaused: false

      # Fires when a pod has been reported in phase "Pending" for 10 minutes.
      - uid: KubernetesPodPending
        title: Kubernetes Pod Pending
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              expr: 'kube_pod_status_phase{phase="Pending"} > 0'
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 10m
        isPaused: false
        annotations:
          description: "Pod {{ $labels.pod }} is in Pending state"
          summary: "Pod {{ $labels.pod }} is pending"

      # Fires when a container's 5-minute CPU rate stays above 0.9.
      # rate() over a CPU-seconds counter yields cores, so 0.9 is an absolute
      # threshold of 0.9 cores, not a fraction of the container's CPU limit.
      - uid: ContainerCPUUsageHigh
        title: Container CPU Usage High
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              expr: 'rate(container_cpu_usage_seconds_total{image!=""}[5m]) > 0.9'
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        isPaused: false
        annotations:
          summary: "High CPU usage for container {{ $labels.container }}"
          description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% CPU for 5 minutes."

      # Fires when a container uses more than 90% of its memory limit.
      - uid: ContainerMemoryUsageHigh
        title: Container Memory Usage High
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              # The `> 0` filter drops series whose limit is 0 (containers
              # without a memory limit). Without it the division yields +Inf,
              # which is > 0.9 and would fire for every unlimited container.
              expr: >-
                container_memory_usage_bytes{}
                /
                (container_spec_memory_limit_bytes{} > 0)
                > 0.9
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        isPaused: false
        annotations:
          summary: "High memory usage for container {{ $labels.container }}"
          description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% of its memory limit."

      # Fires when a persistent volume is more than 90% full.
      - uid: PVCStorageAlmostFull
        title: PVC Storage Almost Full
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              expr: 'kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.9'
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        isPaused: false
        annotations:
          summary: "PVC almost full"
          description: "PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is >90% full."

      # Loki Alerts
      # Counts log lines of job "controlling" containing
      # "error while querying DB" over 10 minutes; the "reducer" expression
      # takes the last value of A and C fires when it is greater than 0.
      - uid: loki-alert
        title: Loki-Alert
        condition: C
        data:
          - refId: A
            queryType: range
            relativeTimeRange:
              from: 600
              to: 0
            # NOTE(review): the data source reference was commented out in the
            # original and is left disabled here; the UID looks
            # environment-specific - confirm the correct value and re-enable.
            # datasourceUid: berm4y85oiwaoc
            # `model:` was commented out as well, which left the query fields
            # below without a parent key; it is restored so that expr,
            # queryType and refId belong to the query model again.
            model:
              # datasource:
              #   type: loki
              #   uid: berm4y85oiwaoc
              editorMode: code
              expr: 'count_over_time({job="controlling"} |= "error while querying DB" [10m])'
              hide: false
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              queryType: range
              refId: A
          - refId: reducer
            queryType: expression
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: reducer
              type: reduce
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: reducer
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        isPaused: false