---
# Grafana alert-rule provisioning file: a single rule group, "alerts",
# evaluated every minute.
#
# Every rule follows the same shape:
#   * query A runs the PromQL/LogQL expression; the comparison in the
#     expression itself filters which series are returned;
#   * expression C is a threshold (> 0) over the upstream refId and is the
#     rule's `condition`;
#   * `for` is how long the condition must hold before the alert fires.
#
# NOTE(review): none of the `refId: A` queries sets a `datasourceUid`
# (the Loki one is commented out). Set it to the target data source UID
# for each environment -- confirm before deploying.
# NOTE(review): Grafana's provisioning docs list `folder` as a required
# key of a rule group; none is set here -- confirm.
apiVersion: 1

groups:
  - orgId: 1
    name: "alerts"
    interval: 1m
    rules:
      # Container restart count grew by more than 3 within the last hour.
      - uid: KubernetesPodCrashLooping
        title: Kubernetes Pod CrashLooping
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              expr: 'increase(kube_pod_container_status_restarts_total{}[1h]) > 3'
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        annotations:
          description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too frequently"
          summary: "Pod {{ $labels.pod }} is crash looping"
        isPaused: false

      # Pod reported in phase "Pending".
      - uid: KubernetesPodPending
        title: Kubernetes Pod Pending
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              expr: 'kube_pod_status_phase{phase="Pending"} > 0'
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 10m
        annotations:
          description: "Pod {{ $labels.pod }} is in Pending state"
          summary: "Pod {{ $labels.pod }} is pending"
        isPaused: false

      # Per-container CPU usage rate (5m window) above 0.9.
      - uid: ContainerCPUUsageHigh
        title: Container CPU Usage High
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              expr: 'rate(container_cpu_usage_seconds_total{image!=""}[5m]) > 0.9'
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        annotations:
          summary: "High CPU usage for container {{ $labels.container }}"
          description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% CPU for 5 minutes."
        isPaused: false

      # Container memory usage above 90% of its configured limit.
      - uid: ContainerMemoryUsageHigh
        title: Container Memory Usage High
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              expr: 'container_memory_usage_bytes{} / container_spec_memory_limit_bytes{} > 0.9'
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        annotations:
          summary: "High memory usage for container {{ $labels.container }}"
          description: "Container {{ $labels.container }} in pod {{ $labels.pod }} is using >90% of its memory limit."
        isPaused: false

      # Volume used bytes above 90% of volume capacity.
      - uid: PVCStorageAlmostFull
        title: PVC Storage Almost Full
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            model:
              editorMode: code
              expr: 'kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.9'
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        annotations:
          summary: "PVC almost full"
          description: "PersistentVolumeClaim {{ $labels.persistentvolumeclaim }} in namespace {{ $labels.namespace }} is >90% full."
        isPaused: false

      # Loki Alerts
      # Counts log lines containing "error while querying DB" for job
      # "controlling" over 10m; `reducer` collapses query A (reducer: last)
      # and threshold C fires when that value is > 0.
      - uid: loki-alert
        title: Loki-Alert
        condition: C
        data:
          - refId: A
            queryType: range
            relativeTimeRange:
              from: 600
              to: 0
            # Environment-specific Loki data source UID, left disabled as in
            # the original file.
            # datasourceUid: berm4y85oiwaoc
            # `model:` must stay enabled: without it the query fields below
            # have no parent mapping and the rule carries no query.
            model:
              # datasource:
              #   type: loki
              #   uid: berm4y85oiwaoc
              editorMode: code
              expr: 'count_over_time({job="controlling"} |= "error while querying DB" [10m])'
              hide: false
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              queryType: range
              refId: A
          - refId: reducer
            queryType: expression
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params: []
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: reducer
              type: reduce
          - refId: C
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - C
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: reducer
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: threshold
        noDataState: OK
        execErrState: Error
        for: 5m
        isPaused: false