updated README
This commit is contained in:
parent
7413af8b2b
commit
92e4465889
1 changed files with 215 additions and 46 deletions
|
|
@ -1,4 +1,6 @@
|
||||||
# Modules for Grafana alerts and dashboards
|
# Modules for Grafana alerts and dashboards
|
||||||
|
<!-- TOC -->
|
||||||
|
|
||||||
|
|
||||||
## Alerting
|
## Alerting
|
||||||
|
|
||||||
|
|
@ -7,77 +9,244 @@ Please check documentation about Grafana alerting [here](https://itdoc.schwarz/x
|
||||||
The Terraform modules are separated per resource type; check the README in each module directory for specific examples.
|
The Terraform modules are separated per resource type; check the README in each module directory for specific examples.
|
||||||
Below is an example for alerts using the „**Prometheus/Thanos**“ datasource and sending notifications to „**Google Chat**“.
|
Below is an example for alerts using the „**Prometheus/Thanos**“ datasource and sending notifications to „**Google Chat**“.
|
||||||
|
|
||||||
|
## Authentication
|
||||||
|
|
||||||
|
Set Grafana credentials as Terraform variables:
|
||||||
|
|
||||||
```hcl title="main.tf"
|
```bash
|
||||||
# Datasource
|
export TF_VAR_grafana_url="https://grafana.example.com"
|
||||||
|
export TF_VAR_grafana_username="admin"
|
||||||
|
export TF_VAR_grafana_password="super-secret"
|
||||||
|
```
|
||||||
|
|
||||||
|
These credentials are used by all modules to authenticate with the Grafana API.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
Organize alerts, templates, and Terraform code as follows:
|
||||||
|
|
||||||
|
```
|
||||||
|
.
|
||||||
|
├── alerts/
|
||||||
|
│ ├── infra/
|
||||||
|
│ │ ├── loki/
|
||||||
|
│ │ │ └── alert-loki.yaml
|
||||||
|
│ │ └── thanos/
|
||||||
|
│ │ └── alert-thanos.yaml
|
||||||
|
│ ├── oncall/
|
||||||
|
│ │ └── alert-oncall.yaml
|
||||||
|
│ └── heartbeats/
|
||||||
|
│ └── alert-heartbeat.yaml
|
||||||
|
├── templates/
|
||||||
|
│ └── myteam/
|
||||||
|
│ └── gchat-message.tmpl
|
||||||
|
└── main.tf
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Alerts**: YAML files defining rule groups (`apiVersion: 1, groups: [...]`).
|
||||||
|
- **Templates**: Notification templates for Google Chat contact points.
|
||||||
|
- **Terraform code**: References modules and binds everything together.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Defining Secrets
|
||||||
|
|
||||||
|
Datasource URLs and credentials should be stored in Terraform variables, not hardcoded.
|
||||||
|
|
||||||
|
**Example: Environment Variables**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export TF_VAR_thanos_coin_prd_url="https://thanos.example.com"
|
||||||
|
export TF_VAR_thanos_coin_prd_user="reader"
|
||||||
|
export TF_VAR_thanos_coin_prd_pass="password"
|
||||||
|
|
||||||
|
export TF_VAR_loki_coin_prd_url="https://loki.example.com"
|
||||||
|
export TF_VAR_loki_coin_prd_user="reader"
|
||||||
|
export TF_VAR_loki_coin_prd_pass="password"
|
||||||
|
|
||||||
|
export TF_VAR_opsgenie_api_key="xxxxxx"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Module Usage
|
||||||
|
|
||||||
|
### Datasources
|
||||||
|
|
||||||
|
Define multiple datasources (Prometheus, Loki, etc.) with unique keys for URL/username/password:
|
||||||
|
|
||||||
|
```hcl
|
||||||
module "datasource" {
|
module "datasource" {
|
||||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/datasource?ref=main"
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/datasource?ref=main"
|
||||||
datasource_name = "Thanos - Myteam"
|
|
||||||
datasource_url = var.datasource_url
|
datasources = {
|
||||||
datasource_username = var.datasource_username
|
Thanos-Common-Infra-PRD = {
|
||||||
datasource_password = var.datasource_password
|
type = "prometheus"
|
||||||
|
url_key = "thanos_coin_prd"
|
||||||
|
user_key = "thanos_coin_prd"
|
||||||
|
pass_key = "thanos_coin_prd"
|
||||||
|
is_default = true
|
||||||
|
}
|
||||||
|
Loki-Common-Infra-PRD = {
|
||||||
|
type = "loki"
|
||||||
|
url_key = "loki_coin_prd"
|
||||||
|
user_key = "loki_coin_prd"
|
||||||
|
pass_key = "loki_coin_prd"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Alert Receiver / Contact Point
|
datasource_urls = {
|
||||||
module "google-chat-contact-point" {
|
thanos_coin_prd = var.thanos_coin_prd_url
|
||||||
|
loki_coin_prd = var.loki_coin_prd_url
|
||||||
|
}
|
||||||
|
|
||||||
|
datasource_users = {
|
||||||
|
thanos_coin_prd = var.thanos_coin_prd_user
|
||||||
|
loki_coin_prd = var.loki_coin_prd_user
|
||||||
|
}
|
||||||
|
|
||||||
|
datasource_passwords = {
|
||||||
|
thanos_coin_prd = var.thanos_coin_prd_pass
|
||||||
|
loki_coin_prd = var.loki_coin_prd_pass
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Contact Points
|
||||||
|
|
||||||
|
**Google Chat**
|
||||||
|
|
||||||
|
Each Google Chat space is configured as a contact point:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
module "gchat-contact-point-coin" {
|
||||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/contact-point-gchat?ref=main"
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/contact-point-gchat?ref=main"
|
||||||
google-chat-url = var.google_chat_url
|
gchat_url = var.gchat_url_coin
|
||||||
contact-point-name = "gchat"
|
contact_point_name = "gchat-coin"
|
||||||
}
|
templates_dir = "templates/coin"
|
||||||
|
template_prefix = "coin-"
|
||||||
|
|
||||||
# Alert Rule Folders
|
|
||||||
module "alert-folder" {
|
|
||||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alert-folder?ref=main"
|
|
||||||
alert-folder = "Alerts"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# Template for messages
|
|
||||||
module "message-templates" {
|
|
||||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/message-template?ref=main"
|
|
||||||
templates_dir = "templates"
|
|
||||||
disable_provenance = true
|
disable_provenance = true
|
||||||
}
|
}
|
||||||
|
```
|
||||||
|
|
||||||
# Notification policies
|
**OpsGenie**
|
||||||
|
|
||||||
|
OpsGenie contact points use API keys:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
module "opsgenie-contact-point" {
|
||||||
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/contact-point-opsgenie?ref=main"
|
||||||
|
contact_point_name = "opsgenie-dev"
|
||||||
|
opsgenie_api_key = var.opsgenie_api_key
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Alert Folders
|
||||||
|
|
||||||
|
Organize alerts in Grafana folders for logical separation:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
module "alert-folder" {
|
||||||
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alert-folder?ref=main"
|
||||||
|
alert-folder = "Common-Infra-Alerts"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Notification Policies
|
||||||
|
|
||||||
|
Map folders to contact points (e.g., send “Common-Infra-Alerts” to Google Chat):
|
||||||
|
|
||||||
|
```hcl
|
||||||
module "notification-policy" {
|
module "notification-policy" {
|
||||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/notification-policy?ref=main"
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/notification-policy?ref=main"
|
||||||
|
default_contact_point_uid = module.gchat-contact-point-coin.contact_point_name
|
||||||
default_contact_point_uid = module.google-chat-contact-point.contact_name
|
|
||||||
group_by = ["alertname"]
|
group_by = ["alertname"]
|
||||||
|
|
||||||
folder_policies = {
|
folder_policies = {
|
||||||
"Alerts" = module.google-chat-contact-point.contact_name
|
"Common-Infra-Alerts" = module.gchat-contact-point-coin.contact_point_name
|
||||||
|
"Common-Infra-OnCall-Alerts" = module.opsgenie-contact-point.contact_name
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
```
|
||||||
|
|
||||||
# Alert definition
|
### Alert Definitions
|
||||||
module "alerting" {
|
|
||||||
|
Alert rules are defined in YAML and applied via the module:
|
||||||
|
|
||||||
|
```hcl
|
||||||
|
module "alerting-coin" {
|
||||||
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alerts?ref=main"
|
source = "git::https://commerce-platform.git.onstackit.cloud/commerce-platform-public/terraform-modules//grafana/alerts?ref=main"
|
||||||
|
alerts_dir = "alerts/common-infra/thanos"
|
||||||
alerts_dir = "alerts"
|
datasource_uid = module.datasource.datasource_uids["Thanos-Common-Infra-PRD"]
|
||||||
default_datasource_uid = module.datasource.datasource_uid
|
folder_uid = module.alert-folder.folder_uid
|
||||||
default_receiver = module.google-chat-contact-point.contact_name
|
receiver = module.gchat-contact-point-coin.contact_point_name
|
||||||
default_folder_uid = module.alert-folder.folder_uid
|
|
||||||
default_interval_seconds = 60
|
|
||||||
disable_provenance = true
|
disable_provenance = true
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
With this configuration you need to place your notification templates in the `templates` folder and your alert definitions (in YAML format) in the `alerts` folder, both in the same directory where `main.tf` is located.
|
|
||||||
You can find examples for both in the [examples](./examples/) folder.
|
|
||||||
For this example you need to export your secret variables, e.g. by looking them up in Secret Manager.
|
|
||||||
|
|
||||||
```sh
|
---
|
||||||
export TF_VAR_grafana_url="<GRAFANA URL>"
|
|
||||||
export TF_VAR_grafana_username="admin"
|
## Alert YAML Format
|
||||||
export TF_VAR_grafana_password="xxxxxxx"
|
|
||||||
export TF_VAR_google_chat_url="https://chat.googleapis.com/v1/spaces/xxxxx"
|
Alerts are defined in the Grafana YAML format. The easiest way to create an example from scratch is to define the alert in the Grafana UI and then export it using the „Export rules“ button.
|
||||||
export TF_VAR_datasource_url="https://xxxxx.stackit.cloud/instances/xxxxxx"
|
However, make sure to remove the following fields, which are not needed in the YAML because the Terraform modules provide them automatically:
|
||||||
export TF_VAR_datasource_username="stackit9_xxxxx"
|
|
||||||
export TF_VAR_datasource_password="xxxxxx"
|
- `datasourceUid` (defined with `alerts` module)
|
||||||
|
- `notification_settings` (defined with `notification-policy` module)
|
||||||
|
|
||||||
|
Each file must have `apiVersion: 1` and define groups:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
apiVersion: 1
|
||||||
|
groups:
|
||||||
|
- name: infra-alerts
|
||||||
|
interval: 1m
|
||||||
|
rules:
|
||||||
|
- uid: pod-restart-alert
|
||||||
|
title: "Pod Restart Count High"
|
||||||
|
condition: C
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 600
|
||||||
|
to: 0
|
||||||
|
model:
|
||||||
|
expr: increase(kube_pod_container_status_restarts_total{}[5m]) > 3
|
||||||
|
instant: true
|
||||||
|
refId: A
|
||||||
|
- refId: C
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params: [0]
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params: [C]
|
||||||
|
reducer:
|
||||||
|
type: last
|
||||||
|
type: query
|
||||||
|
expression: A
|
||||||
|
type: threshold
|
||||||
|
noDataState: OK
|
||||||
|
execErrState: Error
|
||||||
|
for: 5m
|
||||||
|
annotations:
|
||||||
|
description: "Pod is restarting too often"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Dashboard
|
---
|
||||||
TODO
|
|
||||||
|
|
||||||
|
## Updating Alerts or Contact Points
|
||||||
|
|
||||||
|
- Add new YAML files under `alerts/` for additional rules.
|
||||||
|
- Add new modules in `main.tf` for new datasources or contact points.
|
||||||
|
- Run `terraform apply` to sync changes to Grafana.
|
||||||
|
|
||||||
|
|
||||||
|
You can find examples for alerts and templates in the [examples](./examples/) folder.
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue