Introduction
Je pars du principe que vous utilisez Portainer, que vous savez vous servir d’un éditeur de texte et que vous connaissez vos machines.
Quelques notions
Je vous invite à parcourir mes autres articles :
Ceci afin de vous familiariser avec quelques notions importantes et le style de mes docker-compose et autres fichiers de configurations.
Prérequis
Vous allez avoir besoin de temps pour lire cet article et bien comprendre comment il est articulé.
Je vous laisse aussi adapter les URL présentent dans les différents fichiers de configuration en fonction de votre cas.
J'utilise un conteneur Ntfy pour le système de notification. Si ce n'est pas votre cas il vous faudra soit le mettre en place, soit adapter en fonction du système que vous utilisez ou bien tout simplement commenter les parties en rapport.
Fichiers requis
docker-compose.yml
[Fichier]
version: "3.0"
#
# updated: 2023-05-27
# stack: monitoring
#
x-logging: &x-logging
logging:
driver: loki
options:
loki-url: "http://loki:3100/loki/api/v1/push"
loki-retries: "5"
loki-batch-size: "400"
x-common: &x-common
<<: *x-logging
restart: "no"
stop_grace_period: 5s
stdin_open: true
tty: true
privileged: false
security_opt:
- no-new-privileges=true
cap_drop:
- ALL
cap_add:
- KILL
dns:
- 1.1.1.1
- 8.8.8.8
- 1.0.0.1
- 8.8.4.4
ipc: "shareable"
extra_hosts:
- "template.home:192.168.0.0"
environment:
TZ: "Europe/Paris"
PUID: 1000
PGID: 1000
user: 1000:1000
labels:
com.centurylinklabs.watchtower.enable: true
logging: "promtail"
com.stack.name: "common"
com.stack.service.name: "common"
devices:
- /dev/kmsg:/dev/kmsg
deploy:
resources:
limits:
cpus: "0.50"
memory: 256M
ulimits:
nproc: 65535
nofile:
soft: 20000
hard: 40000
tmpfs:
- /tmp:rw,noexec,nosuid,size=64k
sysctls:
net.core.somaxconn: 1024
net.ipv4.tcp_syncookies: 0
x-volume-timezone: &x-volume-timezone "/etc/timezone:/etc/timezone:ro"
x-volume-localtime: &x-volume-localtime "/etc/localtime:/etc/localtime:ro"
x-volume-docker-socket: &x-volume-docker-socket "/var/run/docker.sock:/var/run/docker.sock:rw"
x-volume-cgroups: &x-volume-cgroups "/proc/cgroups:/cgroup:rw"
x-volume-ssl: &x-volume-ssl "/opt/docker/ssl:/ssl:ro"
services:
notifier:
<<: *x-common
container_name: notifier
hostname: notifier
image: academo/grafana-ntfy:latest
restart: always
ports:
- "8088:8080"
expose:
- "8080"
labels:
com.stack.name: "monitoring"
com.stack.service.name: "notifier"
command:
- "-ntfy-url=http://ntfy:8050/grafana"
volumes:
- *x-volume-timezone
- *x-volume-localtime
cadvisor:
<<: *x-common
user: 0:0
container_name: cadvisor
hostname: cadvisor
image: gcr.io/cadvisor/cadvisor:v0.47.1
restart: always
privileged: true
cap_add:
- SYS_PTRACE
ports:
- "8080:8080"
expose:
- "8080"
command:
- "--storage_duration=1m0s"
- "--allow_dynamic_housekeeping=true"
- "--housekeeping_interval=30s"
- "--global_housekeeping_interval=30s"
- "--event_storage_age_limit=default=0"
- "--event_storage_event_limit=default=0"
labels:
com.stack.name: "monitoring"
com.stack.service.name: "cadvisor"
volumes:
- *x-volume-timezone
- *x-volume-localtime
- *x-volume-docker-socket
- *x-volume-cgroups
- /var/lib/docker/:/var/lib/docker:ro
- /etc/machine-id:/etc/machine-id:ro
- /:/rootfs:ro
- /sys:/sys:ro
- /dev/disk/:/dev/disk:ro
- /var/run:/var/run:ro
node-exporter:
<<: *x-common
user: 0:0
container_name: node-exporter
hostname: node-exporter
image: prom/node-exporter:latest
restart: always
privileged: true
cap_add:
- SYS_ADMIN
ports:
- "9100:9100"
expose:
- "9100"
healthcheck:
test: wget --no-verbose --tries=1 --spider http://localhost:9100/ \|| exit 1
command:
- "--collector.arp"
- "--collector.bcache"
- "--collector.bonding"
- "--collector.btrfs"
- "--collector.conntrack"
- "--collector.cpu"
- "--collector.cpufreq"
- "--collector.diskstats"
- "--collector.dmi"
- "--collector.edac"
- "--collector.entropy"
- "--collector.fibrechannel"
- "--collector.filefd"
- "--collector.filesystem"
- "--collector.hwmon"
- "--collector.infiniband"
- "--collector.ipvs"
- "--collector.loadavg"
- "--collector.mdadm"
- "--collector.meminfo"
- "--collector.netclass"
- "--collector.netdev"
- "--collector.netstat"
- "--collector.nfs"
- "--collector.nfsd"
- "--collector.nvme"
- "--collector.os"
- "--collector.powersupplyclass"
- "--collector.pressure"
- "--collector.rapl"
- "--collector.schedstat"
- "--collector.selinux"
- "--collector.sockstat"
- "--collector.softnet"
- "--collector.stat"
- "--collector.tapestats"
- "--collector.textfile"
- "--collector.thermal_zone"
- "--collector.time"
- "--collector.timex"
- "--collector.udp_queues"
- "--collector.uname"
- "--collector.vmstat"
- "--collector.xfs"
- "--collector.zfs"
- "--collector.buddyinfo"
- "--collector.cgroups"
- "--collector.drbd"
- "--collector.ethtool"
- "--collector.interrupts"
- "--collector.ksmd"
- "--collector.lnstat"
- "--collector.logind"
- "--collector.meminfo_numa"
- "--collector.mountstats"
- "--collector.network_route"
- "--collector.ntp"
- "--collector.perf"
- "--collector.processes"
- "--collector.qdisc"
- "--collector.runit"
#- "--collector.slabinfo"
- "--collector.supervisord"
- "--collector.sysctl"
- "--collector.systemd"
- "--collector.tcpstat"
- "--collector.wifi"
- "--collector.zoneinfo"
- "--path.rootfs=/host"
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- \"--collector.filesystem.ignored-mount-points='^\(/rootfs|/host|)/\(sys|proc|dev|host|etc)\(\$\$|/)'\"
- \"--collector.filesystem.ignored-fs-types='^\(sys|proc|auto|cgroup|devpts|ns|au|fuse\.lxc|mqueue)\(fs|)\$\$'\"
- \"--collector.netdev.device-exclude='^\(lo|veth.\*)\$'\"
- \"--collector.ethtool.device-exclude='^\(lo|br-.\*|docker0|flannel.\*|veth.\*)\$'\"
labels:
com.stack.name: "monitoring"
com.stack.service.name: "node-exporter"
deploy:
resources:
limits:
memory: 512M
volumes:
- *x-volume-timezone
- *x-volume-localtime
- *x-volume-docker-socket
- *x-volume-cgroups
- /etc/service:/etc/service:ro
- /:/host:ro,rslave
- /sys:/host/sys:ro
- /proc:/host/proc:ro
- /run/udev/data:/run/udev/data:ro
- /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro
ntfy-alertmanager:
<<: *x-common
user: 0:0
cap_add:
- DAC_OVERRIDE
container_name: ntfy-alertmanager
hostname: ntfy-alertmanager
image: xenrox/ntfy-alertmanager:latest
restart: always
ports:
- "6080:8080"
expose:
- "8080"
environment:
NTFY_TOPIC: "alertmanager"
labels:
com.stack.name: "monitoring"
com.stack.service.name: "ntfy"
deploy:
resources:
limits:
memory: 512M
volumes:
- *x-volume-timezone
- *x-volume-localtime
- /opt/docker/monitoring/conf/ntfy-alertmanager.cfg:/etc/ntfy-alertmanager/config
alertmanager:
<<: *x-common
cap_add:
- DAC_OVERRIDE
container_name: alertmanager
hostname: alertmanager
image: prom/alertmanager:latest
restart: always
ports:
- "9093:9093"
expose:
- "9093"
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
healthcheck:
test: wget --no-verbose --tries=1 --spider http://localhost:9093/ \|| exit 1
labels:
com.stack.name: "monitoring"
com.stack.service.name: "alertmanager"
deploy:
resources:
limits:
memory: 512M
volumes:
- *x-volume-timezone
- *x-volume-localtime
- /opt/docker/monitoring/conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml
- /opt/docker/monitoring/datas/alertmanager:/alertmanager
mimir:
<<: *x-common
stop_grace_period: 60s
user: 0:0
cap_add:
- DAC_OVERRIDE
- SETUID
- SETGID
- CHOWN
- SYS_ADMIN
container_name: mimir
hostname: mimir
image: grafana/mimir:latest
restart: always
ports:
- "9009:9009"
- "9095:9095"
expose:
- "9009"
- "9095"
command:
- "--config.file=/etc/mimir/mimir.yaml"
healthcheck:
test: wget --no-verbose --tries=1 --spider http://localhost:9009/ \|| exit 1
labels:
com.stack.name: "monitoring"
com.stack.service.name: "mimir"
deploy:
resources:
limits:
memory: 4G
volumes:
- *x-volume-timezone
- *x-volume-localtime
- /opt/docker/monitoring/conf/mimir.yml:/etc/mimir/mimir.yaml
- /opt/docker/monitoring/datas/mimir:/mimir
prometheus:
<<: *x-common
stop_grace_period: 60s
cap_add:
- DAC_OVERRIDE
container_name: prometheus
hostname: prometheus
image: prom/prometheus:latest
restart: always
depends_on:
- mimir
- alertmanager
- cadvisor
- node-exporter
links:
- mimir
- alertmanager
- cadvisor
- node-exporter
ports:
- "9090:9090"
expose:
- "9090"
healthcheck:
test: wget --no-verbose --tries=1 --spider http://localhost:9090/ \|| exit 1
command:
- "--storage.tsdb.retention.time=365d"
- "--config.file=/etc/prometheus/prometheus.yml"
labels:
com.stack.name: "monitoring"
com.stack.service.name: "prometheus"
deploy:
resources:
limits:
memory: 2G
volumes:
- *x-volume-timezone
- *x-volume-localtime
- /opt/docker/monitoring/conf/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- /opt/docker/monitoring/conf/alertmanager/rules.yml:/alertmanager/rules.yml:ro
- /opt/docker/monitoring/datas/prometheus:/prometheus
grafana:
<<: *x-common
logging:
driver: json-file
options:
max-size: 32m
max-file: "7"
compress: "true"
user: 0:0
cap_add:
- DAC_OVERRIDE
- SETUID
- SETGID
- CHOWN
- SYS_ADMIN
container_name: grafana
hostname: grafana
image: grafana/grafana:latest
restart: always
ports:
- "3000:3000"
expose:
- "3000"
healthcheck:
test: wget --no-verbose --tries=1 --spider http://localhost:3000/api/health \|| exit 1
depends_on:
- notifier
- prometheus
links:
- notifier
- prometheus
environment:
GF_USERS_ALLOW_SIGN_UP: false
GF_SERVER_SERVE_FROM_SUB_PATH: true
GF_INSTALL_PLUGINS: "grafana-clock-panel,grafana-piechart-panel,grafana-simple-json-datasource,grafana-worldmap-panel,camptocamp-prometheus-alertmanager-datasource"
GF_SECURITY_ADMIN_USER: "[admin username]"
GF_SECURITY_ADMIN_PASSWORD: "[admin password]"
labels:
com.stack.name: "monitoring"
com.stack.service.name: "grafana"
tmpfs:
- /tmp:rw,noexec,nosuid,size=5120k
deploy:
resources:
limits:
cpus: "4.0"
memory: 1G
volumes:
- *x-volume-timezone
- *x-volume-localtime
- /opt/docker/monitoring/conf/grafana.ini:/etc/grafana/grafana.ini
- /opt/docker/monitoring/conf/provisioning:/etc/grafana/provisioning
- /opt/docker/monitoring/conf/provisioning/provisioned:/etc/grafana/dashboards/provisioned
- /opt/docker/monitoring/datas/grafana/dashboards:/etc/grafana/dashboards
- /opt/docker/monitoring/datas/grafana/datas:/var/lib/grafana
conf/mimir.yml
[Fichier]
# 2023-05-25
usage_stats:
enabled: false
multitenancy_enabled: true
blocks_storage:
backend: filesystem
bucket_store:
sync_dir: /mimir/tsdb-sync
filesystem:
dir: /mimir/data/tsdb
tsdb:
dir: /mimir/tsdb
compactor:
data_dir: /mimir/compactor
sharding_ring:
kvstore:
store: memberlist
distributor:
ring:
instance_addr: 127.0.0.1
kvstore:
store: memberlist
ingester:
ring:
instance_addr: 127.0.0.1
kvstore:
store: memberlist
replication_factor: 1
ruler_storage:
backend: filesystem
filesystem:
dir: /tmp/mimir/rules
server:
http_listen_port: 9009
grpc_listen_port: 9095
log_level: error
store_gateway:
sharding_ring:
replication_factor: 1
limits:
max_label_names_per_series: 128
out_of_order_time_window: 90d
conf/alertmanager.yml
[Fichier]
# 2023-05-24
global:
resolve_timeout: 5m
route:
group_by: ["..."]
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: "ntfy"
routes:
- matchers: \['{severity=~\"info|warning|critical\"}']
receivers:
- name: "ntfy"
webhook_configs:
- url: "http://ntfy-alertmanager:6080"
conf/ntfy-alertmanager.cfg
[Fichier]
# 2023-05-24
# Public facing base URL of the service
# This setting is required for the "Silence" feature.
base-url https://ntfyforwarder.domain.com
# http listen address
http-address :6080
# Log level (either debug, info, warning, error)
log-level warning
# When multiple alerts are grouped together by Alertmanager, they can either be sent
# each on their own (single mode) or be kept together (multi mode) (either single or multi; default is single)
alert-mode single
labels {
order "severity,instance"
severity "critical" {
priority 5
tags "rotating_light"
}
severity "info" {
priority 1
}
}
# Settings for resolved alerts
resolved {
tags "resolved,partying_face"
}
ntfy {
# URL of the ntfy topic - required
topic http://ntfy:8050/alerts
}
alertmanager {
# If set, the ntfy message will contain a "Silence" button, which can be used
# to create a silence via the Alertmanager API. Because of limitations in ntfy,
# the request will be proxied through ntfy-alertmanager. Therefore ntfy-alertmanager
# needs to be exposed to external network requests and base-url has to be set.
#
# When alert-mode is set to "single" all alert labels will be used to create the silence.
# When it is "multi" common labels between all the alerts will be used. WARNING: This
# could silence unwanted alerts.
#silence-duration 24h
# By default the Alertmanager URL gets parsed from the webhook. In case that
# Alertmanger is not reachable under that URL, it can be overwritten here.
}
# When the alert-mode is set to single, ntfy-alertmanager will cache each single alert
# to avoid sending recurrences.
cache {
# The type of cache that will be used (either memory or redis; default is memory).
type memory
# How long messages stay in the cache for
duration 24h
# Memory cache settings
# Interval in which the cache is cleaned up
cleanup-interval 1h
# Redis cache settings
# URL to connect to redis (default: redis://localhost:6379)
redis-url redis://redis:6789
}
conf/prometheus.yml
[Fichier]
# 2023-05-25
global:
scrape_interval: 15s
scrape_timeout: 10s
evaluation_interval: 15s
external_labels:
cluster: org
namespace: org
remote_write:
- url: http://mimir:9009/api/v1/push
headers:
X-Scope-OrgID: org
alerting:
alertmanagers:
- static_configs:
- targets: [ "template.home:9093" ]
scheme: http
timeout: 10s
rule_files:
- /alertmanager/rules.yml
scrape_configs:
- job_name: "template"
honor_timestamps: true
honor_labels: true
scrape_interval: 15s
scrape_timeout: 10s
metrics_path: /metrics
scheme: http
static_configs:
- targets: ["prometheus:9090", "cadvisor:8080", "node-exporter:9100"]
labels:
alias: Template
ip: 192.168.0.0
conf/grafana.ini
[Fichier]
# 2023-05-21
##################### Grafana Configuration Defaults #####################
#
# Do not modify this file in grafana installs
#
# possible values : production, development
app_mode = production
# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty
instance_name = ${HOSTNAME}
# force migration will run migrations that might cause dataloss
force_migration = false
#################################### Paths ###############################
[paths]
# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used)
data = data
# Temporary files in `data` directory older than given duration will be removed
temp_data_lifetime = 24h
# Directory where grafana can store logs
logs = data/log
# Directory where grafana will automatically scan and look for plugins
plugins = data/plugins
# folder that contains provisioning config files that grafana will apply on startup and while running.
provisioning = conf/provisioning
#################################### Server ##############################
[server]
# Protocol (http, https, h2, socket)
protocol = http
# Minimum TLS version allowed. By default, this value is empty. Accepted values are: TLS1.2, TLS1.3. If nothing is set TLS1.2 would be taken
min_tls_version = ""
# The ip address to bind to, empty will bind to all interfaces
http_addr =
# The http port to use
http_port = 3000
# The public facing domain name used to access grafana from a browser
domain = localhost
# Redirect to correct domain if host header does not match domain
# Prevents DNS rebinding attacks
enforce_domain = false
# The full public facing url
#root_url = %(protocol)s://%(domain)s:%(http_port)s/
root_url = %(protocol)s://%(domain)s:%(http_port)s/
# Serve Grafana from subpath specified in `root_url` setting. By default it is set to `false` for compatibility reasons.
serve_from_sub_path = false
# Log web requests
router_logging = false
# the path relative working path
static_root_path = public
# enable gzip
#enable_gzip = false
enable_gzip = true
# https certs & key file
cert_file =
cert_key =
# Unix socket gid
# Changing the gid of a file without privileges requires that the target group is in the group of the process and that the process is the file owner
# It is recommended to set the gid as http server user gid
# Not set when the value is -1
socket_gid = -1
# Unix socket mode
socket_mode = 0660
# Unix socket path
socket = /tmp/grafana.sock
# CDN Url
cdn_url =
# Sets the maximum time in minutes before timing out read of an incoming request and closing idle connections.
# `0` means there is no timeout for reading the request.
read_timeout = 0
# This setting enables you to specify additional headers that the server adds to HTTP(S) responses.
[server.custom_response_headers]
#exampleHeader1 = exampleValue1
#exampleHeader2 = exampleValue2
#################################### GRPC Server #########################
[grpc_server]
network = "tcp"
address = "127.0.0.1:10000"
use_tls = false
cert_file =
key_file =
#################################### Database ############################
[database]
# You can configure the database connection by specifying type, host, name, user and password
# as separate properties or as on string using the url property.
# Either "mysql", "postgres" or "sqlite3", it's your choice
type = sqlite3
host = 127.0.0.1:3306
name = grafana
user = root
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
password =
# Use either URL or the previous fields to configure the database
# Example: mysql://user:secret@host:port/database
url =
# Max idle conn setting default is 2
max_idle_conn = 2
# Max conn setting default is 0 (mean not set)
max_open_conn =
# Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours)
conn_max_lifetime = 14400
# Set to true to log the sql calls and execution times.
log_queries =
# For "postgres", use either "disable", "require" or "verify-full"
# For "mysql", use either "true", "false", or "skip-verify".
ssl_mode = disable
# Database drivers may support different transaction isolation levels.
# Currently, only "mysql" driver supports isolation levels.
# If the value is empty - driver's default isolation level is applied.
# For "mysql" use "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ" or "SERIALIZABLE".
isolation_level =
ca_cert_path =
client_key_path =
client_cert_path =
server_cert_name =
# For "sqlite3" only, path relative to data_path setting
path = grafana.db
# For "sqlite3" only. cache mode setting used for connecting to the database
cache_mode = private
# For "sqlite3" only. Enable/disable Write-Ahead Logging, https://sqlite.org/wal.html. Default is false.
wal = false
# For "mysql" only if migrationLocking feature toggle is set. How many seconds to wait before failing to lock the database for the migrations, default is 0.
locking_attempt_timeout_sec = 0
# For "sqlite" only. How many times to retry query in case of database is locked failures. Default is 0 (disabled).
query_retries = 0
# For "sqlite" only. How many times to retry transaction in case of database is locked failures. Default is 5.
transaction_retries = 5
# Set to true to add metrics and tracing for database queries.
instrument_queries = false
#################################### Cache server #############################
[remote_cache]
# Either "redis", "memcached" or "database" default is "database"
#type = database
type = redis
# cache connectionstring options
# database: will use Grafana primary database.
# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=0,ssl=false`. Only addr is required. ssl may be 'true', 'false', or 'insecure'.
# memcache: 127.0.0.1:11211
#connstr =
connstr = addr=redis:6379,pool_size=100,db=0,ssl=false
# prefix prepended to all the keys in the remote cache
prefix =
# This enables encryption of values stored in the remote cache
encryption =
#################################### Data proxy ###########################
[dataproxy]
# This enables data proxy logging, default is false
logging = false
# How long the data proxy waits to read the headers of the response before timing out, default is 30 seconds.
# This setting also applies to core backend HTTP data sources where query requests use an HTTP client with timeout set.
timeout = 30
# How long the data proxy waits to establish a TCP connection before timing out, default is 10 seconds.
dialTimeout = 10
# How many seconds the data proxy waits before sending a keepalive request.
keep_alive_seconds = 30
# How many seconds the data proxy waits for a successful TLS Handshake before timing out.
tls_handshake_timeout_seconds = 10
# How many seconds the data proxy will wait for a server's first response headers after
# fully writing the request headers if the request has an "Expect: 100-continue"
# header. A value of 0 will result in the body being sent immediately, without
# waiting for the server to approve.
expect_continue_timeout_seconds = 1
# Optionally limits the total number of connections per host, including connections in the dialing,
# active, and idle states. On limit violation, dials will block.
# A value of zero (0) means no limit.
max_conns_per_host = 0
# The maximum number of idle connections that Grafana will keep alive.
max_idle_connections = 100
# How many seconds the data proxy keeps an idle connection open before timing out.
idle_conn_timeout_seconds = 90
# If enabled and user is not anonymous, data proxy will add X-Grafana-User header with username into the request.
send_user_header = false
# Limit the amount of bytes that will be read/accepted from responses of outgoing HTTP requests.
response_limit = 0
# Limits the number of rows that Grafana will process from SQL data sources.
row_limit = 1000000
# Sets a custom value for the `User-Agent` header for outgoing data proxy requests. If empty, the default value is `Grafana/<BuildVersion>` (for example `Grafana/9.0.0`).
user_agent =
#################################### Analytics ###########################
[analytics]
# Server reporting, sends usage counters to stats.grafana.org every 24 hours.
# No ip addresses are being tracked, only simple counters to track
# running instances, dashboard and error counts. It is very helpful to us.
# Change this option to false to disable reporting.
#reporting_enabled = true
reporting_enabled = false
# The name of the distributor of the Grafana instance. Ex hosted-grafana, grafana-labs
reporting_distributor = grafana-labs
# Set to false to disable all checks to https://grafana.com
# for new versions of grafana. The check is used
# in some UI views to notify that a grafana update exists.
# This option does not cause any auto updates, nor send any information
# only a GET request to https://raw.githubusercontent.com/grafana/grafana/main/latest.json to get the latest version.
check_for_updates = true
# Set to false to disable all checks to https://grafana.com
# for new versions of plugins. The check is used
# in some UI views to notify that a plugin update exists.
# This option does not cause any auto updates, nor send any information
# only a GET request to https://grafana.com to get the latest versions.
check_for_plugin_updates = true
# Google Analytics universal tracking code, only enabled if you specify an id here
google_analytics_ua_id =
# Google Analytics 4 tracking code, only enabled if you specify an id here
google_analytics_4_id =
# When Google Analytics 4 Enhanced event measurement is enabled, we will try to avoid sending duplicate events and let Google Analytics 4 detect navigation changes, etc.
google_analytics_4_send_manual_page_views = false
# Google Tag Manager ID, only enabled if you specify an id here
google_tag_manager_id =
# Rudderstack write key, enabled only if rudderstack_data_plane_url is also set
rudderstack_write_key =
# Rudderstack data plane url, enabled only if rudderstack_write_key is also set
rudderstack_data_plane_url =
# Rudderstack SDK url, optional, only valid if rudderstack_write_key and rudderstack_data_plane_url is also set
rudderstack_sdk_url =
# Rudderstack Config url, optional, used by Rudderstack SDK to fetch source config
rudderstack_config_url =
# Intercom secret, optional, used to hash user_id before passing to Intercom via Rudderstack
intercom_secret =
# Application Insights connection string. Specify an URL string to enable this feature.
application_insights_connection_string =
# Optional. Specifies an Application Insights endpoint URL where the endpoint string is wrapped in backticks ``.
application_insights_endpoint_url =
# Controls if the UI contains any links to user feedback forms
feedback_links_enabled = true
#################################### Security ############################
[security]
# disable creation of admin user on first start of grafana
disable_initial_admin_creation = false
# default admin user, created on startup
admin_user = admin
# default admin password, can be changed before first start of grafana, or in profile settings
admin_password = admin
# default admin email, created on startup
admin_email = admin@localhost
# used for signing
secret_key =
# current key provider used for envelope encryption, default to static value specified by secret_key
encryption_provider = secretKey.v1
# list of configured key providers, space separated (Enterprise only): e.g., awskms.v1 azurekv.v1
available_encryption_providers =
# disable gravatar profile images
disable_gravatar = false
# data source proxy whitelist (ip_or_domain:port separated by spaces)
data_source_proxy_whitelist =
# disable protection against brute force login attempts
disable_brute_force_login_protection = false
# set to true if you host Grafana behind HTTPS. default is false.
cookie_secure = false
# set cookie SameSite attribute. defaults to `lax`. can be set to "lax", "strict", "none" and "disabled"
cookie_samesite = lax
# set to true if you want to allow browsers to render Grafana in a <frame>, <iframe>, <embed> or <object>. default is false.
allow_embedding = false
# Set to true if you want to enable http strict transport security (HSTS) response header.
# HSTS tells browsers that the site should only be accessed using HTTPS.
strict_transport_security = false
# Sets how long a browser should cache HSTS. Only applied if strict_transport_security is enabled.
strict_transport_security_max_age_seconds = 86400
# Set to true if to enable HSTS preloading option. Only applied if strict_transport_security is enabled.
strict_transport_security_preload = false
# Set to true if to enable the HSTS includeSubDomains option. Only applied if strict_transport_security is enabled.
strict_transport_security_subdomains = false
# Set to true to enable the X-Content-Type-Options response header.
# The X-Content-Type-Options response HTTP header is a marker used by the server to indicate that the MIME types advertised
# in the Content-Type headers should not be changed and be followed.
x_content_type_options = true
# Set to true to enable the X-XSS-Protection header, which tells browsers to stop pages from loading
# when they detect reflected cross-site scripting (XSS) attacks.
x_xss_protection = true
# Enable adding the Content-Security-Policy header to your requests.
# CSP allows to control resources the user agent is allowed to load and helps prevent XSS attacks.
content_security_policy = false
# Set Content Security Policy template used when adding the Content-Security-Policy header to your requests.
# $NONCE in the template includes a random nonce.
# $ROOT_PATH is server.root_url without the protocol.
content_security_policy_template = """script-src 'self' 'unsafe-eval' 'unsafe-inline' 'strict-dynamic' $NONCE;object-src 'none';font-src 'self';style-src 'self' 'unsafe-inline' blob:;img-src * data:;base-uri 'self';connect-src 'self' grafana.com ws://$ROOT_PATH wss://$ROOT_PATH;manifest-src 'self';media-src 'none';form-action 'self';"""
# Enable adding the Content-Security-Policy-Report-Only header to your requests.
# Allows you to monitor the effects of a policy without enforcing it.
content_security_policy_report_only = false
# Set Content Security Policy Report Only template used when adding the Content-Security-Policy-Report-Only header to your requests.
# $NONCE in the template includes a random nonce.
# $ROOT_PATH is server.root_url without the protocol.
content_security_policy_report_only_template = """script-src 'self' 'unsafe-eval' 'unsafe-inline' 'strict-dynamic' $NONCE;object-src 'none';font-src 'self';style-src 'self' 'unsafe-inline' blob:;img-src * data:;base-uri 'self';connect-src 'self' grafana.com ws://$ROOT_PATH wss://$ROOT_PATH;manifest-src 'self';media-src 'none';form-action 'self';"""
# Controls if old angular plugins are supported or not. This will be disabled by default in future release
angular_support_enabled = true
# The CSRF check will be executed even if the request has no login cookie.
csrf_always_check = false
[security.encryption]
# Defines the time-to-live (TTL) for decrypted data encryption keys stored in memory (cache).
# Please note that small values may cause performance issues due to a high frequency decryption operations.
data_keys_cache_ttl = 15m
# Defines the frequency of data encryption keys cache cleanup interval.
# On every interval, decrypted data encryption keys that reached the TTL are removed from the cache.
data_keys_cache_cleanup_interval = 1m
#################################### Snapshots ###########################
[snapshots]
# set to false to remove snapshot functionality
enabled = true
# snapshot sharing options
external_enabled = true
external_snapshot_url = https://snapshots.raintank.io
external_snapshot_name = Publish to snapshots.raintank.io
# Set to true to enable this Grafana instance act as an external snapshot server and allow unauthenticated requests for
# creating and deleting snapshots.
public_mode = false
# remove expired snapshot
snapshot_remove_expired = true
#################################### Dashboards ##################
[dashboards]
# Number dashboard versions to keep (per dashboard). Default: 20, Minimum: 1
versions_to_keep = 20
# Minimum dashboard refresh interval. When set, this will restrict users to set the refresh interval of a dashboard lower than given interval. Per default this is 5 seconds.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
min_refresh_interval = 5s
# Path to the default home dashboard. If this value is empty, then Grafana uses StaticRootPath + "dashboards/home.json"
default_home_dashboard_path =
################################### Data sources #########################
[datasources]
# Upper limit of data sources that Grafana will return. This limit is a temporary configuration and it will be deprecated when pagination will be introduced on the list data sources API.
datasource_limit = 5000
################################### SQL Data Sources #####################
[sql_datasources]
# Default maximum number of open connections maintained in the connection pool
# when connecting to SQL based data sources
max_open_conns_default = 100
# Default maximum number of idle connections maintained in the connection pool
# when connecting to SQL based data sources
max_idle_conns_default = 100
# Default maximum connection lifetime used when connecting
# to SQL based data sources.
max_conn_lifetime_default = 14400
#################################### Users ###############################
[users]
# disable user signup / registration
allow_sign_up = false
# Allow non admin users to create organizations
allow_org_create = false
# Set to true to automatically assign new users to the default organization (id 1)
auto_assign_org = true
# Set this value to automatically add new users to the provided organization (if auto_assign_org above is set to true)
auto_assign_org_id = 1
# Default role new users will be automatically assigned (if auto_assign_org above is set to true)
auto_assign_org_role = Viewer
# Require email validation before sign up completes
verify_email_enabled = false
# Background text for the user field on the login page
login_hint = email or username
password_hint = password
# Default UI theme ("dark" or "light" or "system")
default_theme = dark
# Default UI language (supported IETF language tag, such as en-US)
default_language = en-US
# Path to a custom home page. Users are only redirected to this if the default home dashboard is used. It should match a frontend route and contain a leading slash.
home_page =
# External user management
external_manage_link_url =
external_manage_link_name =
external_manage_info =
# Viewers can edit/inspect dashboard settings in the browser. But not save the dashboard.
viewers_can_edit = false
# Editors can administrate dashboard, folders and teams they create
editors_can_admin = false
# The duration in time a user invitation remains valid before expiring. This setting should be expressed as a duration. Examples: 6h (hours), 2d (days), 1w (week). Default is 24h (24 hours). The minimum supported duration is 15m (15 minutes).
user_invite_max_lifetime_duration = 24h
# Enter a comma-separated list of usernames to hide them in the Grafana UI. These users are shown to Grafana admins and to themselves.
hidden_users =
[secretscan]
# Enable secretscan feature
enabled = false
# Interval to check for token leaks
interval = 5m
# base URL of the grafana token leak check service
base_url = https://secret-scanning.grafana.net
# URL to send outgoing webhooks to in case of detection
oncall_url =
# Whether to revoke the token if a leak is detected or just send a notification
revoke = true
[service_accounts]
# When set, Grafana will not allow the creation of tokens with expiry greater than this setting.
token_expiration_day_limit =
[auth]
# Login cookie name
login_cookie_name = grafana_session
# Disable usage of Grafana build-in login solution.
disable_login = false
# The maximum lifetime (duration) an authenticated user can be inactive before being required to login at next visit. Default is 7 days (7d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month). The lifetime resets at each successful token rotation (token_rotation_interval_minutes).
login_maximum_inactive_lifetime_duration =
# The maximum lifetime (duration) an authenticated user can be logged in since login time before being required to login. Default is 30 days (30d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month).
login_maximum_lifetime_duration =
# How often should auth tokens be rotated for authenticated users when being active. The default is each 10 minutes.
token_rotation_interval_minutes = 10
# Set to true to disable (hide) the login form, useful if you use OAuth
disable_login_form = false
# Set to true to disable the sign out link in the side menu. Useful if you use auth.proxy or auth.jwt.
disable_signout_menu = false
# URL to redirect the user to after sign out
signout_redirect_url =
# Set to true to attempt login with OAuth automatically, skipping the login screen.
# This setting is ignored if multiple OAuth providers are configured.
# Deprecated, use auto_login option for specific provider instead.
oauth_auto_login = false
# OAuth state max age cookie duration in seconds. Defaults to 600 seconds.
oauth_state_cookie_max_age = 600
# Skip forced assignment of OrgID 1 or 'auto_assign_org_id' for social logins
# Deprecated, use skip_org_role_sync option for specific provider instead.
oauth_skip_org_role_update_sync = false
# limit of api_key seconds to live before expiration
api_key_max_seconds_to_live = -1
# Set to true to enable SigV4 authentication option for HTTP-based datasources
sigv4_auth_enabled = false
# Set to true to enable verbose logging of SigV4 request signing
sigv4_verbose_logging = false
# Set to true to enable Azure authentication option for HTTP-based datasources
azure_auth_enabled = false
#################################### Anonymous Auth ######################
[auth.anonymous]
# enable anonymous access
enabled = false
# specify organization name that should be used for unauthenticated users
org_name = Main Org.
# specify role for unauthenticated users
org_role = Viewer
# mask the Grafana version number for unauthenticated users
hide_version = false
#################################### GitHub Auth #########################
[auth.github]
name = GitHub
icon = github
enabled = false
allow_sign_up = true
auto_login = false
client_id = some_id
client_secret =
scopes = user:email,read:org
auth_url = https://github.com/login/oauth/authorize
token_url = https://github.com/login/oauth/access_token
api_url = https://api.github.com/user
allowed_domains =
team_ids =
allowed_organizations =
role_attribute_path =
role_attribute_strict = false
allow_assign_grafana_admin = false
tls_skip_verify_insecure = false
#################################### GitLab Auth #########################
[auth.gitlab]
name = GitLab
icon = gitlab
enabled = false
allow_sign_up = true
auto_login = false
client_id = some_id
client_secret =
scopes = api
auth_url = https://gitlab.com/oauth/authorize
token_url = https://gitlab.com/oauth/token
api_url = https://gitlab.com/api/v4
allowed_domains =
allowed_groups =
role_attribute_path =
role_attribute_strict = false
allow_assign_grafana_admin = false
skip_org_role_sync = false
tls_skip_verify_insecure = false
use_pkce = true
#################################### Google Auth #########################
[auth.google]
name = Google
icon = google
enabled = false
allow_sign_up = true
auto_login = false
client_id = some_client_id
client_secret =
scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email
auth_url = https://accounts.google.com/o/oauth2/auth
token_url = https://accounts.google.com/o/oauth2/token
api_url = https://www.googleapis.com/oauth2/v1/userinfo
allowed_domains =
hosted_domain =
skip_org_role_sync = false
tls_skip_verify_insecure = false
use_pkce = true
#################################### Grafana.com Auth ####################
# legacy key names (so they work in env variables)
[auth.grafananet]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email
allowed_organizations =
[auth.grafana_com]
name = Grafana.com
icon = grafana
enabled = false
allow_sign_up = true
auto_login = false
client_id = some_id
client_secret =
scopes = user:email
allowed_organizations =
skip_org_role_sync = false
#################################### Azure AD OAuth #######################
[auth.azuread]
name = Microsoft
icon = microsoft
enabled = false
allow_sign_up = true
auto_login = false
client_id = some_client_id
client_secret =
scopes = openid email profile
auth_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/authorize
token_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/token
allowed_domains =
allowed_groups =
role_attribute_strict = false
allow_assign_grafana_admin = false
force_use_graph_api = false
tls_skip_verify_insecure = false
use_pkce = true
#################################### Okta OAuth #######################
[auth.okta]
name = Okta
icon = okta
enabled = false
allow_sign_up = true
auto_login = false
client_id = some_id
client_secret =
scopes = openid profile email groups
auth_url = https://<tenant-id>.okta.com/oauth2/v1/authorize
token_url = https://<tenant-id>.okta.com/oauth2/v1/token
api_url = https://<tenant-id>.okta.com/oauth2/v1/userinfo
allowed_domains =
allowed_groups =
role_attribute_path =
role_attribute_strict = false
allow_assign_grafana_admin = false
skip_org_role_sync = false
tls_skip_verify_insecure = false
use_pkce = true
#################################### Generic OAuth #######################
[auth.generic_oauth]
name = OAuth
icon = signin
enabled = false
allow_sign_up = true
auto_login = false
client_id = some_id
client_secret =
scopes = user:email
empty_scopes = false
email_attribute_name = email:primary
email_attribute_path =
login_attribute_path =
name_attribute_path =
role_attribute_path =
role_attribute_strict = false
groups_attribute_path =
id_token_attribute_name =
team_ids_attribute_path =
auth_url =
token_url =
api_url =
teams_url =
allowed_domains =
team_ids =
allowed_organizations =
tls_skip_verify_insecure = false
tls_client_cert =
tls_client_key =
tls_client_ca =
use_pkce = false
auth_style =
allow_assign_grafana_admin = false
skip_org_role_sync = false
#################################### Basic Auth ##########################
[auth.basic]
enabled = true
#################################### Auth Proxy ##########################
[auth.proxy]
enabled = false
header_name = X-WEBAUTH-USER
header_property = username
auto_sign_up = true
sync_ttl = 60
whitelist =
headers =
headers_encoded = false
enable_login_token = false
#################################### Auth JWT ##########################
[auth.jwt]
enabled = false
enable_login_token = false
header_name =
email_claim =
username_claim =
jwk_set_url =
jwk_set_file =
cache_ttl = 60m
expect_claims = {}
key_file =
role_attribute_path =
role_attribute_strict = false
auto_sign_up = false
url_login = false
allow_assign_grafana_admin = false
skip_org_role_sync = false
#################################### Auth LDAP ###########################
[auth.ldap]
enabled = false
config_file = /etc/grafana/ldap.toml
allow_sign_up = true
skip_org_role_sync = false
# LDAP background sync (Enterprise only)
# At 1 am every day
sync_cron = "0 1 * * *"
active_sync_enabled = true
#################################### AWS ###########################
[aws]
# Enter a comma-separated list of allowed AWS authentication providers.
# Options are: default (AWS SDK Default), keys (Access && secret key), credentials (Credentials field), ec2_iam_role (EC2 IAM Role)
allowed_auth_providers = default,keys,credentials
# Allow AWS users to assume a role using temporary security credentials.
# If true, assume role will be enabled for all AWS authentication providers that are specified in aws_auth_providers
assume_role_enabled = true
# Specify max no of pages to be returned by the ListMetricPages API
list_metrics_page_limit = 500
#################################### Azure ###############################
[azure]
# Azure cloud environment where Grafana is hosted
# Possible values are AzureCloud, AzureChinaCloud, AzureUSGovernment and AzureGermanCloud
# Default value is AzureCloud (i.e. public cloud)
cloud = AzureCloud
# Specifies whether Grafana hosted in Azure service with Managed Identity configured (e.g. Azure Virtual Machines instance)
# If enabled, the managed identity can be used for authentication of Grafana in Azure services
# Disabled by default, needs to be explicitly enabled
managed_identity_enabled = false
# Client ID to use for user-assigned managed identity
# Should be set for user-assigned identity and should be empty for system-assigned identity
managed_identity_client_id =
# Specifies whether user identity authentication (on behalf of currently signed-in user) should be enabled in datasources
# that support it (requires AAD authentication)
# Disabled by default, needs to be explicitly enabled
user_identity_enabled = false
# Override token URL for Azure Active Directory
# By default is the same as token URL configured for AAD authentication settings
user_identity_token_url =
# Override ADD application ID which would be used to exchange users token to an access token for the datasource
# By default is the same as used in AAD authentication or can be set to another application (for OBO flow)
user_identity_client_id =
# Override the AAD application client secret
# By default is the same as used in AAD authentication or can be set to another application (for OBO flow)
user_identity_client_secret =
#################################### Role-based Access Control ###########
[rbac]
# If enabled, cache permissions in a in memory cache
permission_cache = true
# Reset basic roles permissions on boot
# Warning left to true, basic roles permissions will be reset on every boot
reset_basic_roles = false
#################################### SMTP / Emailing #####################
[smtp]
enabled = false
host = localhost:25
user =
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
password =
cert_file =
key_file =
skip_verify = false
from_address = [email protected]
from_name = Grafana
ehlo_identity =
startTLS_policy =
[emails]
welcome_email_on_sign_up = false
templates_pattern = emails/*.html, emails/*.txt
content_types = text/html
#################################### Logging ##########################
[log]
# Either "console", "file", "syslog". Default is console and file
# Use space to separate multiple modes, e.g. "console file"
mode = console file
# Either "debug", "info", "warn", "error", "critical", default is "info"
#level = info
level = warn
# optional settings to set different levels for specific loggers. Ex filters = sqlstore:debug
filters =
# For "console" mode only
[log.console]
level =
# log line format, valid options are text, console and json
format = console
# For "file" mode only
[log.file]
level =
# log line format, valid options are text, console and json
format = text
# This enables automated log rotate(switch of following options), default is true
log_rotate = true
# Max line number of single file, default is 1000000
max_lines = 1000000
# Max size shift of single file, default is 28 means 1 << 28, 256MB
max_size_shift = 28
# Segment log daily, default is true
daily_rotate = true
# Expired days of log file(delete after max days), default is 7
max_days = 7
[log.syslog]
level =
# log line format, valid options are text, console and json
format = text
# Syslog network type and address. This can be udp, tcp, or unix. If left blank, the default unix endpoints will be used.
network =
address =
# Syslog facility. user, daemon and local0 through local7 are valid.
facility =
# Syslog tag. By default, the process' argv[0] is used.
tag =
[log.frontend]
# Should Faro javascript agent be initialized
enabled = false
# Custom HTTP endpoint to send events to. Default will log the events to stdout.
custom_endpoint =
# Requests per second limit enforced per an extended period, for Grafana backend log ingestion endpoint (/log).
log_endpoint_requests_per_second_limit = 3
# Max requests accepted per short interval of time for Grafana backend log ingestion endpoint (/log)
log_endpoint_burst_limit = 15
# Should error instrumentation be enabled, only affects Grafana Javascript Agent
instrumentations_errors_enabled = true
# Should console instrumentation be enabled, only affects Grafana Javascript Agent
instrumentations_console_enabled = false
# Should webvitals instrumentation be enabled, only affects Grafana Javascript Agent
instrumentations_webvitals_enabled = false
# Api Key, only applies to Grafana Javascript Agent provider
api_key =
#################################### Usage Quotas ########################
[quota]
enabled = false
#### set quotas to -1 to make unlimited. ####
# limit number of users per Org.
org_user = 10
# limit number of dashboards per Org.
org_dashboard = 100
# limit number of data_sources per Org.
org_data_source = 10
# limit number of api_keys per Org.
org_api_key = 10
# limit number of alerts per Org.
org_alert_rule = 100
# limit number of orgs a user can create.
user_org = 10
# Global limit of users.
global_user = -1
# global limit of orgs.
global_org = -1
# global limit of dashboards
global_dashboard = -1
# global limit of api_keys
global_api_key = -1
# global limit on number of logged in users.
global_session = -1
# global limit of alerts
global_alert_rule = -1
# global limit of files uploaded to the SQL DB
global_file = 1000
# global limit of correlations
global_correlations = -1
#################################### Unified Alerting ####################
[unified_alerting]
# Enable the Unified Alerting sub-system and interface. When enabled we'll migrate all of your alert rules and notification channels to the new system. New alert rules will be created and your notification channels will be converted into an Alertmanager configuration. Previous data is preserved to enable backwards compatibility but new data is removed when switching. When this configuration section and flag are not defined, the state is defined at runtime. See the documentation for more details.
enabled =
# Comma-separated list of organization IDs for which to disable unified alerting. Only supported if unified alerting is enabled.
disabled_orgs =
# Specify the frequency of polling for admin config changes.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
admin_config_poll_interval = 60s
# Specify the frequency of polling for Alertmanager config changes.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
alertmanager_config_poll_interval = 60s
# The redis server address that should be connected to.
ha_redis_address =
# The username that should be used to authenticate with the redis server.
ha_redis_username =
# The password that should be used to authenticate with the redis server.
ha_redis_password =
# The redis database, by default it's 0.
ha_redis_db =
# A prefix that is used for every key or channel that is created on the redis server
# as part of HA for alerting.
ha_redis_prefix =
# The name of the cluster peer that will be used as identifier. If none is
# provided, a random one will be generated.
ha_redis_peer_name =
# Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port.
ha_listen_address = "0.0.0.0:9094"
# Explicit address/hostname and port to advertise other Grafana instances. The port is used for both TCP and UDP.
ha_advertise_address = ""
# Comma-separated list of initial instances (in a format of host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting.
ha_peers = ""
# Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will
# be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long should
# each instance wait before sending the notification to take into account replication lag.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_peer_timeout = 15s
# The label is an optional string to include on each packet and stream.
# It uniquely identifies the cluster and prevents cross-communication
# issues when sending gossip messages in an enviromenet with multiple clusters.
ha_label =
# The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated
# across cluster more quickly at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_gossip_interval = 200ms
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
# across larger clusters at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_push_pull_interval = 60s
# Enable or disable alerting rule execution. The alerting UI remains visible. This option has a legacy version in the `[alerting]` section that takes precedence.
execute_alerts = true
# Alert evaluation timeout when fetching data from the datasource. This option has a legacy version in the `[alerting]` section that takes precedence.
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
evaluation_timeout = 30s
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence.
max_attempts = 3
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
min_interval = 10s
[unified_alerting.screenshots]
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
# plugin, or set up Grafana to use a remote rendering service.
# For more information on configuration options, refer to [rendering].
capture = false
# The timeout for capturing screenshots. If a screenshot cannot be captured within the timeout then
# the notification is sent without a screenshot. The maximum duration is 30 seconds. This timeout
# should be less than the minimum Interval of all Evaluation Groups to avoid back pressure on alert
# rule evaluation.
capture_timeout = 10s
# The maximum number of screenshots that can be taken at the same time. This option is different from
# concurrent_render_request_limit as max_concurrent_screenshots sets the number of concurrent screenshots
# that can be taken at the same time for all firing alerts where as concurrent_render_request_limit sets
# the total number of concurrent screenshots across all Grafana services.
max_concurrent_screenshots = 5
# Uploads screenshots to the local Grafana server or remote storage such as Azure, S3 and GCS. Please
# see [external_image_storage] for further configuration options. If this option is false then
# screenshots will be persisted to disk for up to temp_data_lifetime.
upload_external_image_storage = false
[unified_alerting.reserved_labels]
# Comma-separated list of reserved labels added by the Grafana Alerting engine that should be disabled.
# For example: `disabled_labels=grafana_folder`
disabled_labels =
[unified_alerting.state_history]
# Enable the state history functionality in Unified Alerting. The previous states of alert rules will be visible in panels and in the UI.
enabled = true
# Select which pluggable state history backend to use. Either "annotations", "loki", or "multiple"
# "loki" writes state history to an external Loki instance. "multiple" allows history to be written to multiple backends at once.
# Defaults to "annotations".
backend =
# For "multiple" only.
# Indicates the main backend used to serve state history queries.
# Either "annotations" or "loki"
primary =
# For "multiple" only.
# Comma-separated list of additional backends to write state history data to.
secondaries =
# For "loki" only.
# URL of the external Loki instance.
# Either "loki_remote_url", or both of "loki_remote_read_url" and "loki_remote_write_url" is required for the "loki" backend.
loki_remote_url =
# For "loki" only.
# URL of the external Loki's read path. To be used in configurations where Loki has separated read and write URLs.
# Either "loki_remote_url", or both of "loki_remote_read_url" and "loki_remote_write_url" is required for the "loki" backend.
loki_remote_read_url =
# For "loki" only.
# URL of the external Loki's write path. To be used in configurations where Loki has separated read and write URLs.
# Either "loki_remote_url", or both of "loki_remote_read_url" and "loki_remote_write_url" is required for the "loki" backend.
loki_remote_write_url =
# For "loki" only.
# Optional tenant ID to attach to requests sent to Loki.
loki_tenant_id =
# For "loki" only.
# Optional username for basic authentication on requests sent to Loki. Can be left blank to disable basic auth.
loki_basic_auth_username =
# For "loki" only.
# Optional password for basic authentication on requests sent to Loki. Can be left blank.
loki_basic_auth_password =
[unified_alerting.state_history.external_labels]
# Optional extra labels to attach to outbound state history records or log streams.
# Any number of label key-value-pairs can be provided.
#
# ex.
# mylabelkey = mylabelvalue
#################################### Alerting ############################
[alerting]
# Enable the legacy alerting sub-system and interface. If Unified Alerting is already enabled and you try to go back to legacy alerting, all data that is part of Unified Alerting will be deleted. When this configuration section and flag are not defined, the state is defined at runtime. See the documentation for more details.
enabled =
# Makes it possible to turn off alert execution but alerting UI is visible
execute_alerts = true
# Default setting for new alert rules. Defaults to categorize error and timeouts as alerting. (alerting, keep_state)
error_or_timeout = alerting
# Default setting for how Grafana handles nodata or null values in alerting. (alerting, no_data, keep_state, ok)
nodata_or_nullvalues = no_data
# Alert notifications can include images, but rendering many images at the same time can overload the server
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
concurrent_render_limit = 5
# Default setting for alert calculation timeout. Default value is 30
evaluation_timeout_seconds = 30
# Default setting for alert notification timeout. Default value is 30
notification_timeout_seconds = 30
# Default setting for max attempts to sending alert notifications. Default value is 3
max_attempts = 3
# Makes it possible to enforce a minimal interval between evaluations, to reduce load on the backend
min_interval_seconds = 1
# Configures for how long alert annotations are stored. Default is 0, which keeps them forever.
# This setting should be expressed as an duration. Ex 6h (hours), 10d (days), 2w (weeks), 1M (month).
max_annotation_age =
# Configures max number of alert annotations that Grafana stores. Default value is 0, which keeps all alert annotations.
max_annotations_to_keep =
#################################### Annotations #########################
[annotations]
# Configures the batch size for the annotation clean-up job. This setting is used for dashboard, API, and alert annotations.
cleanupjob_batchsize = 100
# Enforces the maximum allowed length of the tags for any newly introduced annotations. It can be between 500 and 4096 inclusive (which is the respective's column length). Default value is 500.
# Setting it to a higher value would impact performance therefore is not recommended.
tags_length = 500
[annotations.dashboard]
# Dashboard annotations means that annotations are associated with the dashboard they are created on.
# Configures how long dashboard annotations are stored. Default is 0, which keeps them forever.
# This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month).
max_age =
# Configures max number of dashboard annotations that Grafana stores. Default value is 0, which keeps all dashboard annotations.
max_annotations_to_keep =
[annotations.api]
# API annotations means that the annotations have been created using the API without any
# association with a dashboard.
# Configures how long Grafana stores API annotations. Default is 0, which keeps them forever.
# This setting should be expressed as a duration. Examples: 6h (hours), 10d (days), 2w (weeks), 1M (month).
max_age =
# Configures max number of API annotations that Grafana keeps. Default value is 0, which keeps all API annotations.
max_annotations_to_keep =
#################################### Explore #############################
[explore]
# Enable the Explore section
enabled = true
#################################### Help #############################
[help]
# Enable the Help section
enabled = true
#################################### Profile #############################
[profile]
# Enable the Profile section
enabled = true
#################################### Query History #############################
[query_history]
# Enable the Query history
enabled = true
#################################### Internal Grafana Metrics ############
# Metrics available at HTTP URL /metrics and /metrics/plugins/:pluginId
[metrics]
enabled = true
interval_seconds = 10
# Disable total stats (stat_totals_*) metrics to be generated
disable_total_stats = false
#If both are set, basic auth will be required for the metrics endpoints.
basic_auth_username =
basic_auth_password =
# Metrics environment info adds dimensions to the `grafana_environment_info` metric, which
# can expose more information about the Grafana instance.
[metrics.environment_info]
#exampleLabel1 = exampleValue1
#exampleLabel2 = exampleValue2
# Send internal Grafana metrics to graphite
[metrics.graphite]
# Enable by setting the address setting (ex localhost:2003)
address =
prefix = prod.grafana.%(instance_name)s.
#################################### Grafana.com integration ##########################
[grafana_net]
url = https://grafana.com
[grafana_com]
url = https://grafana.com
api_url = https://grafana.com/api
#################################### Distributed tracing ############
# Opentracing is deprecated use opentelemetry instead
[tracing.jaeger]
# jaeger destination (ex localhost:6831)
address =
# tag that will always be included in when creating new spans. ex (tag1:value1,tag2:value2)
always_included_tag =
# Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote
sampler_type = const
# jaeger samplerconfig param
# for "const" sampler, 0 or 1 for always false/true respectively
# for "probabilistic" sampler, a probability between 0 and 1
# for "rateLimiting" sampler, the number of spans per second
# for "remote" sampler, param is the same as for "probabilistic"
# and indicates the initial sampling rate before the actual one
# is received from the mothership
sampler_param = 1
# sampling_server_url is the URL of a sampling manager providing a sampling strategy.
sampling_server_url =
# Whether or not to use Zipkin span propagation (x-b3- HTTP headers).
zipkin_propagation = false
# Setting this to true disables shared RPC spans.
# Not disabling is the most common setting when using Zipkin elsewhere in your infrastructure.
disable_shared_zipkin_spans = false
[tracing.opentelemetry]
# attributes that will always be included in when creating new spans. ex (key1:value1,key2:value2)
custom_attributes =
[tracing.opentelemetry.jaeger]
# jaeger destination (ex http://localhost:14268/api/traces)
address =
# Propagation specifies the text map propagation format: w3c, jaeger
propagation =
# This is a configuration for OTLP exporter with GRPC protocol
[tracing.opentelemetry.otlp]
# otlp destination (ex localhost:4317)
address =
# Propagation specifies the text map propagation format: w3c, jaeger
propagation =
#################################### External Image Storage ##############
[external_image_storage]
# Used for uploading images to public servers so they can be included in slack/email messages.
# You can choose between (s3, webdav, gcs, azure_blob, local)
provider =
[external_image_storage.s3]
endpoint =
path_style_access =
bucket_url =
bucket =
region =
path =
access_key =
secret_key =
[external_image_storage.webdav]
url =
username =
password =
public_url =
[external_image_storage.gcs]
key_file =
bucket =
path =
enable_signed_urls = false
signed_url_expiration =
[external_image_storage.azure_blob]
account_name =
account_key =
container_name =
sas_token_expiration_days =
[external_image_storage.local]
# does not require any configuration
[rendering]
# Options to configure a remote HTTP image rendering service, e.g. using https://github.com/grafana/grafana-image-renderer.
# URL to a remote HTTP image renderer service, e.g. http://localhost:8081/render, will enable Grafana to render panels and dashboards to PNG-images using HTTP requests to an external service.
server_url =
# If the remote HTTP image renderer service runs on a different server than the Grafana server you may have to configure this to a URL where Grafana is reachable, e.g. http://grafana.domain/.
callback_url =
# An auth token that will be sent to and verified by the renderer. The renderer will deny any request without an auth token matching the one configured on the renderer side.
renderer_token = -
# Concurrent render request limit affects when the /render HTTP endpoint is used. Rendering many images at the same time can overload the server,
# which this setting can help protect against by only allowing a certain amount of concurrent requests.
concurrent_render_request_limit = 30
# Determines the lifetime of the render key used by the image renderer to access and render Grafana.
# This setting should be expressed as a duration. Examples: 10s (seconds), 5m (minutes), 2h (hours).
# Default is 5m. This should be more than enough for most deployments.
# Change the value only if image rendering is failing and you see `Failed to get the render key from cache` in Grafana logs.
render_key_lifetime = 5m
[panels]
# here for to support old env variables, can remove after a few months
enable_alpha = false
disable_sanitize_html = false
[plugins]
enable_alpha = false
app_tls_skip_verify_insecure = false
# Enter a comma-separated list of plugin identifiers to identify plugins to load even if they are unsigned. Plugins with modified signatures are never loaded.
allow_loading_unsigned_plugins =
# Enable or disable installing / uninstalling / updating plugins directly from within Grafana.
plugin_admin_enabled = true
plugin_admin_external_manage_enabled = false
plugin_catalog_url = https://grafana.com/grafana/plugins/
# Enter a comma-separated list of plugin identifiers to hide in the plugin catalog.
plugin_catalog_hidden_plugins =
# Log all backend requests for core and external plugins.
log_backend_requests = false
# Force download of public key for verifying plugin signature on startup.
enforce_public_key_download = false
#################################### Grafana Live ##########################################
[live]
# max_connections to Grafana Live WebSocket endpoint per Grafana server instance. See Grafana Live docs
# if you are planning to make it higher than default 100 since this can require some OS and infrastructure
# tuning. 0 disables Live, -1 means unlimited connections.
max_connections = 100
# allowed_origins is a comma-separated list of origins that can establish connection with Grafana Live.
# If not set then origin will be matched over root_url. Supports wildcard symbol "*".
allowed_origins =
# engine defines an HA (high availability) engine to use for Grafana Live. By default no engine used - in
# this case Live features work only on a single Grafana server.
# Available options: "redis".
# Setting ha_engine is an EXPERIMENTAL feature.
ha_engine =
# ha_engine_address sets a connection address for Live HA engine. Depending on engine type address format can differ.
# For now we only support Redis connection address in "host:port" format.
# This option is EXPERIMENTAL.
ha_engine_address = "127.0.0.1:6379"
#################################### Grafana Image Renderer Plugin ##########################
[plugin.grafana-image-renderer]
# Instruct headless browser instance to use a default timezone when not provided by Grafana, e.g. when rendering panel image of alert.
# See ICU’s metaZones.txt (https://cs.chromium.org/chromium/src/third_party/icu/source/data/misc/metaZones.txt) for a list of supported
# timezone IDs. Fallbacks to TZ environment variable if not set.
rendering_timezone =
# Instruct headless browser instance to use a default language when not provided by Grafana, e.g. when rendering panel image of alert.
# Please refer to the HTTP header Accept-Language to understand how to format this value, e.g. 'fr-CH, fr;q=0.9, en;q=0.8, de;q=0.7, *;q=0.5'.
rendering_language =
# Instruct headless browser instance to use a default device scale factor when not provided by Grafana, e.g. when rendering panel image of alert.
# Default is 1. Using a higher value will produce more detailed images (higher DPI), but will require more disk space to store an image.
rendering_viewport_device_scale_factor =
# Instruct headless browser instance whether to ignore HTTPS errors during navigation. Per default HTTPS errors are not ignored. Due to
# the security risk it's not recommended to ignore HTTPS errors.
rendering_ignore_https_errors =
# Instruct headless browser instance whether to capture and log verbose information when rendering an image. Default is false and will
# only capture and log error messages. When enabled, debug messages are captured and logged as well.
# For the verbose information to be included in the Grafana server log you have to adjust the rendering log level to debug, configure
# [log].filter = rendering:debug.
rendering_verbose_logging =
# Instruct headless browser instance whether to output its debug and error messages into running process of remote rendering service.
# Default is false. This can be useful to enable (true) when troubleshooting.
rendering_dumpio =
# Additional arguments to pass to the headless browser instance. Default is --no-sandbox. The list of Chromium flags can be found
# here (https://peter.sh/experiments/chromium-command-line-switches/). Multiple arguments is separated with comma-character.
rendering_args =
# You can configure the plugin to use a different browser binary instead of the pre-packaged version of Chromium.
# Please note that this is not recommended, since you may encounter problems if the installed version of Chrome/Chromium is not
# compatible with the plugin.
rendering_chrome_bin =
# Instruct how headless browser instances are created. Default is 'default' and will create a new browser instance on each request.
# Mode 'clustered' will make sure that only a maximum of browsers/incognito pages can execute concurrently.
# Mode 'reusable' will have one browser instance and will create a new incognito page on each request.
rendering_mode =
# When rendering_mode = clustered, you can instruct how many browsers or incognito pages can execute concurrently. Default is 'browser'
# and will cluster using browser instances.
# Mode 'context' will cluster using incognito pages.
rendering_clustering_mode =
# When rendering_mode = clustered, you can define the maximum number of browser instances/incognito pages that can execute concurrently. Default is '5'.
rendering_clustering_max_concurrency =
# When rendering_mode = clustered, you can specify the duration a rendering request can take before it will time out. Default is `30` seconds.
rendering_clustering_timeout =
# Limit the maximum viewport width, height and device scale factor that can be requested.
rendering_viewport_max_width =
rendering_viewport_max_height =
rendering_viewport_max_device_scale_factor =
# Change the listening host and port of the gRPC server. Default host is 127.0.0.1 and default port is 0 and will automatically assign
# a port not in use.
grpc_host =
grpc_port =
[enterprise]
license_path =
[feature_toggles]
# there are currently two ways to enable feature toggles in the `grafana.ini`.
# you can either pass an array of feature you want to enable to the `enable` field or
# configure each toggle by setting the name of the toggle to true/false. Toggles set to true/false
# will take precedence over toggles in the `enable` list.
# enable = feature1,feature2
enable =
# Some features are enabled by default, see:
# https://grafana.com/docs/grafana/next/setup-grafana/configure-grafana/feature-toggles/
# To enable features by default, set `Expression: "true"` in:
# https://github.com/grafana/grafana/blob/main/pkg/services/featuremgmt/registry.go
# feature1 = true
# feature2 = false
[date_formats]
# For information on what formatting patterns that are supported https://momentjs.com/docs/#/displaying/
# Default system date format used in time range picker and other places where full time is displayed
full_date = YYYY-MM-DD HH:mm:ss
# Used by graph and other places where we only show small intervals
interval_second = HH:mm:ss
interval_minute = HH:mm
interval_hour = MM/DD HH:mm
interval_day = MM/DD
interval_month = YYYY-MM
interval_year = YYYY
# Experimental feature
use_browser_locale = false
# Default timezone for user preferences. Options are 'browser' for the browser local timezone or a timezone name from IANA Time Zone database, e.g. 'UTC' or 'Europe/Amsterdam' etc.
default_timezone = browser
[expressions]
# Enable or disable the expressions functionality.
enabled = true
[geomap]
# Set the JSON configuration for the default basemap
default_baselayer_config =
# Enable or disable loading other base map layers
enable_custom_baselayers = true
#################################### Support Bundles #####################################
[support_bundles]
# Enable support bundle creation (default: true)
enabled = true
# Only server admins can generate and view support bundles (default: true)
server_admin_only = true
# If set, bundles will be encrypted with the provided public keys separated by whitespace
public_keys = ""
#################################### Storage ################################################
[storage]
# Allow uploading SVG files without sanitization.
allow_unsanitized_svg_upload = false
#################################### Search ################################################
[search]
# Defines the number of dashboards loaded at once in a batch during a full reindex.
# This is a temporary settings that might be removed in the future.
dashboard_loading_batch_size = 200
# Defines the frequency of a full search reindex.
# This is a temporary settings that might be removed in the future.
full_reindex_interval = 5m
# Defines the frequency of partial index updates based on recent changes such as dashboard updates.
# This is a temporary settings that might be removed in the future.
index_update_interval = 10s
# Move an app plugin referenced by its id (including all its pages) to a specific navigation section
# Format: <Plugin ID> = <Section ID> <Sort Weight>
[navigation.app_sections]
# Move a specific app plugin page (referenced by its `path` field) to a specific navigation section
# Format: <Page URL> = <Section ID> <Sort Weight>
[navigation.app_standalone_pages]
#################################### Secure Socks5 Datasource Proxy #####################################
[secure_socks_datasource_proxy]
enabled = false
root_ca_cert =
client_key =
client_cert =
server_name =
# The address of the socks5 proxy datasources should connect to
proxy_address =
conf/alertmanager/rules.yml
[Fichier]
Vous n'aurez peut-être pas besoin de l'ensemble de ces alertes mais elle constituent une bonne base de suivi.
# 2023-05-28
groups:
- name: Alerts
rules:
# Host out of memory
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host memory under memory pressure
- alert: HostMemoryUnderMemoryPressure
expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host Memory is under utilized
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderUtilized
expr: (100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is under utilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host unusual network throughput in
- alert: HostUnusualNetworkThroughputIn
expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host unusual network throughput out
- alert: HostUnusualNetworkThroughputOut
expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host unusual disk read rate
- alert: HostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host unusual disk write rate
- alert: HostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host out of disk space
# Please add ignored mountpoints in node_exporter parameters like
# \"--collector.filesystem.ignored-mount-points=^/\(sys|proc|dev|run)\(\$|/)\".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host disk will fill in 24 hours
# Please add ignored mountpoints in node_exporter parameters like
# \"--collector.filesystem.ignored-mount-points=^/\(sys|proc|dev|run)\(\$|/)\".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskWillFillIn24Hours
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host out of inodes
- alert: HostOutOfInodes
expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host inodes will fill in 24 hours
- alert: HostInodesWillFillIn24Hours
expr: (node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host unusual disk read latency
- alert: HostUnusualDiskReadLatency
expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host unusual disk write latency
- alert: HostUnusualDiskWriteLatency
expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host high CPU
- alert: HostHighCpuLoad
expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host CPU is under utilized
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderUtilized
expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is under utilized (instance {{ $labels.instance }})
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host CPU steal noisy neighbor
- alert: HostCpuStealNoisyNeighbor
expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host CPU high iowait
- alert: HostCpuHighIowait
expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host unusual disk IO
- alert: HostUnusualDiskIo
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## Host context switching
# # 1000 context switches is an arbitrary number.
# # Alert threshold depends on nature of application.
# # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
# - alert: HostContextSwitching
# expr: ((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host context switching (instance {{ $labels.instance }})
# description: "Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host swap is filling up
- alert: HostSwapIsFillingUp
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host systemd service crashed
- alert: HostSystemdServiceCrashed
expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host kernel version deviations
- alert: HostKernelVersionDeviations
expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host OOM kill detected
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host Network Receive Errors
- alert: HostNetworkReceiveErrors
expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host Network Transmit Errors
- alert: HostNetworkTransmitErrors
expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host Network Interface Saturated
- alert: HostNetworkInterfaceSaturated
expr: \(\(rate\(node\_network\_receive\_bytes\_total{device!~\"^tap.\*|^vnet.\*|^veth.\*|^tun.\*\"}\[1m]) + rate\(node\_network\_transmit\_bytes\_total{device!~\"^tap.\*|^vnet.\*|^veth.\*|^tun.\*\"}\[1m])) / node\_network\_speed\_bytes{device!~\"^tap.\*|^vnet.\*|^veth.\*|^tun.\*\"} > 0.8 < 10000) \* on\(instance) group\_left \(nodename) node\_uname\_info{nodename=~\".+\"}
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host Network Bond Degraded
- alert: HostNetworkBondDegraded
expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host conntrack limit
- alert: HostConntrackLimit
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host clock skew
- alert: HostClockSkew
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host clock not synchronising
- alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Host requires reboot
- alert: HostRequiresReboot
expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## Container killed
# # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
# - alert: ContainerKilled
# expr: time() - container_last_seen > 60
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Container killed (instance {{ $labels.instance }})
# description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Container CPU usage
- alert: ContainerCpuUsage
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container CPU usage (instance {{ $labels.instance }})
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Container Memory usage
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
- alert: ContainerMemoryUsage
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container Memory usage (instance {{ $labels.instance }})
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Container Volume usage
- alert: ContainerVolumeUsage
expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: Container Volume usage (instance {{ $labels.instance }})
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Container high throttle rate
- alert: ContainerHighThrottleRate
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
for: 2m
labels:
severity: warning
annotations:
summary: Container high throttle rate (instance {{ $labels.instance }})
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Container Low CPU utilization
- alert: ContainerLowCpuUtilization
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20
for: 7d
labels:
severity: info
annotations:
summary: Container Low CPU utilization (instance {{ $labels.instance }})
description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# ContainerLowMemoryUsage
- alert: ContainerLowMemoryUsage
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20
for: 7d
labels:
severity: info
annotations:
summary: Container Low Memory usage (instance {{ $labels.instance }})
description: "Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Traefik service down
- alert: TraefikServiceDown
expr: count(traefik_service_server_up) by (service) == 0
for: 0m
labels:
severity: critical
annotations:
summary: Traefik service down (instance {{ $labels.instance }})
description: "All Traefik services are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Traefik high HTTP 4xx error rate service
- alert: TraefikHighHttp4xxErrorRateService
expr: sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 10
for: 1m
labels:
severity: critical
annotations:
summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
description: "Traefik service 4xx error rate is above 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Traefik high HTTP 5xx error rate service
- alert: TraefikHighHttp5xxErrorRateService
expr: sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 10
for: 1m
labels:
severity: critical
annotations:
summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
description: "Traefik service 5xx error rate is above 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# MySQL down
- alert: MysqlDown
expr: mysql_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: MySQL down (instance {{ $labels.instance }})
description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# MySQL too many connections (> 80%)
- alert: MysqlTooManyConnections(>80%)
expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# MySQL high threads running
- alert: MysqlHighThreadsRunning
expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
for: 2m
labels:
severity: warning
annotations:
summary: MySQL high threads running (instance {{ $labels.instance }})
description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# MySQL slow queries
- alert: MysqlSlowQueries
expr: increase(mysql_global_status_slow_queries[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
summary: MySQL slow queries (instance {{ $labels.instance }})
description: "MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql down
- alert: PostgresqlDown
expr: pg_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql too many connections
- alert: PostgresqlTooManyConnections
expr: sum by \(datname) \(pg\_stat\_activity\_count{datname!~\"template.\*|postgres\"}) > pg\_settings\_max\_connections \* 0.8
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many connections (instance {{ $labels.instance }})
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## Postgresql not enough connections
# - alert: PostgresqlNotEnoughConnections
# expr: sum by \(datname) \(pg\_stat\_activity\_count{datname!~\"template.\*|postgres\"}) < 5
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Postgresql not enough connections (instance {{ $labels.instance }})
# description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql dead locks
- alert: PostgresqlDeadLocks
expr: increase\(pg\_stat\_database\_deadlocks{datname!~\"template.\*|postgres\"}\[1m]) > 5
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql high rollback rate
- alert: PostgresqlHighRollbackRate
expr: sum by \(namespace,datname) \(\(rate\(pg\_stat\_database\_xact\_rollback{datname!~\"template.\*|postgres\",datid!=\"0\"}\[3m])) / \(\(rate\(pg\_stat\_database\_xact\_rollback{datname!~\"template.\*|postgres\",datid!=\"0\"}\[3m])) + \(rate\(pg\_stat\_database\_xact\_commit{datname!~\"template.\*|postgres\",datid!=\"0\"}\[3m])))) > 0.02
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## Postgresql commit rate low
# - alert: PostgresqlCommitRateLow
# expr: rate(pg_stat_database_xact_commit[1m]) < 10
# for: 2m
# labels:
# severity: critical
# annotations:
# summary: Postgresql commit rate low (instance {{ $labels.instance }})
# description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql high rate statement timeout
- alert: PostgresqlHighRateStatementTimeout
expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql high rate deadlock
- alert: PostgresqlHighRateDeadlock
expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql too many dead tuples
- alert: PostgresqlTooManyDeadTuples
expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql too many locks acquired
- alert: PostgresqlTooManyLocksAcquired
expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql bloat index high (> 80%)
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlBloatIndexHigh(>80%)
expr: pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)
for: 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Postgresql bloat table high (> 80%)
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlBloatTableHigh(>80%)
expr: pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)
for: 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Redis down
- alert: RedisDown
expr: redis_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Redis down (instance {{ $labels.instance }})
description: "Redis instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Redis out of system memory
# The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
- alert: RedisOutOfSystemMemory
expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
for: 2m
labels:
severity: warning
annotations:
summary: Redis out of system memory (instance {{ $labels.instance }})
description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Redis out of configured maxmemory
- alert: RedisOutOfConfiguredMaxmemory
expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90
for: 2m
labels:
severity: warning
annotations:
summary: Redis out of configured maxmemory (instance {{ $labels.instance }})
description: "Redis is running out of configured maxmemory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Redis too many connections
- alert: RedisTooManyConnections
expr: redis_connected_clients > 100
for: 2m
labels:
severity: warning
annotations:
summary: Redis too many connections (instance {{ $labels.instance }})
description: "Redis instance has too many connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## Redis not enough connections
# - alert: RedisNotEnoughConnections
# expr: redis_connected_clients < 5
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: Redis not enough connections (instance {{ $labels.instance }})
# description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Redis rejected connections
- alert: RedisRejectedConnections
expr: increase(redis_rejected_connections_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Redis rejected connections (instance {{ $labels.instance }})
description: "Some connections to Redis has been rejected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# JVM memory filling up
- alert: JvmMemoryFillingUp
expr: (sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: JVM memory filling up (instance {{ $labels.instance }})
description: "JVM memory is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# SSL certificate revoked
- alert: SslCertificateRevoked
expr: ssl_ocsp_response_status == 1
for: 0m
labels:
severity: critical
annotations:
summary: SSL certificate revoked (instance {{ $labels.instance }})
description: "SSL certificate revoked {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# SSL certificate expiry (< 7 days)
- alert: SslCertificateExpiry(<7Days)
expr: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7
for: 0m
labels:
severity: warning
annotations:
summary: SSL certificate expiry (< 7 days) (instance {{ $labels.instance }})
description: "{{ $labels.instance }} Certificate is expiring in 7 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Loki process too many restarts
- alert: LokiProcessTooManyRestarts
expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
for: 0m
labels:
severity: warning
annotations:
summary: Loki process too many restarts (instance {{ $labels.instance }})
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Loki request errors
- alert: LokiRequestErrors
expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
for: 15m
labels:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Loki request panic
- alert: LokiRequestPanic
expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
for: 5m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Loki request latency
- alert: LokiRequestLatency
expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Promtail request errors
- alert: PromtailRequestErrors
expr: 100 \* sum\(rate\(promtail\_request\_duration\_seconds\_count{status\_code=~\"5..|failed\"}\[1m])) by \(namespace, job, route, instance) / sum\(rate\(promtail\_request\_duration\_seconds\_count\[1m])) by \(namespace, job, route, instance) > 10
for: 5m
labels:
severity: critical
annotations:
summary: Promtail request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Promtail request latency
- alert: PromtailRequestLatency
expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1
for: 5m
labels:
severity: critical
annotations:
summary: Promtail request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Cloudflare http 4xx error rate
- alert: CloudflareHttp4xxErrorRate
expr: (sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5
for: 0m
labels:
severity: warning
annotations:
summary: Cloudflare http 4xx error rate (instance {{ $labels.instance }})
description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Cloudflare http 5xx error rate
- alert: CloudflareHttp5xxErrorRate
expr: (sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5
for: 0m
labels:
severity: critical
annotations:
summary: Cloudflare http 5xx error rate (instance {{ $labels.instance }})
description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
conf/provisioning/datasources/datasource.yml
[Fichier]
# 2023-05-24
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
isDefault: false
editable: true
url: http://prometheus:9090
access: proxy
- name: Loki
type: loki
editable: true
url: http://loki:3100
access: proxy
- name: Mimir
type: prometheus
isDefault: true
editable: true
url: http://mimir:9009/prometheus
access: proxy
jsonData:
httpHeaderName1: 'X-Scope-OrgID'
secureJsonData:
httpHeaderValue1: 'org'
conf/provisioning/dashboards/dashboard.yml
[Fichier]
# 2023-05-24
# config file version
apiVersion: 1
providers:
# <string> an unique provider name
- name: Org
# <int> org id. will default to orgId 1 if not specified
org_id: 1
# <string, required> name of the dashboard folder. Required
folder: "General"
# <string, required> provider type. Required
type: "file"
# <bool> disable dashboard deletion
disableDeletion: false
# <bool> enable dashboard editing
editable: true
# <int> how often Grafana will scan for changed dashboards
updateIntervalSeconds: 5
# <bool> allow updating provisioned dashboards from the UI
allowUiUpdates: true
options:
# <string, required> path to dashboard files on disk. Required
path: /etc/grafana/dashboards/provisioned
#path: /opt/monitor-stack/grafana/dashboards
# <bool> use folder names from filesystem to create folders in Grafana
foldersFromFilesStructure: true
conf/provisioning/provisioned/monitoring.json
[Fichier]
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "System Monitoring",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 8,
"links": [
{
"asDropdown": true,
"icon": "external link",
"includeVars": false,
"keepTime": false,
"tags": [
"Zogg"
],
"targetBlank": false,
"title": "Dashboards",
"tooltip": "",
"type": "dashboards",
"url": ""
}
],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 36,
"panels": [],
"title": "Global",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"decimals": 1,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 3,
"x": 0,
"y": 1
},
"id": 38,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": true,
"expr": "time() - node_boot_time_seconds{instance=\"$nodeexporter\"}",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"range": true,
"refId": "A",
"step": 1800
}
],
"title": "Uptime",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 3,
"x": 3,
"y": 1
},
"id": 40,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "count(container_last_seen{instance=~\"$cadvisor\",image=~\"(.)+\"})",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 1800
}
],
"title": "Containers",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 80
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 5,
"x": 6,
"y": 1
},
"id": 14,
"links": [],
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "100 - ((node_filesystem_avail_bytes{instance=\"$nodeexporter\",job=\"$vm\",mountpoint=\"/\",fstype!=\"rootfs\"} * 100) / node_filesystem_size_bytes{instance=\"$nodeexporter\",job=\"$vm\",mountpoint=\"/\",fstype!=\"rootfs\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A",
"step": 240
}
],
"title": "Root FS Used",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 3,
"x": 11,
"y": 1
},
"id": 16,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "count(count(node_cpu_seconds_total{instance=\"$nodeexporter\",job=\"$vm\"}) by (cpu))",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"range": true,
"refId": "A",
"step": 240
}
],
"title": "CPU Cores",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 1,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 6,
"x": 14,
"y": 1
},
"hideTimeOverride": true,
"id": 18,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_time_seconds{instance=\"$nodeexporter\",job=\"$vm\"} - node_boot_time_seconds{instance=\"$nodeexporter\",job=\"$vm\"}",
"intervalFactor": 1,
"refId": "A",
"step": 240
}
],
"title": "Uptime",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 85
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 20,
"y": 1
},
"id": 4,
"links": [],
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "(((count(count(node_cpu_seconds_total{instance=\"$nodeexporter\",job=\"$vm\"}) by (cpu))) - avg(sum by (mode)(rate(node_cpu_seconds_total{mode='idle',instance=\"$nodeexporter\",job=\"$vm\"}[$__rate_interval])))) * 100) / count(count(node_cpu_seconds_total{instance=\"$nodeexporter\",job=\"$vm\"}) by (cpu))",
"hide": false,
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 240
}
],
"title": "CPU Busy",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 0,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 70
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 90
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 3,
"x": 11,
"y": 3
},
"id": 20,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_filesystem_size_bytes{instance=\"$nodeexporter\",job=\"$vm\",mountpoint=\"/\",fstype!=\"rootfs\"}",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
"refId": "A",
"step": 240
}
],
"title": "RootFS Total",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 0,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 3,
"x": 14,
"y": 3
},
"id": 22,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_memory_MemTotal_bytes{instance=\"$nodeexporter\",job=\"$vm\"}",
"intervalFactor": 1,
"refId": "A",
"step": 240
}
],
"title": "RAM Total",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 0,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 3,
"x": 17,
"y": 3
},
"id": 24,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_memory_SwapTotal_bytes{instance=\"$nodeexporter\",job=\"$vm\"}",
"intervalFactor": 1,
"refId": "A",
"step": 240
}
],
"title": "SWAP Total",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 5
},
"id": 64,
"panels": [],
"title": "Usages",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 100,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "{id=\"/\",instance=\"cadvisor:8080\",job=\"prometheus\"}"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BA43A9",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byType",
"options": "time"
},
"properties": [
{
"id": "custom.axisPlacement",
"value": "hidden"
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 6,
"x": 0,
"y": 6
},
"id": 62,
"links": [],
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(container_cpu_system_seconds_total{instance=\"$cadvisor\"}[1m]))",
"hide": true,
"intervalFactor": 2,
"legendFormat": "a",
"range": true,
"refId": "B",
"step": 120
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(container_cpu_system_seconds_total{instance=\"$cadvisor\", name=~\".+\"}[1m]))",
"hide": true,
"interval": "",
"intervalFactor": 2,
"legendFormat": "nur container",
"range": true,
"refId": "F",
"step": 10
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(container_cpu_system_seconds_total{instance=\"$cadvisor\", id=\"/\"}[1m]))",
"hide": true,
"interval": "",
"intervalFactor": 2,
"legendFormat": "nur docker host",
"metric": "",
"range": true,
"refId": "A",
"step": 20
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(process_cpu_seconds_total{instance=\"$cadvisor\"}[$interval])) * 100",
"hide": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "host",
"metric": "",
"range": true,
"refId": "C",
"step": 600
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(container_cpu_system_seconds_total{instance=\"$cadvisor\", name=~\".+\"}[1m])) + sum(rate(container_cpu_system_seconds_total{instance=\"$cadvisor\", id=\"/\"}[1m])) + sum(rate(process_cpu_seconds_total[1m]))",
"hide": true,
"intervalFactor": 2,
"legendFormat": "",
"range": true,
"refId": "D",
"step": 120
}
],
"title": "CPU Usage",
"type": "timeseries"
},
{
"alert": {
"conditions": [
{
"evaluator": {
"params": [
1.25
],
"type": "gt"
},
"query": {
"params": [
"A",
"5m",
"now"
]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"frequency": "60s",
"handler": 1,
"name": "Panel Title alert",
"noDataState": "keep_state",
"notifications": [
{
"id": 1
}
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 100,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "line+area"
}
},
"mappings": [],
"max": 1.5,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "transparent",
"value": null
},
{
"color": "red",
"value": 1.25
}
]
},
"unit": "percentunit"
},
"overrides": [
{
"matcher": {
"id": "byType",
"options": "time"
},
"properties": [
{
"id": "custom.axisPlacement",
"value": "hidden"
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 6,
"x": 6,
"y": 6
},
"id": 66,
"links": [],
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "scalar(node_load1{ip=~\"$ip\"}) / scalar(count by(job, instance)(count by(job, instance, cpu)(node_cpu_seconds_total{ip=~\"$ip\"})))",
"intervalFactor": 2,
"range": true,
"refId": "A",
"step": 600
}
],
"title": "Load",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 80,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "IN on /sda"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#7EB26D",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "OUT on /sda"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#890F02",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byType",
"options": "time"
},
"properties": [
{
"id": "custom.axisPlacement",
"value": "hidden"
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 6,
"x": 12,
"y": 6
},
"id": 68,
"links": [],
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "-sum(rate(node_disk_read_bytes_total{instance=\"$nodeexporter\"}[$interval])) by (device)",
"hide": false,
"intervalFactor": 2,
"legendFormat": "OUT on /{{device}}",
"metric": "node_disk_bytes_read",
"range": true,
"refId": "A",
"step": 600
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(node_disk_written_bytes_total{instance=\"$nodeexporter\"}[$interval])) by (device)",
"intervalFactor": 2,
"legendFormat": "IN on /",
"metric": "",
"range": true,
"refId": "B",
"step": 600
}
],
"title": "Disk I/O",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 80,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "SENT"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#BF1B00",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byType",
"options": "time"
},
"properties": [
{
"id": "custom.axisPlacement",
"value": "hidden"
}
]
}
]
},
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 6
},
"id": 70,
"links": [],
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(container_network_receive_bytes_total{instance=\"$cadvisor\", id=\"/\"}[$interval])) by (id)",
"intervalFactor": 2,
"legendFormat": "RECEIVED",
"range": true,
"refId": "A",
"step": 600
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "- sum(rate(container_network_transmit_bytes_total{instance=\"$cadvisor\", id=\"/\"}[$interval])) by (id)",
"hide": false,
"intervalFactor": 2,
"legendFormat": "SENT",
"range": true,
"refId": "B",
"step": 600
}
],
"title": "Network Traffic",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 12
},
"id": 2,
"panels": [],
"title": "CPU / Mem / Disk",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 85
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 5,
"x": 0,
"y": 13
},
"id": 50,
"links": [],
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"text": {}
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": true,
"expr": "avg(node_load1) / count(count(node_cpu_seconds_total{instance=\"$nodeexporter\"}) by (cpu)) * 100",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"range": true,
"refId": "A",
"step": 240
}
],
"title": "Sys Load (1m avg)",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 85
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 5,
"x": 5,
"y": 13
},
"id": 6,
"links": [],
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "avg(node_load5{instance=\"$nodeexporter\",job=\"$vm\"}) / count(count(node_cpu_seconds_total{instance=\"$nodeexporter\",job=\"$vm\"}) by (cpu)) * 100",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
"range": true,
"refId": "A",
"step": 240
}
],
"title": "Sys Load (5m avg)",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 85
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 5,
"x": 10,
"y": 13
},
"id": 8,
"links": [],
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "avg(node_load15{instance=\"$nodeexporter\",job=\"$vm\"}) / count(count(node_cpu_seconds_total{instance=\"$nodeexporter\",job=\"$vm\"}) by (cpu)) * 100",
"hide": false,
"intervalFactor": 1,
"refId": "A",
"step": 240
}
],
"title": "Sys Load (15m avg)",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 0,
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 80
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 5,
"x": 15,
"y": 13
},
"hideTimeOverride": false,
"id": 10,
"links": [],
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "((node_memory_MemTotal_bytes{instance=\"$nodeexporter\",job=\"$vm\"} - node_memory_MemFree_bytes{instance=\"$nodeexporter\",job=\"$vm\"}) / (node_memory_MemTotal_bytes{instance=\"$nodeexporter\",job=\"$vm\"} )) * 100",
"format": "time_series",
"hide": true,
"intervalFactor": 1,
"refId": "A",
"step": 240
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "100 - ((node_memory_MemAvailable_bytes{instance=\"$nodeexporter\",job=\"$vm\"} * 100) / node_memory_MemTotal_bytes{instance=\"$nodeexporter\",job=\"$vm\"})",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
"refId": "B",
"step": 240
}
],
"title": "RAM Used",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 10
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 25
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 20,
"y": 13
},
"id": 12,
"links": [],
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "((node_memory_SwapTotal_bytes{instance=\"$nodeexporter\",job=\"$vm\"} - node_memory_SwapFree_bytes{instance=\"$nodeexporter\",job=\"$vm\"}) / (node_memory_SwapTotal_bytes{instance=\"$nodeexporter\",job=\"$vm\"} )) * 100",
"intervalFactor": 1,
"refId": "A",
"step": 240
}
],
"title": "SWAP Used",
"type": "gauge"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 17
},
"id": 26,
"panels": [],
"title": "System",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "counter",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"links": [],
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 18
},
"id": 28,
"links": [],
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max",
"min"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_load1{instance=\"$nodeexporter\",job=\"$vm\"}",
"format": "time_series",
"intervalFactor": 4,
"legendFormat": "Load 1m",
"refId": "A",
"step": 240
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_load5{instance=\"$nodeexporter\",job=\"$vm\"}",
"format": "time_series",
"intervalFactor": 4,
"legendFormat": "Load 5m",
"refId": "B",
"step": 240
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_load15{instance=\"$nodeexporter\",job=\"$vm\"}",
"format": "time_series",
"intervalFactor": 4,
"legendFormat": "Load 15m",
"refId": "C",
"step": 240
}
],
"title": "System Load",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 50,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 26
},
"id": 52,
"links": [],
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.0.5",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(container_cpu_usage_seconds_total{instance=~\"$cadvisor\", name=~\".+\"}[$interval])) by (name) * 100",
"hide": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"metric": "",
"range": true,
"refId": "F",
"step": 240
}
],
"title": "CPU Usage per Container",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 30,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineStyle": {
"fill": "solid"
},
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 32
},
"id": 75,
"links": [],
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum(container_memory_rss{instance=~\"$cadvisor\", name=~\".+\"}) by (name)",
"hide": false,
"intervalFactor": 2,
"legendFormat": "",
"range": true,
"refId": "A",
"step": 2
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "container_memory_usage_bytes{instance=~\"$cadvisor\", name=~\".+\"}",
"hide": true,
"intervalFactor": 2,
"legendFormat": "",
"range": true,
"refId": "B",
"step": 240
}
],
"title": "Memory Usage per Container",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 42
},
"id": 56,
"panels": [],
"title": "Network",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 43
},
"id": 58,
"links": [],
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(container_network_receive_bytes_total{instance=~\"$cadvisor\", name=~\".+\"}[$interval])) by (name)",
"intervalFactor": 2,
"legendFormat": "",
"range": true,
"refId": "A",
"step": 240
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "- rate(container_network_transmit_bytes_total{instance=~\"$cadvisor\", name=~\".+\"}[$interval])",
"hide": true,
"intervalFactor": 2,
"legendFormat": "",
"range": true,
"refId": "B",
"step": 10
}
],
"title": "Received Network Traffic per Container",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": true,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "Bps"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 50
},
"id": 60,
"links": [],
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(container_network_transmit_bytes_total{instance=~\"$cadvisor\", name=~\".+\"}[$interval])) by (name)",
"intervalFactor": 2,
"legendFormat": "",
"range": true,
"refId": "A",
"step": 240
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(container_network_transmit_bytes_total{instance=~\"$cadvisor\", id=\"/\"}[$interval])",
"hide": true,
"intervalFactor": 2,
"legendFormat": "",
"range": true,
"refId": "B",
"step": 10
}
],
"title": "Sent Network Traffic per Container",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 57
},
"id": 32,
"panels": [],
"title": "Storage",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "bytes",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"links": [],
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 10,
"w": 24,
"x": 0,
"y": 58
},
"id": 34,
"links": [],
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max",
"min"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_filesystem_avail_bytes{instance=\"$nodeexporter\",job=\"$vm\",device!~'rootfs'}",
"format": "time_series",
"hide": false,
"intervalFactor": 1,
"legendFormat": " - Available",
"metric": "",
"refId": "A",
"step": 240
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_filesystem_free_bytes{instance=\"$nodeexporter\",job=\"$vm\",device!~'rootfs'}",
"format": "time_series",
"hide": true,
"intervalFactor": 1,
"legendFormat": " - Free",
"refId": "B",
"step": 240
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "node_filesystem_size_bytes{instance=\"$nodeexporter\",job=\"$vm\",device!~'rootfs'}",
"format": "time_series",
"hide": true,
"intervalFactor": 1,
"legendFormat": " - Size",
"refId": "C",
"step": 240
}
],
"title": "Filesystem space available",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "%util",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 40,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"links": [],
"mappings": [],
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percentunit"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "io time"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#890F02",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byType",
"options": "time"
},
"properties": [
{
"id": "custom.axisPlacement",
"value": "hidden"
}
]
}
]
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 68
},
"id": 74,
"links": [],
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max",
"min"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "9.5.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"expr": "rate(node_disk_io_time_seconds_total{instance=\"$nodeexporter\",job=\"$vm\",device=~\"$diskdevices\"} [$__rate_interval])",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 240
}
],
"title": "I/O Utilization",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 77
},
"id": 44,
"panels": [],
"title": "Informations",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "fixed"
},
"custom": {
"align": "auto",
"cellOptions": {
"type": "auto"
},
"filterable": false,
"inspect": false
},
"decimals": 0,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 78
},
"hideTimeOverride": false,
"id": 46,
"links": [],
"options": {
"cellHeight": "sm",
"footer": {
"countRows": false,
"enablePagination": false,
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"showHeader": false
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": true,
"expr": "cadvisor_version_info{instance=~\"$nodeexporter\",job=\"$vm\"}",
"format": "time_series",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "cAdvisor Version: ",
"refId": "A",
"step": 2
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": true,
"expr": "prometheus_build_info{instance=~\"$nodeexporter\",job=\"$vm\"}",
"interval": "",
"intervalFactor": 1,
"legendFormat": "Prometheus Version: ",
"range": true,
"refId": "B",
"step": 2
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": true,
"expr": "node_exporter_build_info{instance=~\"$nodeexporter\",job=\"$vm\"}",
"interval": "",
"intervalFactor": 1,
"legendFormat": "Node-Exporter Version: ",
"range": true,
"refId": "C",
"step": 2
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": true,
"expr": "cadvisor_version_info{instance=~\"$cadvisor\",job=\"$vm\"}",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Docker Version: ",
"range": true,
"refId": "D",
"step": 2
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": true,
"expr": "cadvisor_version_info{instance=~\"$cadvisor\",job=\"$vm\"}",
"interval": "",
"intervalFactor": 1,
"legendFormat": "Host OS Version: ",
"range": true,
"refId": "E",
"step": 2
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": true,
"expr": "cadvisor_version_info{instance=~\"$cadvisor\",job=\"$vm\"}",
"interval": "",
"intervalFactor": 1,
"legendFormat": "Host Kernel Version: ",
"range": true,
"refId": "F",
"step": 2
}
],
"title": "Stack Info",
"transformations": [
{
"id": "reduce",
"options": {
"includeTimeField": false,
"labelsToFields": false,
"reducers": [
"mean"
]
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Mean": true
},
"indexByName": {},
"renameByName": {
"Field": "Stack Info",
"Mean": ""
}
}
}
],
"type": "table"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 102400
},
{
"color": "orange",
"value": 512000
},
{
"color": "dark-orange",
"value": 1048576
},
{
"color": "semi-dark-red",
"value": 5242880
}
]
},
"unit": "deckbytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 78
},
"id": 72,
"links": [],
"options": {
"displayMode": "basic",
"minVizHeight": 10,
"minVizWidth": 0,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showUnfilled": true,
"text": {
"valueSize": 10
},
"valueMode": "color"
},
"pluginVersion": "9.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "builder",
"exemplar": false,
"expr": "container_memory_usage_bytes{instance=~\"$cadvisor\", name=~\".+\"} / 1024",
"format": "time_series",
"hide": false,
"instant": false,
"intervalFactor": 2,
"legendFormat": "",
"range": true,
"refId": "A",
"step": 240
}
],
"title": "Usage memory",
"type": "bargauge"
}
],
"refresh": "5m",
"schemaVersion": 38,
"style": "dark",
"tags": [
"Zogg",
"Prometheus",
"Linux",
"cAdvisor",
"NodeExporter",
"Grafana"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Mimir",
"value": "Mimir"
},
"hide": 2,
"includeAll": false,
"label": "datasource",
"multi": false,
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"auto": true,
"auto_count": 50,
"auto_min": "50s",
"current": {
"selected": false,
"text": "auto",
"value": "$__auto_interval_interval"
},
"hide": 0,
"label": "Interval",
"name": "interval",
"options": [
{
"selected": true,
"text": "auto",
"value": "$__auto_interval_interval"
},
{
"selected": false,
"text": "30s",
"value": "30s"
},
{
"selected": false,
"text": "1m",
"value": "1m"
},
{
"selected": false,
"text": "2m",
"value": "2m"
},
{
"selected": false,
"text": "3m",
"value": "3m"
},
{
"selected": false,
"text": "5m",
"value": "5m"
},
{
"selected": false,
"text": "7m",
"value": "7m"
},
{
"selected": false,
"text": "10m",
"value": "10m"
},
{
"selected": false,
"text": "30m",
"value": "30m"
},
{
"selected": false,
"text": "1h",
"value": "1h"
},
{
"selected": false,
"text": "6h",
"value": "6h"
},
{
"selected": false,
"text": "12h",
"value": "12h"
},
{
"selected": false,
"text": "1d",
"value": "1d"
},
{
"selected": false,
"text": "7d",
"value": "7d"
},
{
"selected": false,
"text": "14d",
"value": "14d"
},
{
"selected": false,
"text": "30d",
"value": "30d"
}
],
"query": "30s,1m,2m,3m,5m,7m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
"queryValue": "",
"refresh": 2,
"skipUrlSync": false,
"type": "interval"
},
{
"current": {
"selected": false,
"text": "services",
"value": "services"
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(node_uname_info, job)",
"hide": 0,
"includeAll": false,
"label": "VM",
"multi": false,
"name": "vm",
"options": [],
"query": {
"query": "label_values(node_uname_info, job)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": false,
"text": "template.home:9100",
"value": "template.home:9100"
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(node_uname_info{job=\"$vm\"},instance)",
"hide": 2,
"includeAll": false,
"label": "Exporter",
"multi": false,
"name": "nodeexporter",
"options": [],
"query": {
"query": "label_values(node_uname_info{job=\"$vm\"},instance)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": false,
\"text\": \"\[a-z]+|nvme\[0-9]+n\[0-9]+|mmcblk\[0-9]+\",
\"value\": \"\[a-z]+|nvme\[0-9]+n\[0-9]+|mmcblk\[0-9]+\"
},
"hide": 2,
"includeAll": false,
"multi": false,
"name": "diskdevices",
"options": [
{
"selected": true,
\"text\": \"\[a-z]+|nvme\[0-9]+n\[0-9]+|mmcblk\[0-9]+\",
\"value\": \"\[a-z]+|nvme\[0-9]+n\[0-9]+|mmcblk\[0-9]+\"
}
],
\"query\": \"\[a-z]+|nvme\[0-9]+n\[0-9]+|mmcblk\[0-9]+\",
"skipUrlSync": false,
"type": "custom"
},
{
"current": {
"selected": false,
"text": "template.home:8080",
"value": "template.home:8080"
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(container_cpu_user_seconds_total{job=~\"$vm\"},instance)",
"hide": 2,
"includeAll": false,
"label": "Advisor",
"multi": false,
"name": "cadvisor",
"options": [],
"query": {
"query": "label_values(container_cpu_user_seconds_total{job=~\"$vm\"},instance)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": false,
"text": "192.168.50.102",
"value": "192.168.50.102"
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(node_boot_time_seconds{job=\"$vm\"},ip)",
"hide": 2,
"includeAll": false,
"label": "IP",
"multi": false,
"name": "ip",
"options": [],
"query": {
"query": "label_values(node_boot_time_seconds{job=\"$vm\"},ip)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Monitoring",
"uid": "ZOGG0001",
"version": 3,
"weekStart": ""
}
Répertoires requis
Hormis les répertoires spécifié par les fichiers de configuration, il faudra créer les répertoires suivant :
- datas/alertmanager
- datas/mimir
- datas/prometheus
- datas/grafana/dashboards
- datas/grafana/datas
Mise en place
Je partirai du principe que vous utilisez Portainer pour gérer vos stack Docker.
Dans l’ordre, il vous faut créer les répertoires requis sous le dossier /opt/docker/monitoring.
Si vous choisissez de placer l’ensemble des fichiers ailleurs, il sera donc nécessaire de mettre à jour les chemins dans le docker-compose entre autre.
Ces informations doivent refléter votre propre besoin ou organisation sur votre disque.
Par défaut, pour des raisons de sécurité, l'ensemble des répertoires que j'utilise est positionné avec les droits utilisateur 1000:1000. Ce sera donc à ajuster au besoin selon votre cas.
Lorsque c’est fait, vous créez une nouvelle stack sous Portainer et copier/coller le contenu du fichier docker-compose.yml dans l’éditeur et vous lancez sa création.
Précisions
Il s’agit ici d’une configuration complète de monitoring.
Sur vos autres machines, il vous suffit de jouer le stack suivant sous Portainer :
[Fichier]
version: "3.0"
#
# updated: 2023-05-28
# stack: monitoring (light only to export VMs)
#
x-logging: &x-logging
logging:
driver: loki
options:
loki-url: "http://loki:3100/loki/api/v1/push"
loki-retries: "5"
loki-batch-size: "400"
x-common: &x-common
<<: *x-logging
restart: "no"
stop_grace_period: 5s
stdin_open: true
tty: true
privileged: false
security_opt:
- no-new-privileges=true
cap_drop:
- ALL
cap_add:
- KILL
dns:
- 1.1.1.1
- 8.8.8.8
- 1.0.0.1
- 8.8.4.4
ipc: "shareable"
extra_hosts:
- "template.home:192.168.0.0"
environment:
TZ: "Europe/Paris"
PUID: 1000
PGID: 1000
user: 1000:1000
labels:
com.centurylinklabs.watchtower.enable: true
logging: "promtail"
com.stack.name: "common"
com.stack.service.name: "common"
devices:
- /dev/kmsg:/dev/kmsg
deploy:
resources:
limits:
cpus: "0.50"
memory: 256M
ulimits:
nproc: 65535
nofile:
soft: 20000
hard: 40000
tmpfs:
- /tmp:rw,noexec,nosuid,size=64k
sysctls:
net.core.somaxconn: 1024
net.ipv4.tcp_syncookies: 0
x-volume-timezone: &x-volume-timezone "/etc/timezone:/etc/timezone:ro"
x-volume-localtime: &x-volume-localtime "/etc/localtime:/etc/localtime:ro"
x-volume-docker-socket: &x-volume-docker-socket "/var/run/docker.sock:/var/run/docker.sock:rw"
x-volume-cgroups: &x-volume-cgroups "/proc/cgroups:/cgroup:rw"
x-volume-ssl: &x-volume-ssl "/opt/docker/ssl:/ssl:ro"
services:
cadvisor:
<<: *x-common
user: 0:0
container_name: cadvisor
hostname: cadvisor
image: gcr.io/cadvisor/cadvisor:v0.47.1
restart: always
privileged: true
cap_add:
- SYS_PTRACE
ports:
- "8080:8080"
expose:
- "8080"
command:
- "--storage_duration=1m0s"
- "--allow_dynamic_housekeeping=true"
- "--housekeeping_interval=30s"
- "--global_housekeeping_interval=30s"
- "--event_storage_age_limit=default=0"
- "--event_storage_event_limit=default=0"
labels:
com.stack.name: "monitoring"
com.stack.service.name: "cadvisor"
volumes:
- *x-volume-timezone
- *x-volume-localtime
- *x-volume-docker-socket
- *x-volume-cgroups
- /var/lib/docker/:/var/lib/docker:ro
- /etc/machine-id:/etc/machine-id:ro
- /:/rootfs:ro
- /sys:/sys:ro
- /dev/disk/:/dev/disk:ro
- /var/run:/var/run:ro
node-exporter:
<<: *x-common
user: 0:0
container_name: node-exporter
hostname: node-exporter
image: prom/node-exporter:latest
restart: always
privileged: true
cap_add:
- SYS_ADMIN
ports:
- "9100:9100"
expose:
- "9100"
healthcheck:
test: wget --no-verbose --tries=1 --spider http://localhost:9100/ \|| exit 1
command:
- "--collector.arp"
- "--collector.bcache"
- "--collector.bonding"
- "--collector.btrfs"
- "--collector.conntrack"
- "--collector.cpu"
- "--collector.cpufreq"
- "--collector.diskstats"
- "--collector.dmi"
- "--collector.edac"
- "--collector.entropy"
- "--collector.fibrechannel"
- "--collector.filefd"
- "--collector.filesystem"
- "--collector.hwmon"
- "--collector.infiniband"
- "--collector.ipvs"
- "--collector.loadavg"
- "--collector.mdadm"
- "--collector.meminfo"
- "--collector.netclass"
- "--collector.netdev"
- "--collector.netstat"
- "--collector.nfs"
- "--collector.nfsd"
- "--collector.nvme"
- "--collector.os"
- "--collector.powersupplyclass"
- "--collector.pressure"
- "--collector.rapl"
- "--collector.schedstat"
- "--collector.selinux"
- "--collector.sockstat"
- "--collector.softnet"
- "--collector.stat"
- "--collector.tapestats"
- "--collector.textfile"
- "--collector.thermal_zone"
- "--collector.time"
- "--collector.timex"
- "--collector.udp_queues"
- "--collector.uname"
- "--collector.vmstat"
- "--collector.xfs"
- "--collector.zfs"
- "--collector.buddyinfo"
- "--collector.cgroups"
- "--collector.drbd"
- "--collector.ethtool"
- "--collector.interrupts"
- "--collector.ksmd"
- "--collector.lnstat"
- "--collector.logind"
- "--collector.meminfo_numa"
- "--collector.mountstats"
- "--collector.network_route"
- "--collector.ntp"
- "--collector.perf"
- "--collector.processes"
- "--collector.qdisc"
- "--collector.runit"
#- "--collector.slabinfo"
- "--collector.supervisord"
- "--collector.sysctl"
- "--collector.systemd"
- "--collector.tcpstat"
- "--collector.wifi"
- "--collector.zoneinfo"
- "--path.rootfs=/host"
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- \"--collector.filesystem.ignored-mount-points='^\(/rootfs|/host|)/\(sys|proc|dev|host|etc)\(\$\$|/)'\"
- \"--collector.filesystem.ignored-fs-types='^\(sys|proc|auto|cgroup|devpts|ns|au|fuse\.lxc|mqueue)\(fs|)\$\$'\"
- \"--collector.netdev.device-exclude='^\(lo|veth.\*)\$'\"
- \"--collector.ethtool.device-exclude='^\(lo|br-.\*|docker0|flannel.\*|veth.\*)\$'\"
labels:
com.stack.name: "monitoring"
com.stack.service.name: "node-exporter"
volumes:
- *x-volume-timezone
- *x-volume-localtime
- *x-volume-docker-socket
- *x-volume-cgroups
- /etc/service:/etc/service:ro
- /:/host:ro,rslave
- /sys:/host/sys:ro
- /proc:/host/proc:ro
- /run/udev/data:/run/udev/data:ro
- /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro
prometheus:
<<: *x-common
user: 0:0
cap_add:
- DAC_OVERRIDE
stop_grace_period: 60s
container_name: prometheus
hostname: prometheus
image: prom/prometheus:latest
restart: always
privileged: true
depends_on:
- cadvisor
- node-exporter
links:
- cadvisor
- node-exporter
ports:
- "9090:9090"
expose:
- "9090"
healthcheck:
test: wget --no-verbose --tries=1 --spider http://localhost:9090/ \|| exit 1
command:
- "--storage.tsdb.retention.time=30d"
- "--config.file=/etc/prometheus/prometheus.yml"
labels:
com.stack.name: "monitoring"
com.stack.service.name: "prometheus"
deploy:
resources:
limits:
memory: 1G
volumes:
- *x-volume-timezone
- *x-volume-localtime
- /opt/docker/monitoring/conf/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- /opt/docker/monitoring/datas:/prometheus
Exploitation
Lorsque le stack complet est lancé et que l’ensemble des conteneurs à bien démarré, vous pouvez vous rendre sur :
http://grafana:3000
Les information d'identification sont ici celles par défaut : admin / admin
Dashboard
Conclusion
Vous disposer maintenant des bases vous permettant de mettre en place un système de surveillance complet d’une ou plusieurs machines.
Vous avez aussi la possibilité d’effectuer une surveillance active grâce aux alertes prises en comptes par Alertmanager mais vous pouvez aussi en définir au sein de Grafana.
Si vous souhaiter tester des requêtes en PromQL (le language de requêtes de Prometheus utilisé aussi par Grafana), il suffit de vous rendre sur:
http://prometheus:9090
Le suivi de Mimi se fait ici :
http://mimir:9009
Et le suivi des alertes levées par Alertmanager se fait ici :
http://alertmanager:9093
Bon monitoring :)
Petit bonus
Vive les alertes automatisées pour suivre simplement l’état de vos machines :