Prometheus Alertmanager Rules

Setting up alerts is an essential part of monitoring; this article gathers alert rule sets that have been published on the internet.
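
The snippets below are bare rule entries. Before Prometheus can evaluate them, they must be wrapped in a rule group and the rule file must be referenced from the server configuration. A minimal sketch, assuming the rules are saved under /etc/prometheus/rules/ and that Alertmanager listens on localhost:9093 (the path and target are placeholders):

  # /etc/prometheus/rules/node-exporter.rules.yml  (example path)
  groups:
    - name: node-exporter
      rules:
        - alert: HostOutOfMemory
          expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Host out of memory (instance {{ $labels.instance }})"

  # prometheus.yml
  rule_files:
    - /etc/prometheus/rules/*.rules.yml
  alerting:
    alertmanagers:
      - static_configs:
          - targets: ["localhost:9093"]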

  • Rules for monitoring hosts via Node Exporter
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host out of memory (instance {{ $labels.instance }})"
      description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
      description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostUnusualDiskReadRate
    expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
      description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
      description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"}  * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host out of disk space (instance {{ $labels.instance }})"
      description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostDiskWillFillIn4Hours
    expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
      description: "Disk will fill in 4 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{mountpoint="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host out of inodes (instance {{ $labels.instance }})"
      description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual disk read latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host unusual disk write latency (instance {{ $labels.instance }})"
      description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostHighCpuLoad
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host high CPU load (instance {{ $labels.instance }})"
      description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostContextSwitching
    expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host context switching (instance {{ $labels.instance }})"
      description: "Context switching is growing on node (> 1000 / s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host swap is filling up (instance {{ $labels.instance }})"
      description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostSystemdServiceCrashed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host SystemD service crashed (instance {{ $labels.instance }})"
      description: "SystemD service crashed\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostPhysicalComponentTooHot
    expr: node_hwmon_temp_celsius > 75
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host physical component too hot (instance {{ $labels.instance }})"
      description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostNodeOvertemperatureAlarm
    expr: node_hwmon_temp_alarm == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Host node overtemperature alarm (instance {{ $labels.instance }})"
      description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostRaidArrayGotInactive
    expr: node_md_state{state="inactive"} > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Host RAID array got inactive (instance {{ $labels.instance }})"
      description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostRaidDiskFailure
    expr: node_md_disks{state="fail"} > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host RAID disk failure (instance {{ $labels.instance }})"
      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostKernelVersionDeviations
    expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host kernel version deviations (instance {{ $labels.instance }})"
      description: "Different kernel versions are running\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostOomKillDetected
    expr: increase(node_vmstat_oom_kill[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host OOM kill detected (instance {{ $labels.instance }})"
      description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostEdacCorrectableErrorsDetected
    expr: increase(node_edac_correctable_errors_total[5m]) > 0
    for: 5m
    labels:
      severity: info
    annotations:
      summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})"
      description: "{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostEdacUncorrectableErrorsDetected
    expr: node_edac_uncorrectable_errors_total > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})"
      description: "{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostNetworkReceiveErrors
    expr: increase(node_network_receive_errs_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host Network Receive Errors (instance {{ $labels.instance }})"
      description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: HostNetworkTransmitErrors
    expr: increase(node_network_transmit_errs_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Host Network Transmit Errors (instance {{ $labels.instance }})"
      description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

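Before loading a rule file such as the Node Exporter group above, it can be syntax-checked with promtool (the path below is only an example):

  promtool check rules /etc/prometheus/rules/node-exporter.rules.yml

On success promtool reports how many rules it found; on failure it points at the offending rule, which is handy when copying snippets like these.
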
  • Rules for monitoring Docker machines using cAdvisor
  - alert: ContainerKilled
    expr: time() - container_last_seen > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container killed (instance {{ $labels.instance }})"
      description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container CPU usage (instance {{ $labels.instance }})"
      description: "Container CPU usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ContainerMemoryUsage
    expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container Memory usage (instance {{ $labels.instance }})"
      description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ContainerVolumeUsage
    expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container Volume usage (instance {{ $labels.instance }})"
      description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ContainerVolumeIoUsage
    expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container Volume IO usage (instance {{ $labels.instance }})"
      description: "Container Volume IO usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ContainerHighThrottleRate
    expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container high throttle rate (instance {{ $labels.instance }})"
      description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  • Rules for monitoring endpoints with Blackbox Exporter
  - alert: BlackboxProbeFailed
    expr: probe_success == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox probe failed (instance {{ $labels.instance }})"
      description: "Probe failed\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: BlackboxSlowProbe
    expr: avg_over_time(probe_duration_seconds[1m]) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox slow probe (instance {{ $labels.instance }})"
      description: "Blackbox probe took more than 1s to complete\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: BlackboxProbeHttpFailure
    expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
      description: "HTTP status code is not 200-399\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: BlackboxSslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
      description: "SSL certificate expires in 30 days\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: BlackboxSslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
      description: "SSL certificate expires in 3 days\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: BlackboxSslCertificateExpired
    expr: probe_ssl_earliest_cert_expiry - time() <= 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Blackbox SSL certificate expired (instance {{ $labels.instance }})"
      description: "SSL certificate has expired already\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: BlackboxProbeSlowHttp
    expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox probe slow HTTP (instance {{ $labels.instance }})"
      description: "HTTP request took more than 1s\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: BlackboxProbeSlowPing
    expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox probe slow ping (instance {{ $labels.instance }})"
      description: "Blackbox ping took more than 1s\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  • Rules for monitoring Windows servers with WMI Exporter
  - alert: WindowsServerCollectorError
    expr: wmi_exporter_collector_success == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Windows Server collector Error (instance {{ $labels.instance }})"
      description: "Collector {{ $labels.collector }} was not successful\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: WindowsServerServiceStatus
    expr: wmi_service_status{status="ok"} != 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Windows Server service Status (instance {{ $labels.instance }})"
      description: "Windows Service state is not OK\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: WindowsServerCpuUsage
    expr: 100 - (avg by (instance) (rate(wmi_cpu_time_total{mode="idle"}[2m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Windows Server CPU Usage (instance {{ $labels.instance }})"
      description: "CPU Usage is more than 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: WindowsServerMemoryUsage
    expr: 100 - 100 * wmi_os_physical_memory_free_bytes / wmi_cs_physical_memory_bytes > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Windows Server memory Usage (instance {{ $labels.instance }})"
      description: "Memory usage is more than 90%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: WindowsServerDiskSpaceUsage
    expr: 100.0 - 100 * ((wmi_logical_disk_free_bytes{} / 1024 / 1024 ) / (wmi_logical_disk_size_bytes{} / 1024 / 1024)) > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Windows Server disk Space Usage (instance {{ $labels.instance }})"
      description: "Disk usage is more than 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  • Rules for monitoring MySQL servers with Mysqld Exporter
  - alert: MysqlDown
    expr: mysql_up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "MySQL down (instance {{ $labels.instance }})"
      description: "MySQL instance is down on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: MysqlTooManyConnections
    expr: avg by (instance) (max_over_time(mysql_global_status_threads_connected[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "MySQL too many connections (instance {{ $labels.instance }})"
      description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: MysqlHighThreadsRunning
    expr: avg by (instance) (max_over_time(mysql_global_status_threads_running[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "MySQL high threads running (instance {{ $labels.instance }})"
      description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"


  - alert: MysqlSlaveIoThreadNotRunning
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "MySQL Slave IO thread not running (instance {{ $labels.instance }})"
      description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: MysqlSlaveSqlThreadNotRunning
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "MySQL Slave SQL thread not running (instance {{ $labels.instance }})"
      description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: MysqlSlaveReplicationLag
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 300
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "MySQL Slave replication lag (instance {{ $labels.instance }})"
      description: "MysqL replication lag on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: MysqlSlowQueries
    expr: mysql_global_status_slow_queries > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "MySQL slow queries (instance {{ $labels.instance }})"
      description: "MySQL server is having some slow queries.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: MysqlRestarted
    expr: mysql_global_status_uptime < 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "MySQL restarted (instance {{ $labels.instance }})"
      description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  • Rules for monitoring PostgreSQL servers with Postgres Exporter
  - alert: PostgresqlDown
    expr: pg_up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql down (instance {{ $labels.instance }})"
      description: "Postgresql instance is down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlRestarted
    expr: time() - pg_postmaster_start_time_seconds < 60
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql restarted (instance {{ $labels.instance }})"
      description: "Postgresql restarted\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlExporterError
    expr: pg_exporter_last_scrape_error > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql exporter error (instance {{ $labels.instance }})"
      description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlReplicationLag
    expr: (pg_replication_lag) > 10 and ON(instance) (pg_replication_is_replica == 1)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql replication lag (instance {{ $labels.instance }})"
      description: "PostgreSQL replication lag is going up (> 10s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlTableNotVacuumed
    expr: time() - pg_stat_user_tables_last_autovacuum > 60 * 60 * 24
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql table not vaccumed (instance {{ $labels.instance }})"
      description: "Table has not been vaccum for 24 hours\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlTableNotAnalyzed
    expr: time() - pg_stat_user_tables_last_autoanalyze > 60 * 60 * 24
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql table not analyzed (instance {{ $labels.instance }})"
      description: "Table has not been analyzed for 24 hours\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlTooManyConnections
    expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql too many connections (instance {{ $labels.instance }})"
      description: "PostgreSQL instance has too many connections\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlNotEnoughConnections
    expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql not enough connections (instance {{ $labels.instance }})"
      description: "PostgreSQL instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlDeadLocks
    expr: rate(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql dead locks (instance {{ $labels.instance }})"
      description: "PostgreSQL has dead-locks\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlSlowQueries
    expr: pg_slow_queries > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql slow queries (instance {{ $labels.instance }})"
      description: "PostgreSQL executes slow queries\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlHighRollbackRate
    expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql high rollback rate (instance {{ $labels.instance }})"
      description: "Ratio of transactions being aborted compared to committed is > 2 %\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlCommitRateLow
    expr: rate(pg_stat_database_xact_commit[1m]) < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql commit rate low (instance {{ $labels.instance }})"
      description: "Postgres seems to be processing very few transactions\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlLowXidConsumption
    expr: rate(pg_txid_current[1m]) < 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql low XID consumption (instance {{ $labels.instance }})"
      description: "Postgresql seems to be consuming transaction IDs very slowly\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlLowXlogConsumption
    expr: rate(pg_xlog_position_bytes[1m]) < 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresqllow XLOG consumption (instance {{ $labels.instance }})"
      description: "Postgres seems to be consuming XLOG very slowly\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlWaleReplicationStopped
    expr: rate(pg_xlog_position_bytes[1m]) == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql WALE replication stopped (instance {{ $labels.instance }})"
      description: "WAL-E replication seems to be stopped\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlHighRateStatementTimeout
    expr: rate(postgresql_errors_total{type="statement_timeout"}[5m]) > 3
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql high rate statement timeout (instance {{ $labels.instance }})"
      description: "Postgres transactions showing high rate of statement timeouts\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlHighRateDeadlock
    expr: rate(postgresql_errors_total{type="deadlock_detected"}[1m]) * 60 > 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql high rate deadlock (instance {{ $labels.instance }})"
      description: "Postgres detected deadlocks\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlReplicationLagBytes
    expr: (pg_xlog_position_bytes and pg_replication_is_replica == 0) - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1) > 1e+09
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql replication lab bytes (instance {{ $labels.instance }})"
      description: "Postgres Replication lag (in bytes) is high\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlUnusedReplicationSlot
    expr: pg_replication_slots_active == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql unused replication slot (instance {{ $labels.instance }})"
      description: "Unused Replication Slots\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlTooManyDeadTuples
    expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql too many dead tuples (instance {{ $labels.instance }})"
      description: "PostgreSQL dead tuples is too large\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlSplitBrain
    expr: count(pg_replication_is_replica == 0) != 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql split brain (instance {{ $labels.instance }})"
      description: "Split Brain, too many primary Postgresql databases in read-write mode\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlPromotedNode
    expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql promoted node (instance {{ $labels.instance }})"
      description: "Postgresql standby server has been promoted as primary node\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlConfigurationChanged
    expr: {__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Postgresql configuration changed (instance {{ $labels.instance }})"
      description: "Postgres Database configuration change has occurred\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlSslCompressionActive
    expr: sum(pg_stat_ssl_compression) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql SSL compression active (instance {{ $labels.instance }})"
      description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PostgresqlTooManyLocksAcquired
    expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql too many locks acquired (instance {{ $labels.instance }})"
      description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  • Rules for monitoring Redis servers with Redis Exporter
  - alert: RedisDown
    expr: redis_up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis down (instance {{ $labels.instance }})"
      description: "Redis instance is down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisMissingMaster
    expr: count(redis_instance_info{role="master"}) == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis missing master (instance {{ $labels.instance }})"
      description: "Redis cluster has no node marked as master.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisTooManyMasters
    expr: count(redis_instance_info{role="master"}) > 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis too many masters (instance {{ $labels.instance }})"
      description: "Redis cluster has too many nodes marked as master.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisDisconnectedSlaves
    expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis disconnected slaves (instance {{ $labels.instance }})"
      description: "Redis not replicating for all slaves. Consider reviewing the redis replication status.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisReplicationBroken
    expr: delta(redis_connected_slaves[1m]) < 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis replication broken (instance {{ $labels.instance }})"
      description: "Redis instance lost a slave\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisClusterFlapping
    expr: changes(redis_connected_slaves[5m]) > 2
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis cluster flapping (instance {{ $labels.instance }})"
      description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisMissingBackup
    expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis missing backup (instance {{ $labels.instance }})"
      description: "Redis has not been backuped for 24 hours\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisOutOfMemory
    expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Redis out of memory (instance {{ $labels.instance }})"
      description: "Redis is running out of memory (> 90%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisTooManyConnections
    expr: redis_connected_clients > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Redis too many connections (instance {{ $labels.instance }})"
      description: "Redis instance has too many connections\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisNotEnoughConnections
    expr: redis_connected_clients < 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Redis not enough connections (instance {{ $labels.instance }})"
      description: "Redis instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RedisRejectedConnections
    expr: increase(redis_rejected_connections_total[1m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Redis rejected connections (instance {{ $labels.instance }})"
      description: "Some connections to Redis has been rejected\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  • Rules for monitoring RabbitMQ servers with RabbitMQ Exporter
  - alert: RabbitmqDown
    expr: rabbitmq_up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Rabbitmq down (instance {{ $labels.instance }})"
      description: "RabbitMQ node down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RabbitmqClusterDown
    expr: sum(rabbitmq_running) < 3
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Rabbitmq cluster down (instance {{ $labels.instance }})"
      description: "Less than 3 nodes running in RabbitMQ cluster\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RabbitmqClusterPartition
    expr: rabbitmq_partitions > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Rabbitmq cluster partition (instance {{ $labels.instance }})"
      description: "Cluster partition\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"


  - alert: RabbitmqOutOfMemory
    expr: rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Rabbitmq out of memory (instance {{ $labels.instance }})"
      description: "Memory available for RabbmitMQ is low (< 10%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RabbitmqTooManyConnections
    expr: rabbitmq_connectionsTotal > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Rabbitmq too many connections (instance {{ $labels.instance }})"
      description: "RabbitMQ instance has too many connections (> 1000)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RabbitmqDeadLetterQueueFillingUp
    expr: rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Rabbitmq dead letter queue filling up (instance {{ $labels.instance }})"
      description: "Dead letter queue is filling up (> 10 msgs)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RabbitmqTooManyMessagesInQueue
    expr: rabbitmq_queue_messages_ready{queue="my-queue"} > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Rabbitmq too many messages in queue (instance {{ $labels.instance }})"
      description: "Queue is filling up (> 1000 msgs)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RabbitmqSlowQueueConsuming
    expr: time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Rabbitmq slow queue consuming (instance {{ $labels.instance }})"
      description: "Queue messages are consumed slowly (> 60s)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RabbitmqNoConsumer
    expr: rabbitmq_queue_consumers == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Rabbitmq no consumer (instance {{ $labels.instance }})"
      description: "Queue has no consumer\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RabbitmqTooManyConsumers
    expr: rabbitmq_queue_consumers > 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Rabbitmq too many consumers (instance {{ $labels.instance }})"
      description: "Queue should have only 1 consumer\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: RabbitmqUnactiveExchange
    expr: rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Rabbitmq unactive exchange (instance {{ $labels.instance }})"
      description: "Exchange receive less than 5 msgs per second\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  • Rules for monitoring Elasticsearch clusters with Elasticsearch Exporter
  - alert: ElasticsearchHeapUsageTooHigh
    expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})"
      description: "The heap usage is over 90% for 5m\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchHeapUsageWarning
    expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Elasticsearch Heap Usage warning (instance {{ $labels.instance }})"
      description: "The heap usage is over 80% for 5m\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchDiskSpaceLow
    expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Elasticsearch disk space low (instance {{ $labels.instance }})"
      description: "The disk usage is over 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchDiskOutOfSpace
    expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch disk out of space (instance {{ $labels.instance }})"
      description: "The disk usage is over 90%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchClusterRed
    expr: elasticsearch_cluster_health_status{color="red"} == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch Cluster Red (instance {{ $labels.instance }})"
      description: "Elastic Cluster Red status\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchClusterYellow
    expr: elasticsearch_cluster_health_status{color="yellow"} == 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Elasticsearch Cluster Yellow (instance {{ $labels.instance }})"
      description: "Elastic Cluster Yellow status\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchHealthyNodes
    expr: elasticsearch_cluster_health_number_of_nodes < number_of_nodes
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch Healthy Nodes (instance {{ $labels.instance }})"
      description: "Number Healthy Nodes less then number_of_nodes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchHealthyDataNodes
    expr: elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch Healthy Data Nodes (instance {{ $labels.instance }})"
      description: "Number Healthy Data Nodes less then number_of_data_nodes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchRelocationShards
    expr: elasticsearch_cluster_health_relocating_shards > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch relocation shards (instance {{ $labels.instance }})"
      description: "Number of relocation shards for 20 min\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchInitializingShards
    expr: elasticsearch_cluster_health_initializing_shards > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Elasticsearch initializing shards (instance {{ $labels.instance }})"
      description: "Number of initializing shards for 10 min\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchUnassignedShards
    expr: elasticsearch_cluster_health_unassigned_shards > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch unassigned shards (instance {{ $labels.instance }})"
      description: "Number of unassigned shards for 2 min\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchPendingTasks
    expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Elasticsearch pending tasks (instance {{ $labels.instance }})"
      description: "Number of pending tasks for 10 min. Cluster works slowly.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ElasticsearchNoNewDocuments
    expr: rate(elasticsearch_indices_docs{es_data_node="true"}[10m]) < 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Elasticsearch no new documents (instance {{ $labels.instance }})"
      description: "No new documents for 10 min!\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

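Every rule in this collection attaches a severity label (info, warning or critical), which Alertmanager can use to decide where a notification goes. A minimal routing sketch, assuming hypothetical receivers named chat and pager:

  # alertmanager.yml (routing sketch; receiver names and integrations are placeholders)
  route:
    receiver: chat
    group_by: ['alertname', 'instance']
    routes:
      - match:
          severity: critical
        receiver: pager
  receivers:
    - name: chat    # e.g. a Slack or webhook configuration would go here
    - name: pager   # e.g. a PagerDuty configuration would go here

Alerts that do not match the critical route fall through to the default chat receiver.
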
All Rules

  - alert: PrometheusJobMissing
    expr: absent(up{job="my-job"})
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus job missing (instance {{ $labels.instance }})"
      description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus target missing (instance {{ $labels.instance }})"
      description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusAllTargetsMissing
    expr: count by (job) (up) == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus all targets missing (instance {{ $labels.instance }})"
      description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
      description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusAlertmanagerConfigurationReloadFailure
    expr: alertmanager_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})"
      description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusAlertmanagerConfigNotSynced
    expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})"
      description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusAlertmanagerE2eDeadManSwitch
    expr: vector(1)
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})"
      description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusNotConnectedToAlertmanager
    expr: prometheus_notifications_alertmanagers_discovered < 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
      description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusRuleEvaluationFailures
    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTemplateTextExpansionFailures
    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusRuleEvaluationSlow
    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})"
      description: "Prometheus rule evaluation took more time than the scheduled interval. I indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusNotificationsBacklog
    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
      description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusAlertmanagerNotificationFailing
    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
      description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTargetEmpty
    expr: prometheus_sd_discovered_targets == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus target empty (instance {{ $labels.instance }})"
      description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTargetScrapingSlow
    expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
      description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusLargeScrape
    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus large scrape (instance {{ $labels.instance }})"
      description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTargetScrapeDuplicate
    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})"
      description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTsdbCheckpointCreationFailures
    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTsdbCheckpointDeletionFailures
    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTsdbHeadTruncationsFailed
    expr: increase(prometheus_tsdb_head_truncations_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTsdbReloadFailures
    expr: increase(prometheus_tsdb_reloads_failures_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTsdbWalCorruptions
    expr: increase(prometheus_tsdb_wal_corruptions_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: PrometheusTsdbWalTruncationsFailed
    expr: increase(prometheus_tsdb_wal_truncations_failed_total[3m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KafkaTopicsReplicas
    expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kafka topics replicas (instance {{ $labels.instance }})"
      description: "Kafka topic in-sync partition\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KafkaConsumersGroup
    expr: sum(kafka_consumergroup_lag) by (consumergroup) > 50
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kafka consumers group (instance {{ $labels.instance }})"
      description: "Kafka consumers group\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: NginxHighHttp4xxErrorRate
    expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
      description: "Too many HTTP requests with status 4xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: NginxHighHttp5xxErrorRate
    expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
      description: "Too many HTTP requests with status 5xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: NginxLatencyHigh
    expr: histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[30m])) by (host, node)) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Nginx latency high (instance {{ $labels.instance }})"
      description: "Nginx p99 latency is higher than 10 seconds\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesNodeReady
    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes Node ready (instance {{ $labels.instance }})"
      description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesMemoryPressure
    expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes memory pressure (instance {{ $labels.instance }})"
      description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesDiskPressure
    expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes disk pressure (instance {{ $labels.instance }})"
      description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesOutOfDisk
    expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes out of disk (instance {{ $labels.instance }})"
      description: "{{ $labels.node }} has OutOfDisk condition\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesJobFailed
    expr: kube_job_status_failed > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes Job failed (instance {{ $labels.instance }})"
      description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesCronjobSuspended
    expr: kube_cronjob_spec_suspend != 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes CronJob suspended (instance {{ $labels.instance }})"
      description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesPersistentvolumeclaimPending
    expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})"
      description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesVolumeOutOfDiskSpace
    expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes Volume out of disk space (instance {{ $labels.instance }})"
      description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesVolumeFullInFourDays
    expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes Volume full in four days (instance {{ $labels.instance }})"
      description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesPersistentvolumeError
    expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes PersistentVolume error (instance {{ $labels.instance }})"
      description: "Persistent volume is in bad state\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesStatefulsetDown
    expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes StatefulSet down (instance {{ $labels.instance }})"
      description: "A StatefulSet went down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesHpaScalingAbility
    expr: kube_hpa_status_condition{condition="AbleToScale", status="false"} == 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes HPA scaling ability (instance {{ $labels.instance }})"
      description: "Pod is unable to scale\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesHpaMetricAvailability
    expr: kube_hpa_status_condition{condition="ScalingActive", status="false"} == 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes HPA metric availability (instance {{ $labels.instance }})"
      description: "HPA is not able to colelct metrics\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesHpaScaleCapability
    expr: kube_hpa_status_desired_replicas >= kube_hpa_spec_max_replicas
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes HPA scale capability (instance {{ $labels.instance }})"
      description: "The maximum number of desired Pods has been hit\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesPodNotHealthy
    expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes Pod not healthy (instance {{ $labels.instance }})"
      description: "Pod has been in a non-ready state for longer than an hour.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesPodCrashLooping
    expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes pod crash looping (instance {{ $labels.instance }})"
      description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesReplicasetMismatch
    expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }})"
      description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesDeploymentReplicasMismatch
    expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})"
      description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesStatefulsetReplicasMismatch
    expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})"
      description: "A StatefulSet has not matched the expected number of replicas for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesDeploymentGenerationMismatch
    expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})"
      description: "A Deployment has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesStatefulsetGenerationMismatch
    expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})"
      description: "A StatefulSet has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesStatefulsetUpdateNotRolledOut
    expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})"
      description: "StatefulSet update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesDaemonsetRolloutStuck
    expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})"
      description: "Some Pods of DaemonSet are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesDaemonsetMisscheduled
    expr: kube_daemonset_status_number_misscheduled > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})"
      description: "Some DaemonSet Pods are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesCronjobTooLong
    expr: time() - kube_cronjob_next_schedule_time > 3600
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})"
      description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesJobCompletion
    expr: kube_job_spec_completions - kube_job_status_succeeded > 0 or kube_job_status_failed > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes job completion (instance {{ $labels.instance }})"
      description: "Kubernetes Job failed to complete\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesApiServerErrors
    expr: sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[2m])) / sum(rate(apiserver_request_count{job="apiserver"}[2m])) * 100 > 3
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes API server errors (instance {{ $labels.instance }})"
      description: "Kubernetes API server is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesApiClientErrors
    expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[2m])) by (instance, job) / sum(rate(rest_client_requests_total[2m])) by (instance, job)) * 100 > 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes API client errors (instance {{ $labels.instance }})"
      description: "Kubernetes API client is experiencing high error rate\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesClientCertificateExpiresNextWeek
    expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes client certificate expires next week (instance {{ $labels.instance }})"
      description: "A client certificate used to authenticate to the apiserver is expiring next week.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesClientCertificateExpiresSoon
    expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kubernetes client certificate expires soon (instance {{ $labels.instance }})"
      description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: KubernetesApiServerLatency
    expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kubernetes API server latency (instance {{ $labels.instance }})"
      description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephState
    expr: ceph_health_status != 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Ceph State (instance {{ $labels.instance }})"
      description: "Ceph instance unhealthy\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephMonitorClockSkew
    expr: abs(ceph_monitor_clock_skew_seconds) > 0.2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Ceph monitor clock skew (instance {{ $labels.instance }})"
      description: "Ceph monitor clock skew detected. Please check ntp and hardware clock settings\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephMonitorLowSpace
    expr: ceph_monitor_avail_percent < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Ceph monitor low space (instance {{ $labels.instance }})"
      description: "Ceph monitor storage is low.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephOsdDown
    expr: ceph_osd_up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Ceph OSD Down (instance {{ $labels.instance }})"
      description: "Ceph Object Storage Daemon Down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephHighOsdLatency
    expr: ceph_osd_perf_apply_latency_seconds > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Ceph high OSD latency (instance {{ $labels.instance }})"
      description: "Ceph Object Storage Daemon latetncy is high. Please check if it doesn't stuck in weird state.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephOsdLowSpace
    expr: ceph_osd_utilization > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Ceph OSD low space (instance {{ $labels.instance }})"
      description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephOsdReweighted
    expr: ceph_osd_weight < 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Ceph OSD reweighted (instance {{ $labels.instance }})"
      description: "Ceph Object Storage Daemon take ttoo much time to resize.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephPgDown
    expr: ceph_pg_down > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Ceph PG down (instance {{ $labels.instance }})"
      description: "Some Ceph placement groups are down. Please ensure that all the data are available.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephPgIncomplete
    expr: ceph_pg_incomplete > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Ceph PG incomplete (instance {{ $labels.instance }})"
      description: "Some Ceph placement groups are incomplete. Please ensure that all the data are available.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephPgInconsistent
    expr: ceph_pg_inconsistent > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Ceph PG inconsistant (instance {{ $labels.instance }})"
      description: "Some Ceph placement groups are inconsitent. Data is available but inconsistent across nodes.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephPgActivationLong
    expr: ceph_pg_activating > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Ceph PG activation long (instance {{ $labels.instance }})"
      description: "Some Ceph placement groups are too long to activate.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephPgBackfillFull
    expr: ceph_pg_backfill_toofull > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Ceph PG backfill full (instance {{ $labels.instance }})"
      description: "Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: CephPgUnavailable
    expr: ceph_pg_total - ceph_pg_active > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Ceph PG unavailable (instance {{ $labels.instance }})"
      description: "Some Ceph placement groups are unavailable.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ThanosCompactionHalted
    expr: thanos_compactor_halted == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Thanos compaction halted (instance {{ $labels.instance }})"
      description: "Thanos compaction has failed to run and is now halted.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ThanosCompactBucketOperationFailure
    expr: rate(thanos_objstore_bucket_operation_failures_total[1m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Thanos compact bucket operation failure (instance {{ $labels.instance }})"
      description: "Thanos compaction has failing storage operations\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

  - alert: ThanosCompactNotRun
    expr: (time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Thanos compact not run (instance {{ $labels.instance }})"
      description: "Thanos compaction has not run in 24 hours.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

This page contains a mirrored copy of the rules published at https://awesome-prometheus-alerts.grep.to/rules.
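
To put these rules into use, they must live inside a rule group, the rule file must be referenced from prometheus.yml, and the resulting alerts routed through Alertmanager. The fragments below are only a minimal sketch under assumed names: alert-rules.yml, the group name collected-rules, the Alertmanager address alertmanager:9093 and the receiver names are placeholders, not part of the mirrored rule set.

# prometheus.yml fragment: load the rule file and point Prometheus at Alertmanager
rule_files:
  - "alert-rules.yml"              # assumed file name containing the group below
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"  # assumed Alertmanager address

# alert-rules.yml: the alerts listed on this page go under a group
groups:
  - name: collected-rules          # assumed group name
    rules:
      # - alert: ...               # paste the desired rules from this page here

# alertmanager.yml fragment: route on the severity label used by these rules
route:
  receiver: default
  routes:
    - match:
        severity: critical
      receiver: oncall             # assumed receiver, e.g. a paging integration
receivers:
  - name: default
  - name: oncall

Before reloading Prometheus and Alertmanager, the files can be validated with promtool check rules alert-rules.yml and amtool check-config alertmanager.yml.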