docs: rewrite README with structured overview and quick start guide
Replaces the minimal project description with a comprehensive README including a component overview table, quick start instructions, common Ansible operations, and links to detailed documentation. Aligns with Red Panda Approval™ standards.
This commit is contained in:
249
ansible/prometheus/alert_rules.yml.j2
Normal file
249
ansible/prometheus/alert_rules.yml.j2
Normal file
@@ -0,0 +1,249 @@
|
||||
# Prometheus Alert Rules
|
||||
# Red Panda Approved 🐼
|
||||
# Deployed to: /etc/prometheus/alert_rules.yml
|
||||
# Raw section: from here to the matching endraw tag, Jinja2 copies the text
# through unchanged, so the double-brace alert templates reach Prometheus as written.
{% raw %}
|
||||
# Top-level list of rule groups; each entry carries a name and its alert rules.
groups:
|
||||
# ============================================================================
|
||||
# Node/Infrastructure Alerts
|
||||
# ============================================================================
|
||||
- name: node_alerts
  rules:
    # Fires for any scrape target whose `up` series has been 0 for two minutes.
    - alert: InstanceDown
      expr: up == 0
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: 'Instance {{ $labels.instance }} is down'
        description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.'

    # CPU busy percentage = 100 minus the per-instance average idle rate.
    # Warning tier: above 80% for five minutes.
    - alert: HighCPUUsage
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'High CPU usage on {{ $labels.instance }}'
        description: 'CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

    # Critical tier of the same CPU expression: above 95% for two minutes.
    - alert: CriticalCPUUsage
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: 'Critical CPU usage on {{ $labels.instance }}'
        description: 'CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

    # Memory in use = 1 - MemAvailable / MemTotal, as a percentage.
    # Warning tier: above 80% for five minutes.
    - alert: HighMemoryUsage
      expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'High memory usage on {{ $labels.instance }}'
        description: 'Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

    # Critical tier of the same memory expression: above 95% for two minutes.
    - alert: CriticalMemoryUsage
      expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: 'Critical memory usage on {{ $labels.instance }}'
        description: 'Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

    # Free-space percentage per filesystem, skipping tmpfs and overlay mounts.
    # Warning tier: below 20% free for five minutes.
    - alert: DiskSpaceLow
      expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Low disk space on {{ $labels.instance }}'
        description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf "%.1f" }}%)'

    # Critical tier of the same disk expression: below 10% free for two minutes.
    - alert: DiskSpaceCritical
      expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: 'Critical disk space on {{ $labels.instance }}'
        description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf "%.1f" }}%)'

    # 15-minute load divided by the number of idle-mode CPU series (the CPU
    # count); fires when the ratio stays above 2 for ten minutes.
    - alert: HighLoadAverage
      expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: 'High load average on {{ $labels.instance }}'
        description: '15-minute load average is {{ $value | printf "%.2f" }} times the CPU count on {{ $labels.instance }}'
|
||||
|
||||
# ============================================================================
|
||||
# Process-Level Alerts (puck.incus)
|
||||
# ============================================================================
|
||||
- name: puck_process_alerts
  rules:
    # Summed CPU rate per process group on puck instances, as a percentage.
    # Warning tier: above 80% for two minutes.
    - alert: PuckHighCPUProcess
      expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'High CPU process on puck: {{ $labels.groupname }}'
        description: 'Process {{ $labels.groupname }} is using {{ $value | printf "%.1f" }}% CPU for more than 2 minutes'

    # Critical tier: same sum over a 1m rate window, above 95% for one minute.
    - alert: PuckCriticalCPUProcess
      expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: 'Critical CPU process on puck: {{ $labels.groupname }}'
        description: 'Process {{ $labels.groupname }} is using {{ $value | printf "%.1f" }}% CPU - immediate attention required'

    # Resident memory of a process group above 1073741824 bytes (1 GiB).
    - alert: PuckHighMemoryProcess
      expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'High memory process on puck: {{ $labels.groupname }}'
        description: 'Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory'

    # Resident memory of a process group above 2147483648 bytes (2 GiB).
    - alert: PuckCriticalMemoryProcess
      expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: 'Critical memory process on puck: {{ $labels.groupname }}'
        description: 'Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required'

    # Process count of a group fell by more than one over the last five minutes.
    # num_procs is a gauge, so delta() is required here: increase() treats every
    # drop as a counter reset and never yields a negative value, which made the
    # "< -1" comparison impossible to satisfy.
    - alert: PuckProcessCrashLoop
      expr: delta(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: 'Process count dropped on puck: {{ $labels.groupname }}'
        description: 'Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart'
|
||||
|
||||
# ============================================================================
|
||||
# Docker Container Alerts (puck.incus)
|
||||
# ============================================================================
|
||||
- name: puck_container_alerts
  rules:
    # More than five named containers reporting on puck instances.
    - alert: PuckHighContainerCount
      expr: count(container_last_seen{instance=~"puck.*", name!=""}) > 5
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'High container count on puck'
        description: 'puck.incus has {{ $value }} running containers, which exceeds the threshold of 5'

    # More than two named containers sharing one image on the same instance.
    - alert: PuckDuplicateContainers
      expr: count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Duplicate containers on puck: {{ $labels.image }}'
        description: '{{ $value }} containers running the same image {{ $labels.image }} on puck'

    # Containers whose name contains an underscore and that started more than
    # 3600 seconds ago; the annotation treats such names as auto-generated.
    - alert: PuckOrphanedContainer
      expr: (time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: 'Possible orphaned container on puck: {{ $labels.name }}'
        description: 'Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}'

    # No comparison operator: the alert fires for every series the selector
    # returns, i.e. any container on puck whose image matches the MCP patterns.
    - alert: PuckMCPContainerOnPuck
      expr: container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"}
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: 'MCP container detected on puck (WRONG HOST)'
        description: 'Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus'

    # Summed CPU rate per named container, as a percentage, above 80%.
    - alert: PuckContainerHighCPU
      expr: sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'High CPU container on puck: {{ $labels.name }}'
        description: 'Container {{ $labels.name }} is using {{ $value | printf "%.1f" }}% CPU'

    # Container memory usage above 1073741824 bytes (1 GiB).
    # NOTE(review): container_memory_usage_bytes may include page cache;
    # container_memory_working_set_bytes could be a closer match - confirm.
    - alert: PuckContainerHighMemory
      expr: container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'High memory container on puck: {{ $labels.name }}'
        description: 'Container {{ $labels.name }} is using {{ $value | humanize }} memory'

    # Any growth of the OOM event counter within five minutes; "for: 0m" means
    # the alert fires on the first evaluation that sees it.
    - alert: PuckContainerOOMKilled
      expr: increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: 'Container OOM killed on puck: {{ $labels.name }}'
        description: 'Container {{ $labels.name }} was killed by OOM killer'
|
||||
|
||||
# ============================================================================
|
||||
# Service/Application Alerts
|
||||
# ============================================================================
|
||||
- name: service_alerts
  rules:
    # Warning-level counterpart of InstanceDown in node_alerts: same expression
    # (up == 0) with a longer five-minute hold.
    - alert: PrometheusTargetMissing
      expr: up == 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Prometheus target missing: {{ $labels.instance }}'
        description: 'A Prometheus target has been down for more than 5 minutes.'

    # Fires when no `up` series exists at all for the node-exporter job.
    - alert: PrometheusJobMissing
      expr: absent(up{job="node-exporter"})
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Prometheus job missing'
        description: 'A Prometheus job has disappeared from target discovery.'

    # Fires when no alertmanager target is currently up. The "== 1" filter is
    # what makes this cover a failing scrape: a bare absent(up{...}) stays
    # silent while the series still exists with value 0, i.e. exactly when
    # Alertmanager is configured but not responding.
    - alert: AlertmanagerDown
      expr: absent(up{job="alertmanager"} == 1)
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'Alertmanager is down'
        description: 'Alertmanager is not responding. Alerts may not be delivered.'
|
||||
|
||||
# ============================================================================
|
||||
# Loki/Logging Alerts
|
||||
# ============================================================================
|
||||
- name: loki_alerts
  rules:
    # Total distributor ingest rate above 10485760 bytes/s (10 MiB/s) sustained
    # for ten minutes.
    - alert: LokiHighLogVolume
      expr: sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: 'High log ingestion rate'
        description: 'Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging'
|
||||
|
||||
# Red Panda Seal of Approval 🐼
|
||||
# "If the metrics aren't red, go back to bed"
|
||||
# End of the raw section opened above the groups key.
{% endraw %}
|
||||
Reference in New Issue
Block a user