Files
ouranos/ansible/pplg/alert_rules.yml.j2
Robert Helewka 8c95173705 feat(alloy): add journal relabeling and kottos integration on puck
Introduce structured journal relabel rules on puck to tag Pallas-managed
units with {service, project, component} labels matching the Mnemosyne
and Daedalus schema. Add kottos release variable and vault secrets
example entries for the new Pallas FastAgent runtime.

Remove the defunct mnemosyne syslog listener now that Mnemosyne ships
JSON logs via the docker-socket pipeline.
2026-05-11 13:54:14 -04:00

390 lines
17 KiB
Django/Jinja

# Prometheus Alert Rules
# Red Panda Approved 🐼
# Deployed to: /etc/prometheus/alert_rules.yml
{% raw %}
groups:
# ============================================================================
# Node/Infrastructure Alerts
# ============================================================================
- name: node_alerts
  # Host-level health checks: scrape reachability, CPU, memory, disk, load.
  # Warning/critical pairs share one expression and differ only in the
  # threshold and the `for` hold time.
  rules:
  # Any scrape target whose `up` sample is 0.
  - alert: InstanceDown
    expr: 'up == 0'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Instance {{ $labels.instance }} is down'
      description: >-
        {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.

  # CPU busy % = 100 - (per-instance average idle rate * 100).
  - alert: HighCPUUsage
    expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU usage on {{ $labels.instance }}'
      description: >-
        CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)

  - alert: CriticalCPUUsage
    expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Critical CPU usage on {{ $labels.instance }}'
      description: >-
        CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)

  # Memory used % = (1 - MemAvailable / MemTotal) * 100.
  - alert: HighMemoryUsage
    expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High memory usage on {{ $labels.instance }}'
      description: >-
        Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)

  - alert: CriticalMemoryUsage
    expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Critical memory usage on {{ $labels.instance }}'
      description: >-
        Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)

  # Free-space % per filesystem; tmpfs and overlay mounts are excluded.
  - alert: DiskSpaceLow
    expr: '(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Low disk space on {{ $labels.instance }}'
      description: >-
        Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf "%.1f" }}%)

  - alert: DiskSpaceCritical
    expr: '(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Critical disk space on {{ $labels.instance }}'
      description: >-
        Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf "%.1f" }}%)

  # 15-minute load divided by the number of idle-mode CPU series
  # (one per CPU), i.e. load per core.
  - alert: HighLoadAverage
    expr: 'node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High load average on {{ $labels.instance }}'
      description: >-
        15-minute load average is {{ $value | printf "%.2f" }} times the CPU count on {{ $labels.instance }}
# ============================================================================
# Process-Level Alerts (puck.incus)
# ============================================================================
- name: puck_process_alerts
  # Per-process-group alerts built on namedprocess_namegroup_* series,
  # restricted to instances whose name matches "puck.*".
  rules:
  # CPU seconds consumed per second * 100, summed per process group.
  - alert: PuckHighCPUProcess
    expr: 'sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU process on puck: {{ $labels.groupname }}'
      description: >-
        Process {{ $labels.groupname }} is using {{ $value | printf "%.1f" }}% CPU for more than 2 minutes

  - alert: PuckCriticalCPUProcess
    expr: 'sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Critical CPU process on puck: {{ $labels.groupname }}'
      description: >-
        Process {{ $labels.groupname }} is using {{ $value | printf "%.1f" }}% CPU - immediate attention required

  # 1073741824 bytes = 1 GiB of resident memory.
  - alert: PuckHighMemoryProcess
    expr: 'namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High memory process on puck: {{ $labels.groupname }}'
      description: >-
        Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory

  # 2147483648 bytes = 2 GiB of resident memory.
  - alert: PuckCriticalMemoryProcess
    expr: 'namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Critical memory process on puck: {{ $labels.groupname }}'
      description: >-
        Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required

  # num_procs is a gauge, so the change over the window must be taken with
  # delta(). increase() is counter-only: it treats every drop as a counter
  # reset and never yields a negative value, which made `< -1` unreachable.
  - alert: PuckProcessCrashLoop
    expr: 'delta(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1'
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 'Process count dropped on puck: {{ $labels.groupname }}'
      description: >-
        Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart
# ============================================================================
# Docker Container Alerts (puck.incus)
# ============================================================================
- name: puck_container_alerts
  # Container-level alerts for instances matching "puck.*". The name!=""
  # matcher keeps only series that carry a container name.
  rules:
  # More than 5 named containers seen on puck.
  - alert: PuckHighContainerCount
    expr: 'count(container_last_seen{instance=~"puck.*", name!=""}) > 5'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High container count on puck'
      description: >-
        puck.incus has {{ $value }} running containers, which exceeds the threshold of 5

  # More than 2 containers sharing the same image.
  - alert: PuckDuplicateContainers
    expr: 'count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Duplicate containers on puck: {{ $labels.image }}'
      description: >-
        {{ $value }} containers running the same image {{ $labels.image }} on puck

  # Containers whose name contains an underscore and that started more
  # than 3600 s (1 h) ago; the annotation treats such names as
  # auto-generated.
  - alert: PuckOrphanedContainer
    expr: '(time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Possible orphaned container on puck: {{ $labels.name }}'
      description: >-
        Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}

  # No comparison operator: the mere existence of a matching series fires.
  - alert: PuckMCPContainerOnPuck
    expr: 'container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"}'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'MCP container detected on puck (WRONG HOST)'
      description: >-
        Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus

  # CPU seconds consumed per second * 100, summed per container.
  - alert: PuckContainerHighCPU
    expr: 'sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU container on puck: {{ $labels.name }}'
      description: >-
        Container {{ $labels.name }} is using {{ $value | printf "%.1f" }}% CPU

  # 1073741824 bytes = 1 GiB.
  - alert: PuckContainerHighMemory
    expr: 'container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824'
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High memory container on puck: {{ $labels.name }}'
      description: >-
        Container {{ $labels.name }} is using {{ $value | humanize }} memory

  # `for: 0m` fires on the first evaluation that sees a new OOM event.
  - alert: PuckContainerOOMKilled
    expr: 'increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0'
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'Container OOM killed on puck: {{ $labels.name }}'
      description: >-
        Container {{ $labels.name }} was killed by OOM killer
# ============================================================================
# Service/Application Alerts
# ============================================================================
- name: service_alerts
  # Health of the monitoring stack itself: scrape targets, the
  # node-exporter job, and Alertmanager.
  rules:
  # Same expression as InstanceDown in node_alerts, but at warning
  # severity with a longer (5m) hold time.
  - alert: PrometheusTargetMissing
    expr: 'up == 0'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Prometheus target missing: {{ $labels.instance }}'
      description: >-
        A Prometheus target has been down for more than 5 minutes.

  # absent() returns a value only when no `up` series exists for the job,
  # i.e. the job has vanished from target discovery altogether.
  - alert: PrometheusJobMissing
    expr: 'absent(up{job="node-exporter"})'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Prometheus job missing'
      description: >-
        A Prometheus job has disappeared from target discovery.

  # Filter on `== 1` inside absent() so the alert fires both when the
  # series is missing AND when the target is still scraped but reports
  # up == 0. A bare absent(up{...}) stays silent in the second case,
  # which is exactly the "not responding" condition described below.
  - alert: AlertmanagerDown
    expr: 'absent(up{job="alertmanager"} == 1)'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Alertmanager is down'
      description: >-
        Alertmanager is not responding. Alerts may not be delivered.
# ============================================================================
# Loki/Logging Alerts
# ============================================================================
- name: loki_alerts
  # Log-pipeline alerts based on Loki distributor metrics.
  rules:
  # 10485760 bytes/s = 10 MiB/s of ingested log data, summed over all
  # distributor series.
  - alert: LokiHighLogVolume
    expr: 'sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High log ingestion rate'
      description: >-
        Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging
# ============================================================================
# Daedalus Application Alerts
# ============================================================================
- name: daedalus_alerts
  # Application alerts driven by daedalus_* metrics: availability, MCP
  # connectivity, HTTP errors and latency, client exceptions, S3 errors.
  rules:
  # NOTE(review): this relies on daedalus_up still being exported with
  # value 0 while Daedalus is unreachable - confirm which component
  # emits it; if the series simply disappears, this rule never fires.
  - alert: DaedalusDown
    expr: 'daedalus_up == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Daedalus is down'
      description: >-
        Daedalus has been unreachable for more than 1 minute.

  - alert: DaedalusMCPDisconnected
    expr: 'daedalus_mcp_connections_active == 0'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus has no active MCP connections'
      description: >-
        Daedalus has reported zero active MCP connections for 5 minutes.

  # Both sides are aggregated with sum() before dividing. Without it,
  # one-to-one vector matching pairs each 5xx series only with itself
  # (same `status` label), so the ratio is 1 or NaN rather than the
  # share of 5xx responses among all requests.
  - alert: DaedalusHighErrorRate
    expr: 'sum(rate(daedalus_http_requests_total{status=~"5.."}[5m])) / sum(rate(daedalus_http_requests_total[5m])) > 0.05'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus HTTP 5xx error rate above 5%'
      description: >-
        Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests.

  # rate() is per second; multiply by 60 so both the threshold and
  # $value are per minute, as the description states.
  - alert: DaedalusClientExceptionSpike
    expr: 'rate(daedalus_client_exceptions_total[1m]) * 60 > 10'
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus client exception spike'
      description: >-
        Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf "%.1f" }}/min).

  # p95 is computed per label set (no aggregation over the buckets).
  - alert: DaedalusSlowResponses
    expr: 'histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus p95 response time above 5s'
      description: >-
        Daedalus p95 response latency is {{ $value | printf "%.2f" }}s.

  - alert: DaedalusMCPLatency
    expr: 'histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus MCP p95 latency above 30s'
      description: >-
        Daedalus MCP p95 latency is {{ $value | printf "%.2f" }}s.

  # NOTE(review): the division matches series one-to-one on identical
  # label sets - confirm daedalus_s3_errors_total and
  # daedalus_s3_requests_total carry the same labels, otherwise wrap
  # both sides in sum().
  - alert: DaedalusS3Errors
    expr: 'rate(daedalus_s3_errors_total[5m]) / rate(daedalus_s3_requests_total[5m]) > 0.01'
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus S3 error rate above 1%'
      description: >-
        Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes.
# ============================================================================
# Mnemosyne Application Alerts
# ============================================================================
# One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint.
# The Django app container hosts the single prometheus_client registry that
# both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool
# call counters) write to, so "MCP is broken" signals show up as
# ``mcp_tool_invocations_total{status="error"}`` on the same job rather
# than a separate up{} series.
- name: mnemosyne_alerts
  # All rules except the Celery one select the single `mnemosyne` scrape job.
  rules:
  # Scrape of the mnemosyne job failing for 2 minutes.
  - alert: MnemosyneDown
    expr: 'up{job="mnemosyne"} == 0'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Mnemosyne is down'
      description: >-
        The Mnemosyne /metrics endpoint has been unreachable for more than 2 minutes. Both the Django app and the MCP server (same container family) are presumed unavailable.

  # Share of 5xx responses among all responses, aggregated over the job.
  - alert: MnemosyneHighErrorRate
    expr: |
      sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m]))
      / sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne HTTP 5xx error rate above 5%'
      description: >-
        Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes.

  # p95 latency from the middleware-inclusive histogram, buckets summed
  # by `le` across the job.
  - alert: MnemosyneSlowResponses
    expr: |
      histogram_quantile(0.95,
      sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m]))
      ) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne p95 response time above 5s'
      description: >-
        Mnemosyne p95 response latency is {{ $value | printf "%.2f" }}s over the last 5 minutes.

  # MCP tool-call errors, emitted by mcp_server.metrics on the same
  # /metrics endpoint. Complements MnemosyneDown: it covers the case
  # where the app is up but the MCP layer is unhealthy, e.g. auth token
  # lookups failing or Neo4j vector search returning 500s.
  - alert: MnemosyneMCPToolErrors
    expr: |
      sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m]))
      / sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne MCP tool error rate above 10%'
      description: >-
        MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service="mnemosyne", component="mcp"}).

  # Celery queue depth. A high pending count usually means the embedding
  # worker is stuck or throttled by the embedding provider. Needs
  # ``celery-prometheus-exporter`` (or similar) to emit
  # ``celery_queue_length``; until that is deployed the rule never fires.
  - alert: MnemosyneCeleryBacklog
    expr: |
      sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne Celery backlog on {{ $labels.queue }}'
      description: >-
        Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service="mnemosyne", component="worker"}).
# Red Panda Seal of Approval 🐼
# "If the metrics aren't red, go back to bed"
{% endraw %}