# Introduce structured journal relabel rules on puck to tag Pallas-managed
# units with {service, project, component} labels matching the Mnemosyne
# and Daedalus schema. Add kottos release variable and vault secrets
# example entries for the new Pallas FastAgent runtime.
# Remove the defunct mnemosyne syslog listener now that Mnemosyne ships
# JSON logs via the docker-socket pipeline.
# 390 lines
# 17 KiB
# Django/Jinja
# Prometheus Alert Rules
|
|
# Red Panda Approved 🐼
|
|
# Deployed to: /etc/prometheus/alert_rules.yml
|
|
{% raw %}
|
|
groups:
|
|
# ============================================================================
|
|
# Node/Infrastructure Alerts
|
|
# ============================================================================
|
|
- name: node_alerts
  # Host-level health: target reachability plus CPU, memory, disk and load
  # thresholds. CPU, memory and disk each have a warning tier and a stricter,
  # faster-firing critical tier.
  rules:
  # Any scrape target whose `up` series has been 0 for two minutes.
  - alert: InstanceDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Instance {{ $labels.instance }} is down'
      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.'

  # CPU busy % = 100 - (per-instance average idle rate over 5m * 100).
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU usage on {{ $labels.instance }}'
      description: 'CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

  # Same expression as HighCPUUsage with a 95% threshold and a 2m hold.
  - alert: CriticalCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Critical CPU usage on {{ $labels.instance }}'
      description: 'CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

  # Memory used % = (1 - MemAvailable / MemTotal) * 100.
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High memory usage on {{ $labels.instance }}'
      description: 'Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

  # Same expression as HighMemoryUsage with a 95% threshold and a 2m hold.
  - alert: CriticalMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Critical memory usage on {{ $labels.instance }}'
      description: 'Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

  # Free space % per filesystem; tmpfs and overlay filesystems are excluded.
  - alert: DiskSpaceLow
    expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Low disk space on {{ $labels.instance }}'
      description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf "%.1f" }}%)'

  # Same expression as DiskSpaceLow with a 10% threshold and a 2m hold.
  - alert: DiskSpaceCritical
    expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Critical disk space on {{ $labels.instance }}'
      description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf "%.1f" }}%)'

  # 15-minute load divided by the number of idle-mode CPU series remaining
  # after dropping the cpu and mode labels; fires above a ratio of 2.
  - alert: HighLoadAverage
    expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High load average on {{ $labels.instance }}'
      description: '15-minute load average is {{ $value | printf "%.2f" }} times the CPU count on {{ $labels.instance }}'
|
|
|
|
# ============================================================================
|
|
# Process-Level Alerts (puck.incus)
|
|
# ============================================================================
|
|
- name: puck_process_alerts
  # Per-process-group rules for instances matching "puck.*", built on the
  # namedprocess_namegroup_* series.
  rules:
  # CPU seconds consumed per second, summed per process group, times 100
  # (so 100 corresponds to one fully busy core).
  - alert: PuckHighCPUProcess
    expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High CPU process on puck: {{ $labels.groupname }}"
      description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU for more than 2 minutes"

  # Same measure over a 1m window with a 95 threshold and a 1m hold.
  - alert: PuckCriticalCPUProcess
    expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Critical CPU process on puck: {{ $labels.groupname }}"
      description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU - immediate attention required"

  # Resident memory above 1073741824 bytes (1 GiB).
  - alert: PuckHighMemoryProcess
    expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High memory process on puck: {{ $labels.groupname }}"
      description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory"

  # Resident memory above 2147483648 bytes (2 GiB).
  - alert: PuckCriticalMemoryProcess
    expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Critical memory process on puck: {{ $labels.groupname }}"
      description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required"

  # Process count fell by more than one within 5 minutes.
  # num_procs is a gauge, so the change must be measured with delta():
  # increase() interprets every drop as a counter reset and therefore never
  # yields a negative value, which made the previous "< -1" test unreachable.
  - alert: PuckProcessCrashLoop
    expr: delta(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Process count dropped on puck: {{ $labels.groupname }}"
      description: "Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart"
|
|
|
|
# ============================================================================
|
|
# Docker Container Alerts (puck.incus)
|
|
# ============================================================================
|
|
- name: puck_container_alerts
  # Container rules for instances matching "puck.*". Most selectors use
  # name!="" to restrict to series that carry a container name.
  rules:
  # More than five named container series present on puck.
  - alert: PuckHighContainerCount
    expr: count(container_last_seen{instance=~"puck.*", name!=""}) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High container count on puck'
      description: 'puck.incus has {{ $value }} running containers, which exceeds the threshold of 5'

  # More than two containers sharing one image on the same instance.
  - alert: PuckDuplicateContainers
    expr: count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Duplicate containers on puck: {{ $labels.image }}'
      description: '{{ $value }} containers running the same image {{ $labels.image }} on puck'

  # Containers whose name contains an underscore (treated here as an
  # auto-generated name) and whose age exceeds 3600 seconds.
  - alert: PuckOrphanedContainer
    expr: (time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Possible orphaned container on puck: {{ $labels.name }}'
      description: 'Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}'

  # No threshold: the mere existence of a series whose image matches
  # mcp-server / mcp_server on puck raises the alert.
  - alert: PuckMCPContainerOnPuck
    expr: container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"}
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'MCP container detected on puck (WRONG HOST)'
      description: 'Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus'

  # Per-container CPU seconds per second over 2m, times 100.
  - alert: PuckContainerHighCPU
    expr: sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU container on puck: {{ $labels.name }}'
      description: 'Container {{ $labels.name }} is using {{ $value | printf "%.1f" }}% CPU'

  # Container memory usage above 1073741824 bytes (1 GiB).
  - alert: PuckContainerHighMemory
    expr: container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High memory container on puck: {{ $labels.name }}'
      description: 'Container {{ $labels.name }} is using {{ $value | humanize }} memory'

  # Fires immediately (for: 0m) on any OOM event counted in the last 5m.
  - alert: PuckContainerOOMKilled
    expr: increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'Container OOM killed on puck: {{ $labels.name }}'
      description: 'Container {{ $labels.name }} was killed by OOM killer'
|
|
|
|
# ============================================================================
|
|
# Service/Application Alerts
|
|
# ============================================================================
|
|
- name: service_alerts
  # Monitoring-stack self-checks: scrape targets, the node-exporter job and
  # Alertmanager.
  rules:
  # Same condition as InstanceDown (up == 0) but with warning severity and
  # a longer 5m hold.
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus target missing: {{ $labels.instance }}"
      description: "A Prometheus target has been down for more than 5 minutes."

  # absent() returns a value only when no up{job="node-exporter"} series
  # exists at all, i.e. the job is gone from target discovery.
  - alert: PrometheusJobMissing
    expr: absent(up{job="node-exporter"})
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus job missing"
      description: "A Prometheus job has disappeared from target discovery."

  # Fires when no alertmanager target reports up == 1. The "== 1" filter is
  # required: a target that is still scraped but failing exposes up == 0,
  # and a bare absent(up{...}) would stay silent in exactly that case.
  - alert: AlertmanagerDown
    expr: absent(up{job="alertmanager"} == 1)
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Alertmanager is down"
      description: "Alertmanager is not responding. Alerts may not be delivered."
|
|
|
|
# ============================================================================
|
|
# Loki/Logging Alerts
|
|
# ============================================================================
|
|
- name: loki_alerts
  rules:
  # Total bytes/second received by the Loki distributor, averaged over 5m.
  # Threshold 10485760 bytes/s (10 MiB/s), sustained for 10 minutes.
  - alert: LokiHighLogVolume
    expr: sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High log ingestion rate'
      description: 'Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging'
|
|
|
|
# ============================================================================
|
|
# Daedalus Application Alerts
|
|
# ============================================================================
|
|
- name: daedalus_alerts
  # Application rules built on the daedalus_* series: availability, MCP
  # connectivity, HTTP errors and latency, client exceptions, S3 errors.
  rules:
  - alert: DaedalusDown
    expr: daedalus_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Daedalus is down"
      description: "Daedalus has been unreachable for more than 1 minute."

  - alert: DaedalusMCPDisconnected
    expr: daedalus_mcp_connections_active == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Daedalus has no active MCP connections"
      description: "Daedalus has reported zero active MCP connections for 5 minutes."

  # Share of requests answered with a 5xx status over 5m.
  # Both sides are aggregated before dividing. Without aggregation the
  # division matches series one-to-one on identical label sets, so each 5xx
  # series is divided by itself and the ratio is 1 whenever any 5xx traffic
  # exists. job and instance are kept so the alert still identifies its
  # source.
  - alert: DaedalusHighErrorRate
    expr: |
      sum by (job, instance) (rate(daedalus_http_requests_total{status=~"5.."}[5m]))
        / sum by (job, instance) (rate(daedalus_http_requests_total[5m])) > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Daedalus HTTP 5xx error rate above 5%"
      description: "Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests."

  # rate() is per second; multiply by 60 so both the threshold and the
  # reported value are per minute, as the annotations state.
  - alert: DaedalusClientExceptionSpike
    expr: rate(daedalus_client_exceptions_total[1m]) * 60 > 10
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Daedalus client exception spike"
      description: "Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf \"%.1f\" }}/min)."

  # No aggregation by le here, so the quantile is evaluated separately for
  # each label set of the bucket series.
  - alert: DaedalusSlowResponses
    expr: histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Daedalus p95 response time above 5s"
      description: "Daedalus p95 response latency is {{ $value | printf \"%.2f\" }}s."

  - alert: DaedalusMCPLatency
    expr: histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Daedalus MCP p95 latency above 30s"
      description: "Daedalus MCP p95 latency is {{ $value | printf \"%.2f\" }}s."

  # NOTE(review): this ratio only produces a result where the errors and
  # requests counters carry identical label sets - confirm against the
  # exporter, and aggregate both sides if they differ.
  - alert: DaedalusS3Errors
    expr: rate(daedalus_s3_errors_total[5m]) / rate(daedalus_s3_requests_total[5m]) > 0.01
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Daedalus S3 error rate above 1%"
      description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
|
|
|
|
# ============================================================================
|
|
# Mnemosyne Application Alerts
|
|
# ============================================================================
|
|
# One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint.
|
|
# The Django app container hosts the single prometheus_client registry that
|
|
# both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool
|
|
# call counters) write to, so "MCP is broken" signals show up as
|
|
# ``mcp_tool_invocations_total{status="error"}`` on the same job rather
|
|
# than a separate up{} series.
|
|
- name: mnemosyne_alerts
  # All HTTP and MCP rules below select on job="mnemosyne".
  rules:
  # The scrape of the mnemosyne job has failed for two minutes.
  - alert: MnemosyneDown
    expr: up{job="mnemosyne"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Mnemosyne is down'
      description: 'The Mnemosyne /metrics endpoint has been unreachable for more than 2 minutes. Both the Django app and the MCP server (same container family) are presumed unavailable.'

  # Aggregate 5xx share of all Django HTTP responses over 5m.
  - alert: MnemosyneHighErrorRate
    expr: |
      sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m]))
      / sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne HTTP 5xx error rate above 5%'
      description: 'Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes.'

  # p95 of request latency (including middlewares), with buckets summed
  # by le across all series before the quantile is taken.
  - alert: MnemosyneSlowResponses
    expr: |
      histogram_quantile(0.95,
        sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m]))
      ) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne p95 response time above 5s'
      description: 'Mnemosyne p95 response latency is {{ $value | printf "%.2f" }}s over the last 5 minutes.'

  # MCP tool-call errors, emitted by mcp_server.metrics on the same
  # /metrics endpoint. Complements MnemosyneDown by covering the case where
  # the app is up but the MCP layer is unhealthy — e.g. auth token lookups
  # are failing, or Neo4j vector search is returning 500s.
  - alert: MnemosyneMCPToolErrors
    expr: |
      sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m]))
      / sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne MCP tool error rate above 10%'
      description: 'MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service="mnemosyne", component="mcp"}).'

  # Celery queue depth. A high pending count usually means the embedding
  # worker is stuck or throttled by the embedding provider. Depends on
  # ``celery-prometheus-exporter`` (or similar) emitting
  # ``celery_queue_length``; until that is deployed the rule never fires.
  - alert: MnemosyneCeleryBacklog
    expr: |
      sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne Celery backlog on {{ $labels.queue }}'
      # Kept double-quoted: the text itself contains single quotes.
      description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
|
|
|
|
# Red Panda Seal of Approval 🐼
|
|
# "If the metrics aren't red, go back to bed"
|
|
{% endraw %}
|