- Drop `FREECAD_MCP_` prefix from env vars (use `FREECAD_*`) - Update freecad_mcp port from 22032 to 22061 - Document that FreeCAD bridge is required for tool calls - Replace kottos deployment with pallas deployment
449 lines
20 KiB
Django/Jinja
449 lines
20 KiB
Django/Jinja
# Prometheus Alert Rules
|
|
# Red Panda Approved 🐼
|
|
# Deployed to: /etc/prometheus/alert_rules.yml
|
|
{% raw %}
|
|
groups:
|
|
# ============================================================================
# Node/Infrastructure Alerts
# ============================================================================
# Availability, CPU, memory, disk and load rules built on `up` and the
# `node_*` metrics. Each resource has a warning tier and a stricter, faster
# critical tier.
- name: node_alerts
  rules:
  # Any scrape target whose `up` sample has been 0 for 2 minutes.
  - alert: InstanceDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Instance {{ $labels.instance }} is down'
      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.'

  # Non-idle CPU percentage: 100 minus the per-instance average idle rate
  # (5m window) expressed as a percentage.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU usage on {{ $labels.instance }}'
      description: 'CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

  # Same expression as HighCPUUsage with a 95% threshold and a 2m hold.
  - alert: CriticalCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Critical CPU usage on {{ $labels.instance }}'
      description: 'CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

  # Used memory percentage, derived from MemAvailable / MemTotal.
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High memory usage on {{ $labels.instance }}'
      description: 'Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

  # Same expression as HighMemoryUsage with a 95% threshold and a 2m hold.
  - alert: CriticalMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Critical memory usage on {{ $labels.instance }}'
      description: 'Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf "%.1f" }}%)'

  # Free-space percentage per filesystem; tmpfs and overlay mounts are
  # excluded by the fstype matcher.
  - alert: DiskSpaceLow
    expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Low disk space on {{ $labels.instance }}'
      description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf "%.1f" }}%)'

  # Same expression as DiskSpaceLow with a 10% threshold and a 2m hold.
  - alert: DiskSpaceCritical
    expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Critical disk space on {{ $labels.instance }}'
      description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf "%.1f" }}%)'

  # 15-minute load average divided by the number of idle-mode CPU series
  # (one per CPU), i.e. load per core.
  - alert: HighLoadAverage
    expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High load average on {{ $labels.instance }}'
      description: '15-minute load average is {{ $value | printf "%.2f" }} times the CPU count on {{ $labels.instance }}'

# ============================================================================
# Process-Level Alerts (puck.incus)
# ============================================================================
# Rules over the `namedprocess_namegroup_*` metrics, restricted to instances
# whose name matches `puck.*`.
- name: puck_process_alerts
  rules:
  # CPU seconds consumed per second by a process group, as a percentage
  # (2m rate window).
  - alert: PuckHighCPUProcess
    expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU process on puck: {{ $labels.groupname }}'
      description: 'Process {{ $labels.groupname }} is using {{ $value | printf "%.1f" }}% CPU for more than 2 minutes'

  # Same measure over a 1m rate window with a 95% threshold.
  - alert: PuckCriticalCPUProcess
    expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Critical CPU process on puck: {{ $labels.groupname }}'
      description: 'Process {{ $labels.groupname }} is using {{ $value | printf "%.1f" }}% CPU - immediate attention required'

  # Resident memory of a process group above 1073741824 bytes (1 GiB).
  - alert: PuckHighMemoryProcess
    expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High memory process on puck: {{ $labels.groupname }}'
      description: 'Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory'

  # Resident memory of a process group above 2147483648 bytes (2 GiB).
  - alert: PuckCriticalMemoryProcess
    expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Critical memory process on puck: {{ $labels.groupname }}'
      description: 'Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required'

  # Process count of a group dropped over the last 5 minutes.
  # The process count is a gauge, so the change is measured with delta().
  # increase() is counter-only: it interprets every decrease as a counter
  # reset and never yields a negative value, which made the previous
  # `increase(...) < -1` condition impossible to satisfy.
  # delta() extrapolates to the window edges, so the value may be fractional.
  - alert: PuckProcessCrashLoop
    expr: delta(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 'Process count dropped on puck: {{ $labels.groupname }}'
      description: 'Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart'

# ============================================================================
# Docker Container Alerts (puck.incus)
# ============================================================================
# Rules over the `container_*` metrics, restricted to instances whose name
# matches `puck.*`. Most rules drop series with an empty `name` label.
- name: puck_container_alerts
  rules:
  # More than 5 named container series present.
  - alert: PuckHighContainerCount
    expr: count(container_last_seen{instance=~"puck.*", name!=""}) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High container count on puck'
      description: 'puck.incus has {{ $value }} running containers, which exceeds the threshold of 5'

  # More than 2 named containers sharing one image on the same instance.
  - alert: PuckDuplicateContainers
    expr: count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Duplicate containers on puck: {{ $labels.image }}'
      description: '{{ $value }} containers running the same image {{ $labels.image }} on puck'

  # Containers whose name contains an underscore and that started more than
  # 3600 seconds (1 hour) ago; the value is the container age in seconds.
  - alert: PuckOrphanedContainer
    expr: (time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Possible orphaned container on puck: {{ $labels.name }}'
      description: 'Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}'

  # No comparison operator: the rule fires for every series the selector
  # returns, i.e. whenever a container with an MCP-server image exists.
  - alert: PuckMCPContainerOnPuck
    expr: container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"}
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'MCP container detected on puck (WRONG HOST)'
      description: 'Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus'

  # CPU seconds consumed per second by a container, as a percentage
  # (2m rate window).
  - alert: PuckContainerHighCPU
    expr: sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU container on puck: {{ $labels.name }}'
      description: 'Container {{ $labels.name }} is using {{ $value | printf "%.1f" }}% CPU'

  # Container memory usage above 1073741824 bytes (1 GiB).
  - alert: PuckContainerHighMemory
    expr: container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'High memory container on puck: {{ $labels.name }}'
      description: 'Container {{ $labels.name }} is using {{ $value | humanize }} memory'

  # Any growth of the OOM event counter within 5 minutes; `for: 0m` fires on
  # the first evaluation that sees it.
  - alert: PuckContainerOOMKilled
    expr: increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: 'Container OOM killed on puck: {{ $labels.name }}'
      description: 'Container {{ $labels.name }} was killed by OOM killer'

# ============================================================================
# Service/Application Alerts
# ============================================================================
# Health of the monitoring pipeline itself: scrape targets, the node-exporter
# job and Alertmanager.
- name: service_alerts
  rules:
  # Same expression as InstanceDown in node_alerts, at warning severity with
  # a longer 5m hold.
  - alert: PrometheusTargetMissing
    expr: up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Prometheus target missing: {{ $labels.instance }}'
      description: 'A Prometheus target has been down for more than 5 minutes.'

  # absent() returns a sample only when no `up` series exists for the job,
  # i.e. the job has no targets at all.
  - alert: PrometheusJobMissing
    expr: absent(up{job="node-exporter"})
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Prometheus job missing'
      description: 'A Prometheus job has disappeared from target discovery.'

  # Fires when no Alertmanager target reports up == 1. The `== 1` filter is
  # required: a plain absent(up{...}) stays silent while the target exists
  # but its scrapes fail (up == 0), which is exactly the "down" case the
  # annotations describe. This form covers both a failing and a vanished
  # target.
  - alert: AlertmanagerDown
    expr: absent(up{job="alertmanager"} == 1)
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Alertmanager is down'
      description: 'Alertmanager is not responding. Alerts may not be delivered.'

# ============================================================================
# Loki/Logging Alerts
# ============================================================================
- name: loki_alerts
  rules:
  # Total bytes/second received by the Loki distributor (5m rate window)
  # above 10485760 bytes/s (10 MiB/s) for 10 minutes.
  - alert: LokiHighLogVolume
    expr: sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'High log ingestion rate'
      description: 'Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging'

# ============================================================================
# Django Application Alerts (generic — any Django app exporting the counter)
# ============================================================================
# Apps emit django_superuser_logins_total from a user_logged_in signal when
# the authenticating user is a superuser. The job/component labels identify
# which app fired; forensic detail (user, IP) is in the matching Loki line.
- name: django_alerts
  rules:
  # Any growth of the superuser-login counter within 5 minutes; `for: 0m`
  # fires on the first evaluation that sees it.
  - alert: DjangoSuperuserLogin
    expr: increase(django_superuser_logins_total[5m]) > 0
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: 'Superuser login on {{ $labels.job }}'
      description: 'A superuser account just logged in to {{ $labels.job }} (component {{ $labels.component }}). This account is rarely used — confirm it was expected. Forensic detail (user, IP) in Loki: {service="{{ $labels.job }}"} |= "event=superuser_login".'

# ============================================================================
# Daedalus Application Alerts
# ============================================================================
# Rules over the `daedalus_*` metrics: availability, MCP connectivity, HTTP
# error rate and latency, client exceptions and S3 errors.
- name: daedalus_alerts
  rules:
  # The application-reported `daedalus_up` gauge has been 0 for 1 minute.
  - alert: DaedalusDown
    expr: daedalus_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'Daedalus is down'
      description: 'Daedalus has been unreachable for more than 1 minute.'

  # Zero active MCP connections for 5 minutes.
  - alert: DaedalusMCPDisconnected
    expr: daedalus_mcp_connections_active == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus has no active MCP connections'
      description: 'Daedalus has reported zero active MCP connections for 5 minutes.'

  # Share of HTTP requests answered with a 5xx status over 5 minutes.
  # Both sides are aggregated with sum() before dividing. Without it the
  # division matches series one-to-one on identical label sets, so every 5xx
  # series is divided by itself and the ratio is 1 whenever any 5xx traffic
  # exists, regardless of how much successful traffic there is.
  - alert: DaedalusHighErrorRate
    expr: |
      sum(rate(daedalus_http_requests_total{status=~"5.."}[5m]))
        / sum(rate(daedalus_http_requests_total[5m])) > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus HTTP 5xx error rate above 5%'
      description: 'Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests.'

  # Client exceptions per minute. rate() yields a per-second value, so it is
  # multiplied by 60 to match the per-minute threshold and the "/min" unit
  # used in the description.
  # NOTE(review): a 1m rate window needs at least two samples, which presumes
  # a scrape interval of 30s or less — confirm against the scrape config.
  - alert: DaedalusClientExceptionSpike
    expr: rate(daedalus_client_exceptions_total[1m]) * 60 > 10
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus client exception spike'
      description: 'Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf "%.1f" }}/min).'

  # p95 HTTP request duration above 5 seconds. The buckets are not
  # aggregated, so the quantile is computed separately for each label set.
  - alert: DaedalusSlowResponses
    expr: histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus p95 response time above 5s'
      description: 'Daedalus p95 response latency is {{ $value | printf "%.2f" }}s.'

  # p95 MCP request duration above 30 seconds, computed per label set.
  - alert: DaedalusMCPLatency
    expr: histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus MCP p95 latency above 30s'
      description: 'Daedalus MCP p95 latency is {{ $value | printf "%.2f" }}s.'

  # S3 errors as a fraction of S3 requests over 5 minutes.
  # NOTE(review): the division matches the two counters one-to-one on
  # identical label sets; if they carry different labels the result is empty
  # and the rule cannot fire — confirm against the exported metrics.
  - alert: DaedalusS3Errors
    expr: rate(daedalus_s3_errors_total[5m]) / rate(daedalus_s3_requests_total[5m]) > 0.01
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Daedalus S3 error rate above 1%'
      description: 'Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes.'

# ============================================================================
# Mnemosyne Application Alerts
# ============================================================================
# One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint.
# The Django app container hosts the single prometheus_client registry that
# both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool
# call counters) write to, so "MCP is broken" signals show up as
# ``mcp_tool_invocations_total{status="error"}`` on the same job rather
# than a separate up{} series.
- name: mnemosyne_alerts
  rules:
  # The ``mnemosyne`` scrape target has reported up == 0 for 2 minutes.
  - alert: MnemosyneDown
    expr: up{job="mnemosyne"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Mnemosyne is down'
      description: 'The Mnemosyne /metrics endpoint has been unreachable for more than 2 minutes. Both the Django app and the MCP server (same container family) are presumed unavailable.'

  # Job-wide share of HTTP responses with a 5xx status over 5 minutes.
  - alert: MnemosyneHighErrorRate
    expr: |
      sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m]))
        / sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne HTTP 5xx error rate above 5%'
      description: 'Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes.'

  # Job-wide p95 request latency (middlewares included) above 5 seconds;
  # buckets are summed by `le` before the quantile is taken.
  - alert: MnemosyneSlowResponses
    expr: |
      histogram_quantile(0.95,
        sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m]))
      ) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne p95 response time above 5s'
      description: 'Mnemosyne p95 response latency is {{ $value | printf "%.2f" }}s over the last 5 minutes.'

  # MCP tool-call error surface — owned by mcp_server.metrics on the
  # same /metrics endpoint. This complements MnemosyneDown by catching
  # "app is up but the MCP layer is sick" — e.g. auth token lookups are
  # failing, or Neo4j vector search is 500-ing.
  - alert: MnemosyneMCPToolErrors
    expr: |
      sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m]))
        / sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne MCP tool error rate above 10%'
      description: 'MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service="mnemosyne", component="mcp"}).'

  # Celery queue depth — high pending count usually means the embedding
  # worker is stuck or throttled by the embedding provider. Requires
  # ``celery-prometheus-exporter`` or similar to emit ``celery_queue_length``;
  # if that is not deployed yet, this rule simply never fires.
  - alert: MnemosyneCeleryBacklog
    expr: |
      sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Mnemosyne Celery backlog on {{ $labels.queue }}'
      description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."

# ============================================================================
# Neo4j Alerts (neo4j-apoc-exporter sidecar)
# ============================================================================
# Metrics come from stscoundrel/neo4j-apoc-exporter, which connects to
# Neo4j over Bolt and surfaces apoc.monitor.* gauges plus standard JVM
# metrics. "Exporter down" therefore covers both "exporter container
# crashed" and "exporter cannot reach Bolt" — either way Neo4j is
# effectively unobservable. Hostname-only — purpose of each instance
# is implied by the host (e.g. ariel = LLM memory, umbriel = Mnemosyne).
- name: neo4j_alerts
  rules:
  # The ``neo4j`` scrape target has reported up == 0 for 5 minutes.
  - alert: Neo4jExporterDown
    expr: up{job="neo4j"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Neo4j exporter down on {{ $labels.instance }}'
      description: 'The neo4j-apoc-exporter on {{ $labels.instance }} has been unreachable for more than 5 minutes. Either the sidecar container is down or it cannot connect to Neo4j over Bolt — check `docker ps` and `docker logs neo4j-exporter` on the host.'

  # Rolled-back transactions as a fraction of opened transactions (10m rate
  # windows).
  # NOTE(review): clamp_min(..., 1) floors the denominator at 1 tx/s. That
  # prevents division by zero, but below 1 tx/s the result is the rollback
  # rate per second rather than a true ratio — confirm this is intended for
  # low-traffic instances.
  - alert: Neo4jHighRollbackRate
    expr: |
      rate(neo4j_monitor_tx_rolledBackTx[10m])
        / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx[10m]), 1) > 0.10
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Neo4j transaction rollback rate above 10% on {{ $labels.instance }}'
      description: 'More than 10% of transactions on {{ $labels.instance }} have rolled back over the last 10 minutes — check application logs in Loki ({job="neo4j", hostname="{{ $labels.instance }}"}).'

  # Transactions are currently open while no new transaction was opened in
  # the last 15 minutes.
  # The open-transaction gauge is the left operand of `and` on purpose: `and`
  # keeps the sample value of its left-hand side, and the description prints
  # {{ $value }} as the number of open transactions. With the operands the
  # other way round the value was always 0 (the matched rate).
  - alert: Neo4jStoreGrowthStalled
    expr: |
      neo4j_monitor_tx_currentOpenedTx > 0
        and rate(neo4j_monitor_tx_totalOpenedTx[15m]) == 0
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Neo4j has open transactions but zero throughput on {{ $labels.instance }}'
      description: '{{ $labels.instance }} shows {{ $value }} currently-open transactions but no new transactions opened in 15 minutes — possible Bolt-side hang or stuck query.'

# Red Panda Seal of Approval 🐼
|
|
# "If the metrics aren't red, go back to bed"
|
|
{% endraw %}
|