feat(alloy): add journal relabeling and kottos integration on puck

Introduce structured journal relabel rules on puck to tag Pallas-managed units with {service, project, component} labels matching the Mnemosyne and Daedalus schema. Add kottos release variable and vault secrets example entries for the new Pallas FastAgent runtime. Remove the defunct mnemosyne syslog listener now that Mnemosyne ships JSON logs via the docker-socket pipeline.
2026-05-11 13:54:14 -04:00
parent e92ab80bbf
commit 8c95173705
19 changed files with 1336 additions and 27 deletions
--- a/ansible/pplg/alert_rules.yml.j2
+++ b/ansible/pplg/alert_rules.yml.j2
@@ -312,6 +312,78 @@ groups:
          summary: "Daedalus S3 error rate above 1%"
          description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."

+  # ============================================================================
+  # Mnemosyne Application Alerts
+  # ============================================================================
+  # One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint.
+  # The Django app container hosts the single prometheus_client registry that
+  # both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool
+  # call counters) write to, so "MCP is broken" signals show up as
+  # ``mcp_tool_invocations_total{status="error"}`` on the same job rather
+  # than a separate up{} series.
+  - name: mnemosyne_alerts
+    rules:
+      # Availability check: fires when the scrape target of the ``mnemosyne``
+      # job has reported up == 0 continuously for 2 minutes.
+      # NOTE(review): ``up == 0`` yields no result when the series is absent
+      # altogether (e.g. the job is dropped from the scrape config) — confirm
+      # whether an ``absent(up{job="mnemosyne"})`` companion rule is wanted.
+      - alert: MnemosyneDown
+        expr: up{job="mnemosyne"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Mnemosyne is down"
+          description: "The Mnemosyne /metrics endpoint has been unreachable for more than 2 minutes.  Both the Django app and the MCP server (same container family) are presumed unavailable."
+
+      # HTTP error ratio: per-second rate of 5xx responses divided by the rate
+      # of all responses, both over a 5-minute window and restricted to the
+      # ``mnemosyne`` job.  Fires once the ratio has exceeded 5% for 5 minutes.
+      # NOTE(review): there is no minimum-traffic guard, so a handful of failed
+      # requests during a quiet period can push the ratio over the threshold —
+      # confirm this is acceptable.
+      - alert: MnemosyneHighErrorRate
+        expr: |
+          sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m]))
+            / sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Mnemosyne HTTP 5xx error rate above 5%"
+          description: "Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes."
+
+      # Latency check: p95 of the request-latency histogram (middlewares
+      # included), with bucket rates summed across the job by ``le``.
+      # Fires once the quantile has stayed above 5 seconds for 5 minutes.
+      - alert: MnemosyneSlowResponses
+        expr: |
+          histogram_quantile(0.95,
+            sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m]))
+          ) > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          # Single-quoted so the Go-template printf format needs no YAML escapes.
+          summary: 'Mnemosyne p95 response time above 5s'
+          description: 'Mnemosyne p95 response latency is {{ $value | printf "%.2f" }}s over the last 5 minutes.'
+
+      # MCP health check.  The tool-call counters are owned by
+      # mcp_server.metrics and exposed on the same /metrics endpoint, so this
+      # rule covers the "app is up but the MCP layer is sick" case that
+      # MnemosyneDown cannot see — e.g. auth token lookups are failing, or
+      # Neo4j vector search is 500-ing.
+      - alert: MnemosyneMCPToolErrors
+        expr: |
+          sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m]))
+            / sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          # Single-quoted so the Loki selector hint needs no YAML escapes.
+          summary: 'Mnemosyne MCP tool error rate above 10%'
+          description: 'MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service="mnemosyne", component="mcp"}).'
+
+      # Celery queue depth — a high pending count usually means the embedding
+      # worker is stuck or throttled by the embedding provider.  Requires
+      # ``celery-prometheus-exporter`` or similar to emit ``celery_queue_length``;
+      # if that is not deployed yet, this rule simply never fires.
+      # Each queue is evaluated separately (sum by (queue)); the rule fires
+      # when a queue's length has stayed above 100 for 10 minutes.
+      # NOTE(review): unlike the other rules in this group, the selector has no
+      # job="mnemosyne" matcher, so any target exporting celery_queue_length
+      # with these queue names is counted — confirm that is intended.
+      # NOTE(review): some exporters label the queue as ``queue_name`` rather
+      # than ``queue`` — verify against the exporter that gets deployed.
+      - alert: MnemosyneCeleryBacklog
+        expr: |
+          sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Mnemosyne Celery backlog on {{ $labels.queue }}"
+          description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
+
 # Red Panda Seal of Approval 🐼
 # "If the metrics aren't red, go back to bed"
 {% endraw %}