feat(alloy): add journal relabeling and kottos integration on puck

Introduce structured journal relabel rules on puck to tag Pallas-managed units with {service, project, component} labels matching the Mnemosyne and Daedalus schema. Add kottos release variable and vault secrets example entries for the new Pallas FastAgent runtime. Remove the defunct mnemosyne syslog listener now that Mnemosyne ships JSON logs via the docker-socket pipeline.
2026-05-11 13:54:14 -04:00
parent e92ab80bbf
commit 8c95173705
19 changed files with 1336 additions and 27 deletions
--- a/ansible/pplg/alert_rules.yml.j2
+++ b/ansible/pplg/alert_rules.yml.j2
@@ -312,6 +312,78 @@ groups:
          summary: "Daedalus S3 error rate above 1%"
          description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."

+  # ============================================================================
+  # Mnemosyne Application Alerts
+  # ============================================================================
+  # One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint.
+  # The Django app container hosts the single prometheus_client registry that
+  # both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool
+  # call counters) write to, so "MCP is broken" signals show up as
+  # ``mcp_tool_invocations_total{status="error"}`` on the same job rather
+  # than a separate up{} series.
+  - name: mnemosyne_alerts
+    rules:
+      # Critical: the mnemosyne scrape target has reported up == 0 for 2 minutes.
+      - alert: MnemosyneDown
+        expr: 'up{job="mnemosyne"} == 0'
+        for: 2m
+        labels: {severity: critical}
+        annotations:
+          summary: 'Mnemosyne is down'
+          description: 'The Mnemosyne /metrics endpoint has been unreachable for more than 2 minutes.  Both the Django app and the MCP server (same container family) are presumed unavailable.'
+
+      # Warning: more than 5% of Mnemosyne HTTP responses are 5xx, sustained for 5 minutes.
+      - alert: MnemosyneHighErrorRate
+        expr: |
+          sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m]))
+            / sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05
+        for: 5m
+        labels: {severity: warning}
+        annotations:
+          summary: 'Mnemosyne HTTP 5xx error rate above 5%'
+          description: 'Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes.'
+
+      # Warning: p95 request latency (middlewares included) above 5s for 5 minutes.
+      - alert: MnemosyneSlowResponses
+        expr: |
+          histogram_quantile(0.95,
+            sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m]))
+          ) > 5
+        for: 5m
+        labels: {severity: warning}
+        annotations:
+          summary: 'Mnemosyne p95 response time above 5s'
+          description: 'Mnemosyne p95 response latency is {{ $value | printf "%.2f" }}s over the last 5 minutes.'
+
+      # MCP tool-call error ratio, from the counters mcp_server.metrics
+      # publishes on the same /metrics endpoint.  Complements MnemosyneDown:
+      # it catches the "app is up but the MCP layer is sick" case — e.g. auth
+      # token lookups failing, or Neo4j vector search returning 500s.
+      # Warning: more than 10% of invocations erroring for 5 minutes.
+      - alert: MnemosyneMCPToolErrors
+        expr: |
+          sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m]))
+            / sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10
+        for: 5m
+        labels: {severity: warning}
+        annotations:
+          summary: 'Mnemosyne MCP tool error rate above 10%'
+          description: 'MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service="mnemosyne", component="mcp"}).'
+
+      # Celery queue depth — a sustained backlog usually means the embedding
+      # worker is stuck or throttled by the embedding provider.  Depends on
+      # ``celery-prometheus-exporter`` (or similar) emitting ``celery_queue_length``;
+      # until that is deployed the rule simply never fires.
+      # NOTE(review): no job matcher here — confirm no other service reuses these queue names.
+      - alert: MnemosyneCeleryBacklog
+        expr: |
+          sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100
+        for: 10m
+        labels: {severity: warning}
+        annotations:
+          summary: 'Mnemosyne Celery backlog on {{ $labels.queue }}'
+          description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
+
 # Red Panda Seal of Approval 🐼
 # "If the metrics aren't red, go back to bed"
 {% endraw %}
--- a/ansible/pplg/dashboards_provider.yml.j2
+++ b/ansible/pplg/dashboards_provider.yml.j2
@@ -0,0 +1,23 @@
+# Grafana dashboard provisioning: file-based provider.
+# Deployed to: /etc/grafana/provisioning/dashboards/puck.yaml
+#
+# Every ``updateIntervalSeconds`` Grafana rescans ``path`` and re-imports the
+# JSON dashboards found there.  Ansible owns those files, so dashboards must
+# not be edited through the Grafana UI: such changes would not survive a
+# deploy.  To change a dashboard, export its final JSON and commit it to
+# this role instead.
+apiVersion: 1
+
+providers:
+  - name: puck
+    type: file
+    orgId: 1
+    folder: Puck Services
+    folderUid: puck-services
+    editable: true
+    allowUiUpdates: false
+    disableDeletion: false
+    updateIntervalSeconds: 30
+    options:
+      foldersFromFilesStructure: false
+      path: /var/lib/grafana/dashboards
--- a/ansible/pplg/deploy.yml
+++ b/ansible/pplg/deploy.yml
@@ -208,6 +208,32 @@
      group: grafana
      mode: '750'

+  - name: Ensure Grafana dashboard provisioning directory exists
+    ansible.builtin.file:
+      state: directory
+      path: /etc/grafana/provisioning/dashboards
+      mode: '750'
+      owner: grafana
+      group: grafana
+
+  - name: Template Grafana dashboard provider (file source → /var/lib/grafana/dashboards)
+    ansible.builtin.template:
+      src: dashboards_provider.yml.j2
+      dest: /etc/grafana/provisioning/dashboards/puck.yaml
+      mode: '640'
+      owner: grafana
+      group: grafana
+    notify: restart grafana
+
+  - name: Deploy Puck Services dashboard (Mnemosyne + Pallas + Daedalus)
+    ansible.builtin.copy:  # copy, not template: Jinja would try to expand the dashboard's Grafana {{level}}/{{component}} legends
+      src: "puck_services_dashboard.json.j2"
+      dest: "/var/lib/grafana/dashboards/puck_services.json"  # NOTE(review): copy does not create this directory — confirm an earlier task does
+      owner: grafana
+      group: grafana
+      mode: '640'
+    notify: restart grafana
+
  - name: Template Grafana main configuration
    ansible.builtin.template:
      src: "grafana.ini.j2"
--- a/ansible/pplg/prometheus.yml.j2
+++ b/ansible/pplg/prometheus.yml.j2
@@ -47,7 +47,18 @@ scrape_configs:

  - job_name: 'daedalus'
    static_configs:
-      - targets: ['puck.incus:22181']
+      - targets: ['{{ daedalus_metrics_host }}:{{ daedalus_metrics_port }}']
+    metrics_path: '/metrics'
+    scrape_interval: 15s
+
+  # Mnemosyne — one /metrics endpoint on the app container serves both the
+  # django-prometheus HTTP/Celery metrics and the MCP server's tool-call
+  # counters (mcp_server.metrics registers into the same prometheus_client
+  # process registry on the Django side).  The mcp container itself does not
+  # expose /metrics, so this app-side (WSGI) target is the only one to scrape.
+  - job_name: 'mnemosyne'
+    static_configs:
+      - targets: ['{{ mnemosyne_metrics_host }}:{{ mnemosyne_metrics_port }}']
    metrics_path: '/metrics'
    scrape_interval: 15s

--- a/ansible/pplg/puck_services_dashboard.json.j2
+++ b/ansible/pplg/puck_services_dashboard.json.j2
@@ -0,0 +1,242 @@
+{
+  "title": "Puck Services — Logs & Health",
+  "uid": "puck-services-logs",
+  "tags": ["puck", "logs", "mnemosyne", "pallas", "daedalus"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "version": 1,
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "weekStart": "",
+  "refresh": "30s",
+  "time": {"from": "now-1h", "to": "now"},
+  "templating": {
+    "list": [
+      {
+        "name": "loki",
+        "type": "datasource",
+        "query": "loki",
+        "current": {"selected": false, "text": "Loki", "value": "Loki"},
+        "hide": 0,
+        "label": "Loki datasource"
+      },
+      {
+        "name": "prom",
+        "type": "datasource",
+        "query": "prometheus",
+        "current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
+        "hide": 0,
+        "label": "Prometheus datasource"
+      }
+    ]
+  },
+  "panels": [
+    {
+      "id": 1,
+      "type": "row",
+      "title": "Mnemosyne",
+      "collapsed": false,
+      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
+    },
+    {
+      "id": 2,
+      "type": "timeseries",
+      "title": "Mnemosyne — log rate by level",
+      "datasource": {"type": "loki", "uid": "${loki}"},
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 1},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum by (level) (rate({service=\"mnemosyne\"} | json [5m]))",
+          "legendFormat": "{{level}}"
+        }
+      ],
+      "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
+    },
+    {
+      "id": 3,
+      "type": "logs",
+      "title": "Mnemosyne — errors (last 25)",
+      "datasource": {"type": "loki", "uid": "${loki}"},
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 1},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "{service=\"mnemosyne\"} | json | level=\"ERROR\"",
+          "maxLines": 25
+        }
+      ],
+      "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Mnemosyne — HTTP 5xx rate",
+      "datasource": {"type": "prometheus", "uid": "${prom}"},
+      "gridPos": {"h": 4, "w": 8, "x": 0, "y": 9},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\",status=~\"5..\"}[5m])) / clamp_min(sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\"}[5m])), 0.0001)"
+        }
+      ],
+      "options": {
+        "reduceOptions": {"calcs": ["lastNotNull"]},
+        "colorMode": "value",
+        "textMode": "auto"
+      },
+      "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
+    },
+    {
+      "id": 5,
+      "type": "stat",
+      "title": "Mnemosyne — p95 latency",
+      "datasource": {"type": "prometheus", "uid": "${prom}"},
+      "gridPos": {"h": 4, "w": 8, "x": 8, "y": 9},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job=\"mnemosyne\"}[5m])))"
+        }
+      ],
+      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
+      "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 1}, {"color": "red", "value": 5}]}}}
+    },
+    {
+      "id": 6,
+      "type": "stat",
+      "title": "Mnemosyne — MCP tool error rate",
+      "datasource": {"type": "prometheus", "uid": "${prom}"},
+      "gridPos": {"h": 4, "w": 8, "x": 16, "y": 9},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\",status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\"}[5m])), 0.0001)"
+        }
+      ],
+      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
+      "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
+    },
+
+    {
+      "id": 10,
+      "type": "row",
+      "title": "Pallas (Kottos agents)",
+      "collapsed": false,
+      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 13}
+    },
+    {
+      "id": 11,
+      "type": "timeseries",
+      "title": "Pallas — log rate by agent (component)",
+      "datasource": {"type": "loki", "uid": "${loki}"},
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum by (component) (rate({service=\"pallas\", project=\"kottos\"} | json [5m]))",
+          "legendFormat": "{{component}}"
+        }
+      ],
+      "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
+    },
+    {
+      "id": 12,
+      "type": "logs",
+      "title": "Pallas — forward trace errors (opaque MCP transport failures)",
+      "datasource": {"type": "loki", "uid": "${loki}"},
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "{service=\"pallas\", project=\"kottos\"} |= \"pallas.forward.trace\" | json | level=~\"ERROR|WARNING\"",
+          "maxLines": 25
+        }
+      ],
+      "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
+    },
+    {
+      "id": 13,
+      "type": "logs",
+      "title": "Pallas — last 25 ERROR lines (any agent)",
+      "datasource": {"type": "loki", "uid": "${loki}"},
+      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 22},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "{service=\"pallas\", project=\"kottos\"} | json | level=\"ERROR\"",
+          "maxLines": 25
+        }
+      ],
+      "options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
+    },
+
+    {
+      "id": 20,
+      "type": "row",
+      "title": "Daedalus",
+      "collapsed": false,
+      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 30}
+    },
+    {
+      "id": 21,
+      "type": "timeseries",
+      "title": "Daedalus — log rate by level",
+      "datasource": {"type": "loki", "uid": "${loki}"},
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 31},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum by (level) (rate({service=\"daedalus\"} | json [5m]))",
+          "legendFormat": "{{level}}"
+        }
+      ],
+      "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
+    },
+    {
+      "id": 22,
+      "type": "stat",
+      "title": "Daedalus — HTTP 5xx rate",
+      "datasource": {"type": "prometheus", "uid": "${prom}"},
+      "gridPos": {"h": 4, "w": 6, "x": 12, "y": 31},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)"
+        }
+      ],
+      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
+      "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
+    },
+    {
+      "id": 23,
+      "type": "stat",
+      "title": "Daedalus — MCP p95 latency",
+      "datasource": {"type": "prometheus", "uid": "${prom}"},
+      "gridPos": {"h": 4, "w": 6, "x": 18, "y": 31},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_mcp_request_duration_seconds_bucket[5m])))"
+        }
+      ],
+      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
+      "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 5}, {"color": "red", "value": 30}]}}}
+    },
+    {
+      "id": 24,
+      "type": "logs",
+      "title": "Daedalus — errors (last 25)",
+      "datasource": {"type": "loki", "uid": "${loki}"},
+      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 39},
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "{service=\"daedalus\"} | json | level=\"ERROR\"",
+          "maxLines": 25
+        }
+      ],
+      "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
+    }
+  ]
+}