feat(alloy): add journal relabeling and kottos integration on puck

Introduce structured journal relabel rules on puck to tag Pallas-managed
units with {service, project, component} labels matching the Mnemosyne
and Daedalus schema. Add kottos release variable and vault secrets
example entries for the new Pallas FastAgent runtime.

Remove the defunct mnemosyne syslog listener now that Mnemosyne ships
JSON logs via the docker-socket pipeline.
This commit is contained in:
2026-05-11 13:54:14 -04:00
parent e92ab80bbf
commit 8c95173705
19 changed files with 1336 additions and 27 deletions

View File

@@ -312,6 +312,78 @@ groups:
summary: "Daedalus S3 error rate above 1%"
description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
# ============================================================================
# Mnemosyne Application Alerts
# ============================================================================
# One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint.
# The Django app container hosts the single prometheus_client registry that
# both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool
# call counters) write to, so "MCP is broken" signals show up as
# ``mcp_tool_invocations_total{status="error"}`` on the same job rather
# than a separate up{} series.
- name: mnemosyne_alerts
rules:
# Liveness of the single ``mnemosyne`` scrape job: fires once the target has
# reported up == 0 continuously for two minutes.
- alert: MnemosyneDown
  for: 2m
  expr: 'up{job="mnemosyne"} == 0'
  labels:
    severity: critical
  annotations:
    summary: Mnemosyne is down
    description: >-
      The Mnemosyne /metrics endpoint has been unreachable for more than
      2 minutes. Both the Django app and the MCP server (same container
      family) are presumed unavailable.
# HTTP error ratio for the ``mnemosyne`` job: per-second rate of responses
# whose status matches 5.. divided by the rate of all responses, both taken
# over a 5-minute window. Warns when the ratio stays above 0.05 (5%).
- alert: MnemosyneHighErrorRate
expr: |
sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m]))
/ sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05
# The condition must hold for 5 minutes before the alert fires.
for: 5m
labels:
severity: warning
annotations:
summary: "Mnemosyne HTTP 5xx error rate above 5%"
description: "Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes."
# Latency check for the ``mnemosyne`` job: 95th percentile of the
# request-latency histogram (the "including middlewares" variant), computed
# from 5-minute bucket rates summed by ``le`` so that all series collapse
# into one quantile. Warns when that p95 stays above 5 seconds.
- alert: MnemosyneSlowResponses
expr: |
histogram_quantile(0.95,
sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m]))
) > 5
# The condition must hold for 5 minutes before the alert fires.
for: 5m
labels:
severity: warning
annotations:
summary: "Mnemosyne p95 response time above 5s"
description: "Mnemosyne p95 response latency is {{ $value | printf \"%.2f\" }}s over the last 5 minutes."
# MCP tool-call error surface — owned by mcp_server.metrics on the
# same /metrics endpoint. This complements MnemosyneDown by catching
# "app is up but the MCP layer is sick" — e.g. auth token lookups are
# failing, or Neo4j vector search is 500-ing.
#
# The expression is the share of tool invocations labelled status="error"
# among all invocations on the ``mnemosyne`` job, using 5-minute rates; it
# warns when that share stays above 0.10 (10%) for 5 minutes.
- alert: MnemosyneMCPToolErrors
expr: |
sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m]))
/ sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10
for: 5m
labels:
severity: warning
annotations:
summary: "Mnemosyne MCP tool error rate above 10%"
description: "MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service=\"mnemosyne\", component=\"mcp\"})."
# Celery queue depth — high pending count usually means the embedding
# worker is stuck or throttled by the embedding provider. Requires
# ``celery-prometheus-exporter`` or similar to emit ``celery_queue_length``;
# if that is not deployed yet, this rule simply never fires.
#
# One alert instance is produced per queue (``sum by (queue)``) for the
# queues named embedding, batch and celery, once a queue holds more than
# 100 pending tasks for 10 minutes.
# NOTE(review): the selector carries no job matcher, so a queue with one of
# these names exported by any other scrape job would also match — confirm
# that only Mnemosyne exports this metric.
- alert: MnemosyneCeleryBacklog
expr: |
sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100
for: 10m
labels:
severity: warning
annotations:
summary: "Mnemosyne Celery backlog on {{ $labels.queue }}"
description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
# Red Panda Seal of Approval 🐼
# "If the metrics aren't red, go back to bed"
{% endraw %}

View File

@@ -0,0 +1,23 @@
# Grafana dashboard file provider
# Deployed to: /etc/grafana/provisioning/dashboards/puck.yaml
#
# Grafana polls the ``path`` every ``updateIntervalSeconds`` and re-imports
# any JSON file it finds. Each dashboard JSON lives in that directory and
# is owned by Ansible — operators should not edit dashboards through the
# Grafana UI (changes won't survive a deploy; export the final JSON and
# land it in this role).
apiVersion: 1

providers:
  # Single file-backed provider; every dashboard JSON found under
  # options.path is imported into the "Puck Services" folder.
  - name: "puck"
    type: file
    orgId: 1
    folder: "Puck Services"
    folderUid: puck-services
    options:
      path: /var/lib/grafana/dashboards
      foldersFromFilesStructure: false
    # Directory rescan interval, in seconds.
    updateIntervalSeconds: 30
    # UI saves stay disabled: the JSON files on disk are the source of truth.
    allowUiUpdates: false
    editable: true
    disableDeletion: false

View File

@@ -208,6 +208,32 @@
group: grafana
mode: '750'
# Directory that receives the dashboard provider definition templated by the
# following task.
- name: Ensure Grafana dashboard provisioning directory exists
  ansible.builtin.file:
    state: directory
    path: "/etc/grafana/provisioning/dashboards"
    mode: "750"
    owner: grafana
    group: grafana
# Install the file-provider definition; Grafana is restarted through the
# handler whenever the rendered file changes.
- name: Template Grafana dashboard provider (file source → /var/lib/grafana/dashboards)
  notify: restart grafana
  ansible.builtin.template:
    dest: /etc/grafana/provisioning/dashboards/puck.yaml
    src: dashboards_provider.yml.j2
    mode: "640"
    owner: grafana
    group: grafana
# Render the dashboard JSON into the directory watched by the file provider.
#
# The dashboard contains Grafana legend placeholders written with double
# curly braces (legendFormat "{{level}}" and "{{component}}"). With Jinja2's
# default delimiters Ansible would treat those as playbook variables and the
# task would abort on an undefined variable. The variable delimiters are
# therefore remapped to << >> for this one template, so the Grafana
# placeholders are written out verbatim. Any genuine Ansible substitution
# added to the dashboard template later must use << var >> as well.
- name: Template Puck Services dashboard (Mnemosyne + Pallas + Daedalus)
  ansible.builtin.template:
    src: "puck_services_dashboard.json.j2"
    dest: "/var/lib/grafana/dashboards/puck_services.json"
    owner: grafana
    group: grafana
    mode: '640'
    variable_start_string: '<<'
    variable_end_string: '>>'
  notify: restart grafana
- name: Template Grafana main configuration
ansible.builtin.template:
src: "grafana.ini.j2"

View File

@@ -47,7 +47,18 @@ scrape_configs:
# Daedalus — scraped on /metrics every 15 seconds.
# NOTE(review): two target entries are listed below, one hard-coded and one
# built from the daedalus_metrics_host / daedalus_metrics_port variables —
# confirm whether the hard-coded entry is still wanted alongside the
# templated one.
- job_name: 'daedalus'
static_configs:
- targets: ['puck.incus:22181']
- targets: ['{{ daedalus_metrics_host }}:{{ daedalus_metrics_port }}']
metrics_path: '/metrics'
scrape_interval: 15s
# Mnemosyne — a single /metrics endpoint on the app container serves both
# the django-prometheus HTTP/Celery metrics and the MCP server's tool-call
# counters (the mcp_server.metrics module registers into the same
# prometheus_client process registry on the Django side). The mcp
# container itself does not expose /metrics, so scrape the app (WSGI) side.
- job_name: "mnemosyne"
  scrape_interval: 15s  # same cadence as the daedalus job
  metrics_path: "/metrics"
  static_configs:
    - targets:
        - "{{ mnemosyne_metrics_host }}:{{ mnemosyne_metrics_port }}"

View File

@@ -0,0 +1,242 @@
{
  "uid": "puck-services-logs",
  "title": "Puck Services — Logs & Health",
  "version": 1,
  "schemaVersion": 39,
  "editable": true,
  "tags": [
    "puck",
    "logs",
    "mnemosyne",
    "pallas",
    "daedalus"
  ],
  "timezone": "browser",
  "weekStart": "",
  "fiscalYearStartMonth": 0,
  "refresh": "30s",
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "templating": {
    "list": [
      {
        "type": "datasource",
        "name": "loki",
        "label": "Loki datasource",
        "query": "loki",
        "hide": 0,
        "current": {
          "text": "Loki",
          "value": "Loki",
          "selected": false
        }
      },
      {
        "type": "datasource",
        "name": "prom",
        "label": "Prometheus datasource",
        "query": "prometheus",
        "hide": 0,
        "current": {
          "text": "Prometheus",
          "value": "Prometheus",
          "selected": false
        }
      }
    ]
  },
"panels": [
{
  "type": "row",
  "id": 1,
  "title": "Mnemosyne",
  "gridPos": {
    "x": 0,
    "y": 0,
    "w": 24,
    "h": 1
  },
  "collapsed": false
},
{
  "type": "timeseries",
  "id": 2,
  "title": "Mnemosyne — log rate by level",
  "gridPos": {
    "x": 0,
    "y": 1,
    "w": 12,
    "h": 8
  },
  "datasource": {
    "uid": "${loki}",
    "type": "loki"
  },
  "options": {
    "legend": {
      "showLegend": true,
      "placement": "bottom",
      "displayMode": "list"
    }
  },
  "targets": [
    {
      "expr": "sum by (level) (rate({service=\"mnemosyne\"} | json [5m]))",
      "legendFormat": "{{level}}",
      "refId": "A"
    }
  ]
},
{
  "type": "logs",
  "id": 3,
  "title": "Mnemosyne — errors (last 25)",
  "gridPos": {
    "x": 12,
    "y": 1,
    "w": 12,
    "h": 8
  },
  "datasource": {
    "uid": "${loki}",
    "type": "loki"
  },
  "options": {
    "showTime": true,
    "showLabels": false,
    "wrapLogMessage": true
  },
  "targets": [
    {
      "expr": "{service=\"mnemosyne\"} | json | level=\"ERROR\"",
      "maxLines": 25,
      "refId": "A"
    }
  ]
},
{
  "type": "stat",
  "id": 4,
  "title": "Mnemosyne — HTTP 5xx rate",
  "gridPos": {
    "x": 0,
    "y": 9,
    "w": 8,
    "h": 4
  },
  "datasource": {
    "uid": "${prom}",
    "type": "prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "unit": "percentunit",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green"},
          {"color": "orange", "value": 0.01},
          {"color": "red", "value": 0.05}
        ]
      }
    }
  },
  "options": {
    "textMode": "auto",
    "colorMode": "value",
    "reduceOptions": {
      "calcs": ["lastNotNull"]
    }
  },
  "targets": [
    {
      "expr": "sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\",status=~\"5..\"}[5m])) / clamp_min(sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\"}[5m])), 0.0001)",
      "refId": "A"
    }
  ]
},
{
  "type": "stat",
  "id": 5,
  "title": "Mnemosyne — p95 latency",
  "gridPos": {
    "x": 8,
    "y": 9,
    "w": 8,
    "h": 4
  },
  "datasource": {
    "uid": "${prom}",
    "type": "prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "unit": "s",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green"},
          {"color": "orange", "value": 1},
          {"color": "red", "value": 5}
        ]
      }
    }
  },
  "options": {
    "reduceOptions": {
      "calcs": ["lastNotNull"]
    }
  },
  "targets": [
    {
      "expr": "histogram_quantile(0.95, sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job=\"mnemosyne\"}[5m])))",
      "refId": "A"
    }
  ]
},
{
  "type": "stat",
  "id": 6,
  "title": "Mnemosyne — MCP tool error rate",
  "gridPos": {
    "x": 16,
    "y": 9,
    "w": 8,
    "h": 4
  },
  "datasource": {
    "uid": "${prom}",
    "type": "prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "unit": "percentunit",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green"},
          {"color": "orange", "value": 0.05},
          {"color": "red", "value": 0.10}
        ]
      }
    }
  },
  "options": {
    "reduceOptions": {
      "calcs": ["lastNotNull"]
    }
  },
  "targets": [
    {
      "expr": "sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\",status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\"}[5m])), 0.0001)",
      "refId": "A"
    }
  ]
},
{
  "type": "row",
  "id": 10,
  "title": "Pallas (Kottos agents)",
  "gridPos": {
    "x": 0,
    "y": 13,
    "w": 24,
    "h": 1
  },
  "collapsed": false
},
{
  "type": "timeseries",
  "id": 11,
  "title": "Pallas — log rate by agent (component)",
  "gridPos": {
    "x": 0,
    "y": 14,
    "w": 12,
    "h": 8
  },
  "datasource": {
    "uid": "${loki}",
    "type": "loki"
  },
  "options": {
    "legend": {
      "showLegend": true,
      "placement": "bottom",
      "displayMode": "list"
    }
  },
  "targets": [
    {
      "expr": "sum by (component) (rate({service=\"pallas\", project=\"kottos\"} | json [5m]))",
      "legendFormat": "{{component}}",
      "refId": "A"
    }
  ]
},
{
  "type": "logs",
  "id": 12,
  "title": "Pallas — forward trace errors (opaque MCP transport failures)",
  "gridPos": {
    "x": 12,
    "y": 14,
    "w": 12,
    "h": 8
  },
  "datasource": {
    "uid": "${loki}",
    "type": "loki"
  },
  "options": {
    "showTime": true,
    "showLabels": false,
    "wrapLogMessage": true
  },
  "targets": [
    {
      "expr": "{service=\"pallas\", project=\"kottos\"} |= \"pallas.forward.trace\" | json | level=~\"ERROR|WARNING\"",
      "maxLines": 25,
      "refId": "A"
    }
  ]
},
{
  "type": "logs",
  "id": 13,
  "title": "Pallas — last 25 ERROR lines (any agent)",
  "gridPos": {
    "x": 0,
    "y": 22,
    "w": 24,
    "h": 8
  },
  "datasource": {
    "uid": "${loki}",
    "type": "loki"
  },
  "options": {
    "showTime": true,
    "showLabels": true,
    "wrapLogMessage": true
  },
  "targets": [
    {
      "expr": "{service=\"pallas\", project=\"kottos\"} | json | level=\"ERROR\"",
      "maxLines": 25,
      "refId": "A"
    }
  ]
},
{
  "type": "row",
  "id": 20,
  "title": "Daedalus",
  "gridPos": {
    "x": 0,
    "y": 30,
    "w": 24,
    "h": 1
  },
  "collapsed": false
},
{
  "type": "timeseries",
  "id": 21,
  "title": "Daedalus — log rate by level",
  "gridPos": {
    "x": 0,
    "y": 31,
    "w": 12,
    "h": 8
  },
  "datasource": {
    "uid": "${loki}",
    "type": "loki"
  },
  "options": {
    "legend": {
      "showLegend": true,
      "placement": "bottom",
      "displayMode": "list"
    }
  },
  "targets": [
    {
      "expr": "sum by (level) (rate({service=\"daedalus\"} | json [5m]))",
      "legendFormat": "{{level}}",
      "refId": "A"
    }
  ]
},
{
  "type": "stat",
  "id": 22,
  "title": "Daedalus — HTTP 5xx rate",
  "gridPos": {
    "x": 12,
    "y": 31,
    "w": 6,
    "h": 4
  },
  "datasource": {
    "uid": "${prom}",
    "type": "prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "unit": "percentunit",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green"},
          {"color": "orange", "value": 0.01},
          {"color": "red", "value": 0.05}
        ]
      }
    }
  },
  "options": {
    "reduceOptions": {
      "calcs": ["lastNotNull"]
    }
  },
  "targets": [
    {
      "expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)",
      "refId": "A"
    }
  ]
},
{
  "id": 23,
  "type": "stat",
  "title": "Daedalus — MCP p95 latency",
  "datasource": {"type": "prometheus", "uid": "${prom}"},
  "gridPos": {"h": 4, "w": 6, "x": 18, "y": 31},
  "targets": [
    {
      "refId": "A",
      "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_mcp_request_duration_seconds_bucket[5m])))"
    }
  ],
  "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
  "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 5}, {"color": "red", "value": 30}]}}}
},
{
  "type": "logs",
  "id": 24,
  "title": "Daedalus — errors (last 25)",
  "gridPos": {
    "x": 0,
    "y": 39,
    "w": 24,
    "h": 8
  },
  "datasource": {
    "uid": "${loki}",
    "type": "loki"
  },
  "options": {
    "showTime": true,
    "showLabels": false,
    "wrapLogMessage": true
  },
  "targets": [
    {
      "expr": "{service=\"daedalus\"} | json | level=\"ERROR\"",
      "maxLines": 25,
      "refId": "A"
    }
  ]
}
]
}