Files
ouranos/ansible/pplg/puck_services_dashboard.json.j2
Robert Helewka 8c95173705 feat(alloy): add journal relabeling and kottos integration on puck
Introduce structured journal relabel rules on puck to tag Pallas-managed
units with {service, project, component} labels matching the Mnemosyne
and Daedalus schema. Add kottos release variable and vault secrets
example entries for the new Pallas FastAgent runtime.

Remove the defunct mnemosyne syslog listener now that Mnemosyne ships
JSON logs via the docker-socket pipeline.
2026-05-11 13:54:14 -04:00

243 lines
8.2 KiB
Django/Jinja

{
"title": "Puck Services — Logs & Health",
"uid": "puck-services-logs",
"tags": ["puck", "logs", "mnemosyne", "pallas", "daedalus"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"editable": true,
"fiscalYearStartMonth": 0,
"weekStart": "",
"refresh": "30s",
"time": {"from": "now-1h", "to": "now"},
"templating": {
"list": [
{
"name": "loki",
"type": "datasource",
"query": "loki",
"current": {"selected": false, "text": "Loki", "value": "Loki"},
"hide": 0,
"label": "Loki datasource"
},
{
"name": "prom",
"type": "datasource",
"query": "prometheus",
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
"hide": 0,
"label": "Prometheus datasource"
}
]
},
"panels": [
{
"id": 1,
"type": "row",
"title": "Mnemosyne",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
},
{
  "id": 2,
  "type": "timeseries",
  "title": "Mnemosyne — log rate by level",
  "datasource": {"type": "loki", "uid": "${loki}"},
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 1},
  "targets": [
    {
      "refId": "A",
      "expr": "sum by (level) (rate({service=\"mnemosyne\"} | json [5m]))",
      "legendFormat": "{% raw %}{{level}}{% endraw %}"
    }
  ],
  "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 3,
"type": "logs",
"title": "Mnemosyne — errors (last 25)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 1},
"targets": [
{
"refId": "A",
"expr": "{service=\"mnemosyne\"} | json | level=\"ERROR\"",
"maxLines": 25
}
],
"options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
},
{
"id": 4,
"type": "stat",
"title": "Mnemosyne — HTTP 5xx rate",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 8, "x": 0, "y": 9},
"targets": [
{
"refId": "A",
"expr": "sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\",status=~\"5..\"}[5m])) / clamp_min(sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\"}[5m])), 0.0001)"
}
],
"options": {
"reduceOptions": {"calcs": ["lastNotNull"]},
"colorMode": "value",
"textMode": "auto"
},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
"id": 5,
"type": "stat",
"title": "Mnemosyne — p95 latency",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 8, "x": 8, "y": 9},
"targets": [
{
"refId": "A",
"expr": "histogram_quantile(0.95, sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job=\"mnemosyne\"}[5m])))"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 1}, {"color": "red", "value": 5}]}}}
},
{
"id": 6,
"type": "stat",
"title": "Mnemosyne — MCP tool error rate",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 9},
"targets": [
{
"refId": "A",
"expr": "sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\",status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\"}[5m])), 0.0001)"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
},
{
"id": 10,
"type": "row",
"title": "Pallas (Kottos agents)",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 13}
},
{
  "id": 11,
  "type": "timeseries",
  "title": "Pallas — log rate by agent (component)",
  "datasource": {"type": "loki", "uid": "${loki}"},
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
  "targets": [
    {
      "refId": "A",
      "expr": "sum by (component) (rate({service=\"pallas\", project=\"kottos\"} | json [5m]))",
      "legendFormat": "{% raw %}{{component}}{% endraw %}"
    }
  ],
  "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 12,
"type": "logs",
"title": "Pallas — forward trace errors (opaque MCP transport failures)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
"targets": [
{
"refId": "A",
"expr": "{service=\"pallas\", project=\"kottos\"} |= \"pallas.forward.trace\" | json | level=~\"ERROR|WARNING\"",
"maxLines": 25
}
],
"options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
},
{
"id": 13,
"type": "logs",
"title": "Pallas — last 25 ERROR lines (any agent)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 22},
"targets": [
{
"refId": "A",
"expr": "{service=\"pallas\", project=\"kottos\"} | json | level=\"ERROR\"",
"maxLines": 25
}
],
"options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
},
{
"id": 20,
"type": "row",
"title": "Daedalus",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 30}
},
{
  "id": 21,
  "type": "timeseries",
  "title": "Daedalus — log rate by level",
  "datasource": {"type": "loki", "uid": "${loki}"},
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 31},
  "targets": [
    {
      "refId": "A",
      "expr": "sum by (level) (rate({service=\"daedalus\"} | json [5m]))",
      "legendFormat": "{% raw %}{{level}}{% endraw %}"
    }
  ],
  "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 22,
"type": "stat",
"title": "Daedalus — HTTP 5xx rate",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 31},
"targets": [
{
"refId": "A",
"expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
  "id": 23,
  "type": "stat",
  "title": "Daedalus — MCP p95 latency",
  "datasource": {"type": "prometheus", "uid": "${prom}"},
  "gridPos": {"h": 4, "w": 6, "x": 18, "y": 31},
  "targets": [
    {
      "refId": "A",
      "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_mcp_request_duration_seconds_bucket[5m])))"
    }
  ],
  "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
  "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 5}, {"color": "red", "value": 30}]}}}
},
{
"id": 24,
"type": "logs",
"title": "Daedalus — errors (last 25)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 39},
"targets": [
{
"refId": "A",
"expr": "{service=\"daedalus\"} | json | level=\"ERROR\"",
"maxLines": 25
}
],
"options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
}
]
}