feat(alloy): add journal relabeling and kottos integration on puck
Introduce structured journal relabel rules on puck to tag Pallas-managed
units with {service, project, component} labels matching the Mnemosyne
and Daedalus schema. Add kottos release variable and vault secrets
example entries for the new Pallas FastAgent runtime.
Remove the defunct mnemosyne syslog listener now that Mnemosyne ships
JSON logs via the docker-socket pipeline.
This commit is contained in:
242
ansible/pplg/puck_services_dashboard.json.j2
Normal file
242
ansible/pplg/puck_services_dashboard.json.j2
Normal file
@@ -0,0 +1,242 @@
|
||||
{
|
||||
"title": "Puck Services — Logs & Health",
|
||||
"uid": "puck-services-logs",
|
||||
"tags": ["puck", "logs", "mnemosyne", "pallas", "daedalus"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"weekStart": "",
|
||||
"refresh": "30s",
|
||||
"time": {"from": "now-1h", "to": "now"},
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "loki",
|
||||
"type": "datasource",
|
||||
"query": "loki",
|
||||
"current": {"selected": false, "text": "Loki", "value": "Loki"},
|
||||
"hide": 0,
|
||||
"label": "Loki datasource"
|
||||
},
|
||||
{
|
||||
"name": "prom",
|
||||
"type": "datasource",
|
||||
"query": "prometheus",
|
||||
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
|
||||
"hide": 0,
|
||||
"label": "Prometheus datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "row",
|
||||
"title": "Mnemosyne",
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
|
||||
},
|
||||
{
  "id": 2,
  "type": "timeseries",
  "title": "Mnemosyne — log rate by level",
  "datasource": {"type": "loki", "uid": "${loki}"},
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 1},
  "targets": [
    {
      "refId": "A",
      "expr": "sum by (level) (rate({service=\"mnemosyne\"} | json [5m]))",
      "legendFormat": "{% raw %}{{level}}{% endraw %}"
    }
  ],
  "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "logs",
|
||||
"title": "Mnemosyne — errors (last 25)",
|
||||
"datasource": {"type": "loki", "uid": "${loki}"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 1},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "{service=\"mnemosyne\"} | json | level=\"ERROR\"",
|
||||
"maxLines": 25
|
||||
}
|
||||
],
|
||||
"options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Mnemosyne — HTTP 5xx rate",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 8, "x": 0, "y": 9},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\",status=~\"5..\"}[5m])) / clamp_min(sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\"}[5m])), 0.0001)"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"reduceOptions": {"calcs": ["lastNotNull"]},
|
||||
"colorMode": "value",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Mnemosyne — p95 latency",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 8, "x": 8, "y": 9},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "histogram_quantile(0.95, sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job=\"mnemosyne\"}[5m])))"
|
||||
}
|
||||
],
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
|
||||
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 1}, {"color": "red", "value": 5}]}}}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "Mnemosyne — MCP tool error rate",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 9},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\",status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\"}[5m])), 0.0001)"
|
||||
}
|
||||
],
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
|
||||
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "row",
|
||||
"title": "Pallas (Kottos agents)",
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 13}
|
||||
},
|
||||
{
  "id": 11,
  "type": "timeseries",
  "title": "Pallas — log rate by agent (component)",
  "datasource": {"type": "loki", "uid": "${loki}"},
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
  "targets": [
    {
      "refId": "A",
      "expr": "sum by (component) (rate({service=\"pallas\", project=\"kottos\"} | json [5m]))",
      "legendFormat": "{% raw %}{{component}}{% endraw %}"
    }
  ],
  "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "logs",
|
||||
"title": "Pallas — forward trace errors (opaque MCP transport failures)",
|
||||
"datasource": {"type": "loki", "uid": "${loki}"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "{service=\"pallas\", project=\"kottos\"} |= \"pallas.forward.trace\" | json | level=~\"ERROR|WARNING\"",
|
||||
"maxLines": 25
|
||||
}
|
||||
],
|
||||
"options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "logs",
|
||||
"title": "Pallas — last 25 ERROR lines (any agent)",
|
||||
"datasource": {"type": "loki", "uid": "${loki}"},
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 22},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "{service=\"pallas\", project=\"kottos\"} | json | level=\"ERROR\"",
|
||||
"maxLines": 25
|
||||
}
|
||||
],
|
||||
"options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20,
|
||||
"type": "row",
|
||||
"title": "Daedalus",
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 30}
|
||||
},
|
||||
{
  "id": 21,
  "type": "timeseries",
  "title": "Daedalus — log rate by level",
  "datasource": {"type": "loki", "uid": "${loki}"},
  "gridPos": {"h": 8, "w": 12, "x": 0, "y": 31},
  "targets": [
    {
      "refId": "A",
      "expr": "sum by (level) (rate({service=\"daedalus\"} | json [5m]))",
      "legendFormat": "{% raw %}{{level}}{% endraw %}"
    }
  ],
  "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
|
||||
{
|
||||
"id": 22,
|
||||
"type": "stat",
|
||||
"title": "Daedalus — HTTP 5xx rate",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 31},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)"
|
||||
}
|
||||
],
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
|
||||
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
|
||||
},
|
||||
{
  "id": 23,
  "type": "stat",
  "title": "Daedalus — MCP p95 latency",
  "datasource": {"type": "prometheus", "uid": "${prom}"},
  "gridPos": {"h": 4, "w": 6, "x": 18, "y": 31},
  "targets": [
    {
      "refId": "A",
      "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_mcp_request_duration_seconds_bucket[5m])))"
    }
  ],
  "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
  "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 5}, {"color": "red", "value": 30}]}}}
},
|
||||
{
|
||||
"id": 24,
|
||||
"type": "logs",
|
||||
"title": "Daedalus — errors (last 25)",
|
||||
"datasource": {"type": "loki", "uid": "${loki}"},
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 39},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "{service=\"daedalus\"} | json | level=\"ERROR\"",
|
||||
"maxLines": 25
|
||||
}
|
||||
],
|
||||
"options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user