Files
ouranos/dashboards/daedalus_stack.json
Robert Helewka 3c2f8c57ca feat(observability): add SearXNG, Argos, and Pallas monitoring
- Add SearXNG syslog ingestion and blackbox health probes on miranda
  and rosalind for per-host attributable failure detection
- Scrape Argos MCP application metrics from miranda
- Add Pallas dashboard panels for downstream availability and turn
  error ratios
2026-05-24 23:52:53 -04:00

703 lines
35 KiB
JSON

{
"title": "Daedalus Stack",
"uid": "daedalus-stack",
"tags": ["daedalus", "mnemosyne", "pallas", "ouranos"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"editable": true,
"fiscalYearStartMonth": 0,
"weekStart": "",
"refresh": "30s",
"time": {"from": "now-1h", "to": "now"},
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [],
"targetBlank": true,
"title": "Neo4j dashboard",
"tooltip": "Detailed Neo4j metrics (ariel, umbriel)",
"type": "link",
"url": "/d/neo4j"
},
{
"asDropdown": false,
"icon": "doc",
"includeVars": true,
"keepTime": true,
"tags": [],
"targetBlank": true,
"title": "Explore Logs",
"tooltip": "Loki: daedalus + mnemosyne + pallas",
"type": "link",
"url": "/explore?orgId=1&left=%7B%22datasource%22:%22Loki%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bservice%3D~%5C%22daedalus%7Cmnemosyne%7Cpallas%5C%22%7D%22%7D%5D%7D"
}
],
"templating": {
"list": [
{
"name": "prom",
"type": "datasource",
"query": "prometheus",
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
"hide": 0,
"label": "Prometheus datasource"
},
{
"name": "loki",
"type": "datasource",
"query": "loki",
"current": {"selected": false, "text": "Loki", "value": "Loki"},
"hide": 0,
"label": "Loki datasource"
},
{
"name": "pallas_inst",
"type": "query",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"query": "label_values(up{job=\"pallas\"}, instance)",
"refresh": 1,
"includeAll": true,
"multi": true,
"current": {"selected": true, "text": "All", "value": "$__all"},
"label": "Pallas instance"
},
{
"name": "agent",
"type": "query",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"query": "label_values(pallas_send_message_total{instance=~\"$pallas_inst\"}, agent)",
"refresh": 2,
"includeAll": true,
"multi": true,
"current": {"selected": true, "text": "All", "value": "$__all"},
"label": "Agent"
}
]
},
"panels": [
{
"id": 100,
"type": "row",
"title": "Summary",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
},
{
"id": 101,
"type": "stat",
"title": "Daedalus",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 0, "y": 1},
"targets": [
{"refId": "A", "expr": "up{job=\"daedalus\"}", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 102,
"type": "stat",
"title": "Mnemosyne app",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 3, "y": 1},
"targets": [
{"refId": "A", "expr": "up{job=\"mnemosyne\", component=\"app\"}", "legendFormat": "app"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 103,
"type": "stat",
"title": "Mnemosyne web",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 6, "y": 1},
"targets": [
{"refId": "A", "expr": "up{job=\"mnemosyne\", component=\"web\"}", "legendFormat": "web"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 104,
"type": "stat",
"title": "Pallas up ratio",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 9, "y": 1},
"targets": [
{"refId": "A", "expr": "sum(up{job=\"pallas\"}) / count(up{job=\"pallas\"})", "legendFormat": "up ratio"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.67}, {"color": "green", "value": 1}]}}}
},
{
"id": 105,
"type": "stat",
"title": "Agents healthy",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 12, "y": 1},
"targets": [
{"refId": "A", "expr": "sum(daedalus_agents_by_health{status=\"ok\"}) / clamp_min(daedalus_agents_total, 1)", "legendFormat": "healthy"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.7}, {"color": "green", "value": 1}]}}}
},
{
"id": 106,
"type": "stat",
"title": "Chat p95 (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 15, "y": 1},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_agent_response_duration_seconds_bucket{source=\"chat\"}[5m])))", "legendFormat": "chat p95"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 10}, {"color": "red", "value": 30}]}}}
},
{
"id": 107,
"type": "timeseries",
"title": "Stack up (last hour)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 1},
"targets": [
{"refId": "A", "expr": "up{job=~\"daedalus|mnemosyne|pallas\"}", "legendFormat": "{{job}} {{instance}} {{component}}"}
],
"fieldConfig": {"defaults": {"unit": "short", "min": 0, "max": 1, "custom": {"drawStyle": "line", "lineInterpolation": "stepBefore", "fillOpacity": 10}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 200,
"type": "row",
"title": "Daedalus",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}
},
{
"id": 201,
"type": "stat",
"title": "Daedalus up",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 6},
"targets": [
{"refId": "A", "expr": "daedalus_up", "legendFormat": "daedalus"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 202,
"type": "stat",
"title": "5xx error rate (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 6},
"targets": [
{"refId": "A", "expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)", "legendFormat": "5xx"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
"id": 203,
"type": "stat",
"title": "MCP connections active",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 6},
"targets": [
{"refId": "A", "expr": "sum(daedalus_mcp_connections_active)", "legendFormat": "active"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 204,
"type": "stat",
"title": "Avg context window %",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 6},
"targets": [
{"refId": "A", "expr": "avg(daedalus_chat_context_pct)", "legendFormat": "avg"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percent", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 70}, {"color": "red", "value": 90}]}}}
},
{
"id": 205,
"type": "stat",
"title": "Tokens/sec (5m, total)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 6},
"targets": [
{"refId": "A", "expr": "sum(rate(daedalus_chat_tokens_total[5m]))", "legendFormat": "tok/s"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 210,
"type": "timeseries",
"title": "Chat latency (p50/p95/p99)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 10},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.50, sum by (le) (rate(daedalus_agent_response_duration_seconds_bucket{source=\"chat\"}[5m])))", "legendFormat": "p50"},
{"refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_agent_response_duration_seconds_bucket{source=\"chat\"}[5m])))", "legendFormat": "p95"},
{"refId": "C", "expr": "histogram_quantile(0.99, sum by (le) (rate(daedalus_agent_response_duration_seconds_bucket{source=\"chat\"}[5m])))", "legendFormat": "p99"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 211,
"type": "timeseries",
"title": "Voice pipeline p95 (STT / agent / TTS / total)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 10},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_voice_stt_duration_seconds_bucket[5m])))", "legendFormat": "stt"},
{"refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_voice_agent_duration_seconds_bucket[5m])))", "legendFormat": "agent"},
{"refId": "C", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_voice_tts_duration_seconds_bucket[5m])))", "legendFormat": "tts"},
{"refId": "D", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_voice_pipeline_duration_seconds_bucket[5m])))", "legendFormat": "total"}
],
"fieldConfig": {"defaults": {"unit": "s", "custom": {"stacking": {"mode": "none"}}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 220,
"type": "timeseries",
"title": "Pallas reach — MCP error ratio by server (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 18},
"targets": [
{"refId": "A", "expr": "sum by (server) (rate(daedalus_mcp_requests_total{status=\"error\"}[5m])) / clamp_min(sum by (server) (rate(daedalus_mcp_requests_total[5m])), 0.0001)", "legendFormat": "{{server}}"}
],
"fieldConfig": {"defaults": {"unit": "percentunit"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 221,
"type": "timeseries",
"title": "Mnemosyne reach — p95 latency by operation (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 18},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(daedalus_mnemosyne_request_duration_seconds_bucket[5m])))", "legendFormat": "{{operation}} p95"},
{"refId": "B", "expr": "sum(rate(daedalus_mnemosyne_requests_total{status=\"error\"}[5m]))", "legendFormat": "errors/s (right)"}
],
"fieldConfig": {"defaults": {"unit": "s"}, "overrides": [{"matcher": {"id": "byName", "options": "errors/s (right)"}, "properties": [{"id": "unit", "value": "ops"}, {"id": "custom.axisPlacement", "value": "right"}]}]},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 230,
"type": "timeseries",
"title": "Token burn by direction (tokens/sec, 5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 26},
"targets": [
{"refId": "A", "expr": "sum by (direction) (rate(daedalus_chat_tokens_total[5m]))", "legendFormat": "{{direction}}"}
],
"fieldConfig": {"defaults": {"unit": "short", "custom": {"drawStyle": "line", "fillOpacity": 20, "stacking": {"mode": "normal"}}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 231,
"type": "timeseries",
"title": "Mnemosyne ingest jobs (status, 5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 26},
"targets": [
{"refId": "A", "expr": "sum by (status) (rate(daedalus_mnemosyne_ingest_jobs_total[5m]))", "legendFormat": "{{status}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 300,
"type": "row",
"title": "Mnemosyne",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 34}
},
{
"id": 301,
"type": "stat",
"title": "App",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 35},
"targets": [
{"refId": "A", "expr": "up{job=\"mnemosyne\", component=\"app\"}", "legendFormat": "app"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 302,
"type": "stat",
"title": "Web (nginx)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 35},
"targets": [
{"refId": "A", "expr": "up{job=\"mnemosyne\", component=\"web\"}", "legendFormat": "web"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 303,
"type": "stat",
"title": "Search rate (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 35},
"targets": [
{"refId": "A", "expr": "sum(rate(mnemosyne_search_requests_total[5m]))", "legendFormat": "req/s"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "reqps"}}
},
{
"id": 304,
"type": "stat",
"title": "Embedding queue depth",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 35},
"targets": [
{"refId": "A", "expr": "mnemosyne_embedding_queue_size", "legendFormat": "queue"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 10}, {"color": "red", "value": 100}]}}}
},
{
"id": 305,
"type": "stat",
"title": "Pipeline in-progress",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 35},
"targets": [
{"refId": "A", "expr": "mnemosyne_pipeline_items_in_progress", "legendFormat": "in-flight"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 306,
"type": "stat",
"title": "MCP tool error rate (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 35},
"targets": [
{"refId": "A", "expr": "sum(rate(mcp_tool_invocations_total{status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total[5m])), 0.0001)", "legendFormat": "err"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
"id": 310,
"type": "timeseries",
"title": "Search rate by type (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 39},
"targets": [
{"refId": "A", "expr": "sum by (search_type) (rate(mnemosyne_search_requests_total[5m]))", "legendFormat": "{{search_type}}"}
],
"fieldConfig": {"defaults": {"unit": "reqps"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 311,
"type": "timeseries",
"title": "Search latency p95 by type (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 39},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, search_type) (rate(mnemosyne_search_duration_seconds_bucket[5m])))", "legendFormat": "{{search_type}} p95"},
{"refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(mnemosyne_search_total_duration_seconds_bucket[5m])))", "legendFormat": "end-to-end p95"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 320,
"type": "timeseries",
"title": "Embedding queue depth over time",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 47},
"targets": [
{"refId": "A", "expr": "mnemosyne_embedding_queue_size", "legendFormat": "queue"}
],
"fieldConfig": {"defaults": {"unit": "short", "custom": {"drawStyle": "line", "fillOpacity": 20}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 321,
"type": "timeseries",
"title": "Embeddings generated (per sec, by model)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 47},
"targets": [
{"refId": "A", "expr": "sum by (model_name) (rate(mnemosyne_embeddings_generated_total[5m]))", "legendFormat": "{{model_name}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 322,
"type": "timeseries",
"title": "Pipeline items (per sec, by status)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 47},
"targets": [
{"refId": "A", "expr": "sum by (status) (rate(mnemosyne_pipeline_items_total[5m]))", "legendFormat": "{{status}}"},
{"refId": "B", "expr": "sum by (error_type) (rate(mnemosyne_embedding_api_errors_total[5m]))", "legendFormat": "api err: {{error_type}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 330,
"type": "timeseries",
"title": "Neo4j @ umbriel — transactions (rate / open)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 55},
"targets": [
{"refId": "A", "expr": "rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"umbriel.*\"}[5m])", "legendFormat": "{{instance}} open rate"},
{"refId": "B", "expr": "neo4j_monitor_tx_currentOpenedTx{instance=~\"umbriel.*\"}", "legendFormat": "{{instance}} current open"}
],
"fieldConfig": {"defaults": {"unit": "short"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 331,
"type": "stat",
"title": "Neo4j @ umbriel — rollback ratio (10m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 55},
"targets": [
{"refId": "A", "expr": "rate(neo4j_monitor_tx_rolledBackTx{instance=~\"umbriel.*\"}[10m]) / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"umbriel.*\"}[10m]), 0.0001)", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
},
{
"id": 332,
"type": "stat",
"title": "Neo4j @ umbriel — store size",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 55},
"targets": [
{"refId": "A", "expr": "neo4j_monitor_store_totalStoreSize{instance=~\"umbriel.*\"}", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "bytes"}}
},
{
"id": 400,
"type": "row",
"title": "Pallas Agents",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 63}
},
{
"id": 401,
"type": "stat",
"title": "Instance up",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 64},
"targets": [
{"refId": "A", "expr": "up{job=\"pallas\", instance=~\"$pallas_inst\"}", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": ""}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 402,
"type": "stat",
"title": "Aggregate agent health (min)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 64},
"targets": [
{"refId": "A", "expr": "min by (instance) (pallas_agent_health_status{instance=~\"$pallas_inst\"})", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "ERROR", "color": "red"}, "0.5": {"text": "DEGRADED", "color": "orange"}, "1": {"text": "OK", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.5}, {"color": "green", "value": 1}]}}}
},
{
"id": 403,
"type": "stat",
"title": "Downstream MCPs up (ratio)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 64},
"targets": [
{"refId": "A", "expr": "sum by (instance) (pallas_downstream_up{instance=~\"$pallas_inst\"}) / clamp_min(count by (instance) (pallas_downstream_up{instance=~\"$pallas_inst\"}), 1)", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.5}, {"color": "green", "value": 1}]}}}
},
{
"id": 404,
"type": "stat",
"title": "Turn error ratio (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 64},
"targets": [
{"refId": "A", "expr": "sum by (instance) (rate(pallas_send_message_total{outcome=\"error\", instance=~\"$pallas_inst\"}[5m])) / clamp_min(sum by (instance) (rate(pallas_send_message_total{instance=~\"$pallas_inst\"}[5m])), 0.0001)", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
"id": 410,
"type": "timeseries",
"title": "Turn latency p95 by agent (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 68},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, agent, instance) (rate(pallas_send_message_duration_seconds_bucket{instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m])))", "legendFormat": "{{instance}}/{{agent}}"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 411,
"type": "table",
"title": "Long-running agents — p99 turn (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 68},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.99, sum by (agent, instance, le) (rate(pallas_send_message_duration_seconds_bucket{instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m])))", "legendFormat": "", "format": "table", "instant": true}
],
"fieldConfig": {"defaults": {"unit": "s", "custom": {"align": "auto"}, "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 30}, {"color": "red", "value": 60}]}, "color": {"mode": "thresholds"}}, "overrides": [{"matcher": {"id": "byName", "options": "Value"}, "properties": [{"id": "displayName", "value": "p99 (s)"}, {"id": "custom.cellOptions", "value": {"type": "color-background"}}]}]},
"options": {"showHeader": true, "sortBy": [{"displayName": "p99 (s)", "desc": true}]},
"transformations": [{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true, "job": true}}}]
},
{
"id": 420,
"type": "timeseries",
"title": "Turn errors per agent (15m increase)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 76},
"targets": [
{"refId": "A", "expr": "sum by (agent, instance) (increase(pallas_send_message_total{outcome=\"error\", instance=~\"$pallas_inst\", agent=~\"$agent\"}[15m]))", "legendFormat": "{{instance}}/{{agent}}"}
],
"fieldConfig": {"defaults": {"unit": "short"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 421,
"type": "timeseries",
"title": "Tokens/sec by kind (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 76},
"targets": [
{"refId": "A", "expr": "sum by (kind) (rate(pallas_llm_tokens_total{instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m]))", "legendFormat": "{{kind}}"}
],
"fieldConfig": {"defaults": {"unit": "short", "custom": {"drawStyle": "line", "fillOpacity": 20, "stacking": {"mode": "normal"}}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 422,
"type": "table",
"title": "Top-burning agents (24h, input+output tokens)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 84},
"targets": [
{"refId": "A", "expr": "topk(10, sum by (agent, model, instance) (increase(pallas_llm_tokens_total{kind=~\"input|output\", instance=~\"$pallas_inst\", agent=~\"$agent\"}[24h])))", "legendFormat": "", "format": "table", "instant": true}
],
"fieldConfig": {"defaults": {"unit": "short"}, "overrides": [{"matcher": {"id": "byName", "options": "Value"}, "properties": [{"id": "displayName", "value": "tokens (24h)"}, {"id": "custom.cellOptions", "value": {"type": "gauge", "mode": "gradient"}}]}]},
"options": {"showHeader": true, "sortBy": [{"displayName": "tokens (24h)", "desc": true}]},
"transformations": [{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true, "job": true}}}]
},
{
"id": 423,
"type": "stat",
"title": "Cache effectiveness (1h)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 84},
"targets": [
{"refId": "A", "expr": "sum(rate(pallas_llm_tokens_total{kind=\"cache_read\", instance=~\"$pallas_inst\"}[1h])) / clamp_min(sum(rate(pallas_llm_tokens_total{kind=~\"input|cache_read\", instance=~\"$pallas_inst\"}[1h])), 0.0001)", "legendFormat": "cache hit"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.2}, {"color": "green", "value": 0.5}]}}}
},
{
"id": 424,
"type": "stat",
"title": "LLM turns/sec (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 84},
"targets": [
{"refId": "A", "expr": "sum(rate(pallas_llm_turns_total{instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m]))", "legendFormat": "turns/s"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "ops"}}
},
{
"id": 430,
"type": "timeseries",
"title": "Cypher tool calls — rate by outcome (Pallas → ariel)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 92},
"targets": [
{"refId": "A", "expr": "sum by (outcome) (rate(pallas_tool_calls_total{server=\"neo4j_cypher\", instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m]))", "legendFormat": "{{outcome}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 431,
"type": "timeseries",
"title": "Cypher tool calls — p95 latency by agent",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 92},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, agent, instance) (rate(pallas_tool_call_duration_seconds_bucket{server=\"neo4j_cypher\", instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m])))", "legendFormat": "{{instance}}/{{agent}}"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 440,
"type": "timeseries",
"title": "Neo4j @ ariel — transactions (rate / open)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 100},
"targets": [
{"refId": "A", "expr": "rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"ariel.*\"}[5m])", "legendFormat": "{{instance}} open rate"},
{"refId": "B", "expr": "neo4j_monitor_tx_currentOpenedTx{instance=~\"ariel.*\"}", "legendFormat": "{{instance}} current open"}
],
"fieldConfig": {"defaults": {"unit": "short"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 441,
"type": "stat",
"title": "Neo4j @ ariel — rollback ratio (10m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 100},
"targets": [
{"refId": "A", "expr": "rate(neo4j_monitor_tx_rolledBackTx{instance=~\"ariel.*\"}[10m]) / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"ariel.*\"}[10m]), 0.0001)", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
},
{
"id": 442,
"type": "stat",
"title": "Neo4j @ ariel — store size",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 100},
"targets": [
{"refId": "A", "expr": "neo4j_monitor_store_totalStoreSize{instance=~\"ariel.*\"}", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "bytes"}}
}
]
}