feat(observability): add SearXNG, Argos, and Pallas monitoring
- Add SearXNG syslog ingestion and blackbox health probes on miranda and rosalind so that failures can be attributed to a specific host
- Scrape Argos MCP application metrics from miranda
- Add Pallas dashboard panels for downstream availability and turn error ratios
This commit is contained in:
307
dashboards/argos.json
Normal file
307
dashboards/argos.json
Normal file
@@ -0,0 +1,307 @@
|
||||
{
|
||||
"title": "Argos",
|
||||
"uid": "argos",
|
||||
"tags": ["argos", "mcp", "searxng", "ouranos"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"weekStart": "",
|
||||
"refresh": "30s",
|
||||
"time": {"from": "now-1h", "to": "now"},
|
||||
"links": [
|
||||
{
|
||||
"asDropdown": false,
|
||||
"icon": "external link",
|
||||
"includeVars": true,
|
||||
"keepTime": true,
|
||||
"tags": [],
|
||||
"targetBlank": true,
|
||||
"title": "SearXNG dashboard",
|
||||
"tooltip": "SearXNG instance probes (miranda, rosalind)",
|
||||
"type": "link",
|
||||
"url": "/d/searxng"
|
||||
}
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "prom",
|
||||
"type": "datasource",
|
||||
"query": "prometheus",
|
||||
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
|
||||
"hide": 0,
|
||||
"label": "Prometheus datasource"
|
||||
},
|
||||
{
|
||||
"name": "loki",
|
||||
"type": "datasource",
|
||||
"query": "loki",
|
||||
"current": {"selected": false, "text": "Loki", "value": "Loki"},
|
||||
"hide": 0,
|
||||
"label": "Loki datasource"
|
||||
},
|
||||
{
|
||||
"name": "instance",
|
||||
"type": "query",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"query": "label_values(up{job=\"argos\"}, instance)",
|
||||
"refresh": 1,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"current": {"selected": true, "text": "All", "value": "$__all"},
|
||||
"label": "Argos host"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "row",
|
||||
"title": "Health",
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Argos up",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "up{job=\"argos\", instance=~\"$instance\"}", "legendFormat": "{{instance}}"}
|
||||
],
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
|
||||
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "SearXNG instances healthy (per Argos)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 6, "x": 4, "y": 1},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum by (instance) (argos_searxng_instance_up{instance=~\"$instance\"})", "legendFormat": "{{instance}}"},
|
||||
{"refId": "B", "expr": "count by (instance) (argos_searxng_instance_up{instance=~\"$instance\"})", "legendFormat": "{{instance}} total", "hide": true}
|
||||
],
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name", "colorMode": "value"},
|
||||
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 1}, {"color": "green", "value": 2}]}}}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Tool error ratio (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 10, "y": 1},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum(rate(argos_tool_calls_total{status=\"error\", instance=~\"$instance\"}[5m])) / clamp_min(sum(rate(argos_tool_calls_total{instance=~\"$instance\"}[5m])), 0.0001)", "legendFormat": "errors"}
|
||||
],
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
|
||||
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.20}]}}}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Tool calls/sec (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 14, "y": 1},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum(rate(argos_tool_calls_total{instance=~\"$instance\"}[5m]))", "legendFormat": "calls/s"}
|
||||
],
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
|
||||
"fieldConfig": {"defaults": {"unit": "ops"}}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "Build",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 1},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "argos_build_info{instance=~\"$instance\"}", "legendFormat": "{{instance}} v{{version}}"}
|
||||
],
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name", "colorMode": "none"},
|
||||
"fieldConfig": {"defaults": {"unit": "none"}}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 10,
|
||||
"type": "row",
|
||||
"title": "Tools",
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "timeseries",
|
||||
"title": "Tool calls/sec by tool (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum by (tool) (rate(argos_tool_calls_total{instance=~\"$instance\"}[5m]))", "legendFormat": "{{tool}}"}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "ops"}},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "Tool error ratio by tool (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum by (tool) (rate(argos_tool_calls_total{status=\"error\", instance=~\"$instance\"}[5m])) / clamp_min(sum by (tool) (rate(argos_tool_calls_total{instance=~\"$instance\"}[5m])), 0.0001)", "legendFormat": "{{tool}}"}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "percentunit"}},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "timeseries",
|
||||
"title": "Tool latency p50 / p95 / p99 (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "histogram_quantile(0.50, sum by (le) (rate(argos_tool_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "p50"},
|
||||
{"refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(argos_tool_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "p95"},
|
||||
{"refId": "C", "expr": "histogram_quantile(0.99, sum by (le) (rate(argos_tool_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "p99"}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "s"}},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "timeseries",
|
||||
"title": "Tool latency p95 by tool (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, tool) (rate(argos_tool_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "{{tool}}"}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "s"}},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 20,
|
||||
"type": "row",
|
||||
"title": "Upstream SearXNG",
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 22}
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"type": "table",
|
||||
"title": "SearXNG instances (per-Argos view)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 23},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "argos_searxng_instance_up{instance=~\"$instance\"}", "legendFormat": "{{searxng_instance}}", "format": "table", "instant": true}
|
||||
],
|
||||
"transformations": [
|
||||
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true, "job": true, "environment": true, "hostname": true}}}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "custom": {"cellOptions": {"type": "color-background"}}}}
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"type": "timeseries",
|
||||
"title": "Upstream SearXNG requests/sec by instance (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 6, "w": 12, "x": 12, "y": 23},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum by (instance, searxng_instance) (rate(argos_searxng_requests_total{instance=~\"$instance\"}[5m]))", "legendFormat": "{{instance}} → {{searxng_instance}}"}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "ops"}},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"type": "timeseries",
|
||||
"title": "Upstream SearXNG error ratio by instance (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 29},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum by (searxng_instance) (rate(argos_searxng_requests_total{status=\"error\", instance=~\"$instance\"}[5m])) / clamp_min(sum by (searxng_instance) (rate(argos_searxng_requests_total{instance=~\"$instance\"}[5m])), 0.0001)", "legendFormat": "{{searxng_instance}}"}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "percentunit"}},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"type": "timeseries",
|
||||
"title": "Upstream SearXNG latency p95 by instance (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 6, "w": 12, "x": 12, "y": 29},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, searxng_instance) (rate(argos_searxng_request_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "{{searxng_instance}} p95"}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "s"}},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 30,
|
||||
"type": "row",
|
||||
"title": "Cache & webpage fetch",
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 35}
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "stat",
|
||||
"title": "Cache hit ratio (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 36},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum(rate(argos_cache_operations_total{operation=\"get\", result=\"hit\", instance=~\"$instance\"}[5m])) / clamp_min(sum(rate(argos_cache_operations_total{operation=\"get\", instance=~\"$instance\"}[5m])), 0.0001)", "legendFormat": "hits"}
|
||||
],
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
|
||||
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.10}, {"color": "green", "value": 0.30}]}}}
|
||||
},
|
||||
{
|
||||
"id": 32,
|
||||
"type": "timeseries",
|
||||
"title": "Cache ops/sec by result (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 8, "w": 9, "x": 6, "y": 36},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum by (operation, result) (rate(argos_cache_operations_total{instance=~\"$instance\"}[5m]))", "legendFormat": "{{operation}}/{{result}}"}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "ops"}},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
|
||||
},
|
||||
{
|
||||
"id": 33,
|
||||
"type": "timeseries",
|
||||
"title": "Webpage fetch outcomes/sec (5m)",
|
||||
"datasource": {"type": "prometheus", "uid": "${prom}"},
|
||||
"gridPos": {"h": 8, "w": 9, "x": 15, "y": 36},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "sum by (status) (rate(argos_webpage_fetch_total{instance=~\"$instance\"}[5m]))", "legendFormat": "{{status}}"}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "ops"}},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 90,
|
||||
"type": "row",
|
||||
"title": "Logs",
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 44}
|
||||
},
|
||||
{
|
||||
"id": 91,
|
||||
"type": "logs",
|
||||
"title": "argos (Loki)",
|
||||
"datasource": {"type": "loki", "uid": "${loki}"},
|
||||
"gridPos": {"h": 12, "w": 24, "x": 0, "y": 45},
|
||||
"targets": [
|
||||
{"refId": "A", "expr": "{job=\"argos\"}"}
|
||||
],
|
||||
"options": {"showTime": true, "wrapLogMessage": true, "enableLogDetails": true, "dedupStrategy": "none"}
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user