feat(observability): add SearXNG, Argos, and Pallas monitoring

- Add SearXNG syslog ingestion and blackbox health probes on miranda
  and rosalind for per-host attributable failure detection
- Scrape Argos MCP application metrics from miranda
- Add Pallas dashboard panels for downstream availability and turn
  error ratios
This commit is contained in:
2026-05-24 23:52:53 -04:00
parent 43fae203d1
commit 3c2f8c57ca
24 changed files with 1968 additions and 938 deletions

View File

@@ -51,28 +51,44 @@ scrape_configs:
metrics_path: '/metrics'
scrape_interval: 15s
# Mnemosyne — single /metrics endpoint on the app container serves both
# django-prometheus HTTP/Celery metrics and the MCP server's tool-call
# counters (the mcp_server.metrics module registers into the same
# prometheus_client process registry on the Django side). The mcp
# container itself does not expose /metrics; scrape them on the WSGI side.
# Mnemosyne — app exposes /metrics on the Django container (proxied via
# nginx); a single prometheus_client process registry serves both
# django-prometheus (HTTP/Celery) and the MCP server's tool-call counters
# (the mcp container itself does not expose /metrics). The `web` target is
# an nginx-prometheus-exporter sidecar that scrapes the web container's
# stub_status and re-exposes it in Prometheus format.
# Scrape job for Mnemosyne. The app/web form below lists two targets under
# one job; the `component` label (app / web) tells their series apart.
- job_name: 'mnemosyne'
# NOTE(review): `static_configs` appears twice in this block (the
# single-target form here and the app/web form further down). This looks
# like the pre- and post-change lines of the hunk shown together — confirm
# only one survives in the rendered file, since duplicate mapping keys are
# invalid YAML and most parsers silently keep the last one.
static_configs:
- targets: ['{{ mnemosyne_metrics_host }}:{{ mnemosyne_metrics_port }}']
metrics_path: '/metrics'
scrape_interval: 15s
static_configs:
# Django app container endpoint.
- targets: ['{{ mnemosyne_app_metrics_host }}:{{ mnemosyne_app_metrics_port }}']
labels:
component: app
# Web (nginx exporter) endpoint.
- targets: ['{{ mnemosyne_web_metrics_host }}:{{ mnemosyne_web_metrics_port }}']
labels:
component: web
# Pallas — each deployment is one scrape target (registry port).
# Pallas uses a single process-global registry, so per-agent /metrics
# endpoints serve the same snapshot; the `agent` dimension is carried
# as a metric label, not a target. Targets are defined per
# environment in pallas_metrics_targets (host_vars on the Prometheus
# host); instances are differentiated by the `instance` label.
{% if pallas_metrics_targets | default([]) %}
# This job is emitted only when pallas_metrics_targets is defined and
# non-empty; default([]) makes an undefined variable evaluate as an empty
# (falsy) list, so the whole job is skipped in that case.
- job_name: 'pallas'
metrics_path: '/metrics'
scrape_interval: 15s
# to_json renders the variable inline as JSON, which YAML reads as a flow
# collection. Presumably a list of {targets, labels} entries — verify
# against the host_vars definition.
static_configs: {{ pallas_metrics_targets | to_json }}
{% endif %}
# Neo4j — stscoundrel/neo4j-apoc-exporter sidecar connects to the local
# Neo4j over Bolt and exposes apoc.monitor.* (tx/ids/store) plus JVM
# metrics on the standard metrics port (22094). Both Ariel (LLM memory
# via neo4j-cypher MCP) and Umbriel (Mnemosyne graph+vector DB) use the
# same port. Targets are listed per-environment in neo4j_metrics_targets
# (host_vars on the Prometheus host) — instances are differentiated by
# hostname only.
# Scrape job for the Neo4j exporter sidecars, polled at /metrics every 15s.
- job_name: 'neo4j'
static_configs:
# NOTE(review): a hard-coded target list and a templated `targets` entry
# both appear below. This looks like the pre- and post-change lines of the
# hunk shown together — confirm which one the rendered file should keep.
- targets:
- 'ariel.incus:22094'
- 'umbriel.incus:22094'
# NOTE(review): unlike the pallas job, there is no default()/{% raw %}{% if %}{% endraw %}
# guard on neo4j_metrics_targets, so rendering presumably fails if the
# variable is undefined — confirm that is intended.
- targets: {{ neo4j_metrics_targets | to_json }}
metrics_path: '/metrics'
scrape_interval: 15s
# Red Panda Approved Prometheus Configuration