feat(observability): add SearXNG, Argos, and Pallas monitoring

- Add SearXNG syslog ingestion and blackbox health probes on miranda and rosalind for per-host attributable failure detection
- Scrape Argos MCP application metrics from miranda
- Add Pallas dashboard panels for downstream availability and turn error ratios
2026-05-24 23:52:53 -04:00
parent 43fae203d1
commit 3c2f8c57ca
24 changed files with 1968 additions and 938 deletions
--- a/ansible/pplg/prometheus.yml.j2
+++ b/ansible/pplg/prometheus.yml.j2
@@ -51,28 +51,44 @@ scrape_configs:
    metrics_path: '/metrics'
    scrape_interval: 15s

-  # Mnemosyne — single /metrics endpoint on the app container serves both
-  # django-prometheus HTTP/Celery metrics and the MCP server's tool-call
-  # counters (the mcp_server.metrics module registers into the same
-  # prometheus_client process registry on the Django side).  The mcp
-  # container itself does not expose /metrics; run 'em on the WSGI side.
+  # Mnemosyne — the `app` target is the Django container's /metrics
+  # (proxied via nginx); a single prometheus_client process registry serves
+  # both django-prometheus (HTTP/Celery) and the MCP server's tool-call
+  # counters (the mcp container itself does not expose /metrics). The `web`
+  # target is an nginx-prometheus-exporter sidecar that scrapes the web
+  # container's stub_status and re-exposes it in Prometheus format.
  - job_name: 'mnemosyne'
-    static_configs:
-      - targets: ['{{ mnemosyne_metrics_host }}:{{ mnemosyne_metrics_port }}']
    metrics_path: '/metrics'
    scrape_interval: 15s
+    static_configs:
+      - targets: ['{{ mnemosyne_app_metrics_host }}:{{ mnemosyne_app_metrics_port }}']
+        labels:
+          component: app
+      - targets: ['{{ mnemosyne_web_metrics_host }}:{{ mnemosyne_web_metrics_port }}']
+        labels:
+          component: web
+
+  # Pallas — each deployment is one scrape target (registry port). Pallas
+  # uses a single process-global registry, so per-agent /metrics endpoints
+  # serve the same snapshot; the `agent` dimension is carried as a metric
+  # label, not a target. pallas_metrics_targets (per-environment host_vars on
+  # the Prometheus host) is rendered verbatim as static_configs; instances are
+  # differentiated by the `instance` label. Omitted if the list is empty/unset.
+{% if pallas_metrics_targets | default([]) %}
+  - job_name: 'pallas'
+    metrics_path: '/metrics'
+    scrape_interval: 15s
+    static_configs: {{ pallas_metrics_targets | to_json }}
+{% endif %}

  # Neo4j — stscoundrel/neo4j-apoc-exporter sidecar connects to the local
  # Neo4j over Bolt and exposes apoc.monitor.* (tx/ids/store) plus JVM
-  # metrics on the standard metrics port (22094).  Both Ariel (LLM memory
-  # via neo4j-cypher MCP) and Umbriel (Mnemosyne graph+vector DB) use the
-  # same port — they are differentiated by hostname only.
+  # metrics.  Targets are listed per-environment in neo4j_metrics_targets
+  # (host_vars on the Prometheus host) — instances are differentiated by
+  # hostname only.
  - job_name: 'neo4j'
    static_configs:
-      - targets:
-          - 'ariel.incus:22094'
-          - 'umbriel.incus:22094'
+      - targets: {{ neo4j_metrics_targets | to_json }}
    metrics_path: '/metrics'
    scrape_interval: 15s

-# Red Panda Approved Prometheus Configuration