feat(observability): add SearXNG, Argos, and Pallas monitoring

- Add SearXNG syslog ingestion and blackbox health probes on miranda and rosalind for per-host attributable failure detection
- Scrape Argos MCP application metrics from miranda
- Add Pallas dashboard panels for downstream availability and turn error ratios
2026-05-24 23:52:53 -04:00
parent 43fae203d1
commit 3c2f8c57ca
24 changed files with 1968 additions and 938 deletions
--- a/ansible/pplg/dashboards_provider.yml.j2
+++ b/ansible/pplg/dashboards_provider.yml.j2
@@ -1,23 +0,0 @@
-# Grafana dashboard file provider
-# Deployed to: /etc/grafana/provisioning/dashboards/puck.yaml
-#
-# Grafana polls the ``path`` every ``updateIntervalSeconds`` and re-imports
-# any JSON file it finds.  Each dashboard JSON lives in that directory and
-# is owned by Ansible — operators should not edit dashboards through the
-# Grafana UI (changes won't survive a deploy; export the final JSON and
-# land it in this role).
-apiVersion: 1
-
-providers:
-  - name: 'puck'
-    orgId: 1
-    folder: 'Puck Services'
-    folderUid: puck-services
-    type: file
-    disableDeletion: false
-    editable: true
-    allowUiUpdates: false
-    updateIntervalSeconds: 30
-    options:
-      path: /var/lib/grafana/dashboards
-      foldersFromFilesStructure: false
--- a/ansible/pplg/deploy.yml
+++ b/ansible/pplg/deploy.yml
@@ -200,40 +200,6 @@
  # Grafana
  # ===========================================================================

-  - name: Create dashboards directory
-    ansible.builtin.file:
-      path: /var/lib/grafana/dashboards
-      state: directory
-      owner: grafana
-      group: grafana
-      mode: '750'
-
-  - name: Ensure Grafana dashboard provisioning directory exists
-    ansible.builtin.file:
-      path: /etc/grafana/provisioning/dashboards
-      state: directory
-      owner: grafana
-      group: grafana
-      mode: '750'
-
-  - name: Template Grafana dashboard provider (file source → /var/lib/grafana/dashboards)
-    ansible.builtin.template:
-      src: "dashboards_provider.yml.j2"
-      dest: "/etc/grafana/provisioning/dashboards/puck.yaml"
-      owner: grafana
-      group: grafana
-      mode: '640'
-    notify: restart grafana
-
-  - name: Template Puck Services dashboard (Mnemosyne + Pallas + Daedalus)
-    ansible.builtin.template:
-      src: "puck_services_dashboard.json.j2"
-      dest: "/var/lib/grafana/dashboards/puck_services.json"
-      owner: grafana
-      group: grafana
-      mode: '640'
-    notify: restart grafana
-
  - name: Template Grafana main configuration
    ansible.builtin.template:
      src: "grafana.ini.j2"
--- a/ansible/pplg/prometheus.yml.j2
+++ b/ansible/pplg/prometheus.yml.j2
@@ -51,28 +51,44 @@ scrape_configs:
    metrics_path: '/metrics'
    scrape_interval: 15s

-  # Mnemosyne — single /metrics endpoint on the app container serves both
-  # django-prometheus HTTP/Celery metrics and the MCP server's tool-call
-  # counters (the mcp_server.metrics module registers into the same
-  # prometheus_client process registry on the Django side).  The mcp
-  # container itself does not expose /metrics; run 'em on the WSGI side.
+  # Mnemosyne — two targets, told apart by the `component` label.
+  # app: /metrics on the Django container (proxied via nginx); a single
+  # prometheus_client process registry serves both django-prometheus
+  # (HTTP/Celery) and the MCP server's tool-call counters (the mcp container
+  # itself does not expose /metrics).  web: an nginx-prometheus-exporter sidecar
+  # that scrapes the web container's stub_status, re-exposed in Prometheus format.
  - job_name: 'mnemosyne'
-    static_configs:
-      - targets: ['{{ mnemosyne_metrics_host }}:{{ mnemosyne_metrics_port }}']
    metrics_path: '/metrics'
    scrape_interval: 15s
+    static_configs:
+      - targets: ['{{ mnemosyne_app_metrics_host }}:{{ mnemosyne_app_metrics_port }}']
+        labels:
+          component: app
+      - targets: ['{{ mnemosyne_web_metrics_host }}:{{ mnemosyne_web_metrics_port }}']
+        labels:
+          component: web
+
+  # Pallas — each deployment is one scrape target (registry port).  Pallas uses
+  # a single process-global registry, so per-agent /metrics endpoints serve the
+  # same snapshot; `agent` is carried as a metric label, not a target.  The
+  # per-environment pallas_metrics_targets (host_vars on the Prometheus host) is
+  # rendered as the whole static_configs list, so it holds {targets: [...]}
+  # entries; the job is omitted when it is empty.  Instances differ by `instance`.
+{% if pallas_metrics_targets | default([]) %}
+  - job_name: 'pallas'
+    metrics_path: '/metrics'
+    scrape_interval: 15s
+    static_configs: {{ pallas_metrics_targets | to_json }}
+{% endif %}

  # Neo4j — stscoundrel/neo4j-apoc-exporter sidecar connects to the local
  # Neo4j over Bolt and exposes apoc.monitor.* (tx/ids/store) plus JVM
-  # metrics on the standard metrics port (22094).  Both Ariel (LLM memory
-  # via neo4j-cypher MCP) and Umbriel (Mnemosyne graph+vector DB) use the
-  # same port — they are differentiated by hostname only.
+  # metrics.  Targets are listed per-environment in neo4j_metrics_targets
+  # (host_vars on the Prometheus host) — instances are differentiated by
+  # hostname only.
  - job_name: 'neo4j'
    static_configs:
-      - targets:
-          - 'ariel.incus:22094'
-          - 'umbriel.incus:22094'
+      - targets: {{ neo4j_metrics_targets | to_json }}
    metrics_path: '/metrics'
    scrape_interval: 15s

-# Red Panda Approved Prometheus Configuration
--- a/ansible/pplg/puck_services_dashboard.json.j2
+++ b/ansible/pplg/puck_services_dashboard.json.j2
@@ -1,242 +0,0 @@
-{
-  "title": "Puck Services — Logs & Health",
-  "uid": "puck-services-logs",
-  "tags": ["puck", "logs", "mnemosyne", "pallas", "daedalus"],
-  "timezone": "browser",
-  "schemaVersion": 39,
-  "version": 1,
-  "editable": true,
-  "fiscalYearStartMonth": 0,
-  "weekStart": "",
-  "refresh": "30s",
-  "time": {"from": "now-1h", "to": "now"},
-  "templating": {
-    "list": [
-      {
-        "name": "loki",
-        "type": "datasource",
-        "query": "loki",
-        "current": {"selected": false, "text": "Loki", "value": "Loki"},
-        "hide": 0,
-        "label": "Loki datasource"
-      },
-      {
-        "name": "prom",
-        "type": "datasource",
-        "query": "prometheus",
-        "current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
-        "hide": 0,
-        "label": "Prometheus datasource"
-      }
-    ]
-  },
-  "panels": [
-    {
-      "id": 1,
-      "type": "row",
-      "title": "Mnemosyne",
-      "collapsed": false,
-      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
-    },
-    {
-      "id": 2,
-      "type": "timeseries",
-      "title": "Mnemosyne — log rate by level",
-      "datasource": {"type": "loki", "uid": "${loki}"},
-      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 1},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "sum by (level) (rate({service=\"mnemosyne\"} | json [5m]))",
-          "legendFormat": "{{level}}"
-        }
-      ],
-      "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
-    },
-    {
-      "id": 3,
-      "type": "logs",
-      "title": "Mnemosyne — errors (last 25)",
-      "datasource": {"type": "loki", "uid": "${loki}"},
-      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 1},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "{service=\"mnemosyne\"} | json | level=\"ERROR\"",
-          "maxLines": 25
-        }
-      ],
-      "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
-    },
-    {
-      "id": 4,
-      "type": "stat",
-      "title": "Mnemosyne — HTTP 5xx rate",
-      "datasource": {"type": "prometheus", "uid": "${prom}"},
-      "gridPos": {"h": 4, "w": 8, "x": 0, "y": 9},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\",status=~\"5..\"}[5m])) / clamp_min(sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\"}[5m])), 0.0001)"
-        }
-      ],
-      "options": {
-        "reduceOptions": {"calcs": ["lastNotNull"]},
-        "colorMode": "value",
-        "textMode": "auto"
-      },
-      "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
-    },
-    {
-      "id": 5,
-      "type": "stat",
-      "title": "Mnemosyne — p95 latency",
-      "datasource": {"type": "prometheus", "uid": "${prom}"},
-      "gridPos": {"h": 4, "w": 8, "x": 8, "y": 9},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "histogram_quantile(0.95, sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job=\"mnemosyne\"}[5m])))"
-        }
-      ],
-      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
-      "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 1}, {"color": "red", "value": 5}]}}}
-    },
-    {
-      "id": 6,
-      "type": "stat",
-      "title": "Mnemosyne — MCP tool error rate",
-      "datasource": {"type": "prometheus", "uid": "${prom}"},
-      "gridPos": {"h": 4, "w": 8, "x": 16, "y": 9},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\",status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\"}[5m])), 0.0001)"
-        }
-      ],
-      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
-      "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
-    },
-
-    {
-      "id": 10,
-      "type": "row",
-      "title": "Pallas (Kottos agents)",
-      "collapsed": false,
-      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 13}
-    },
-    {
-      "id": 11,
-      "type": "timeseries",
-      "title": "Pallas — log rate by agent (component)",
-      "datasource": {"type": "loki", "uid": "${loki}"},
-      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "sum by (component) (rate({service=\"pallas\", project=\"kottos\"} | json [5m]))",
-          "legendFormat": "{{component}}"
-        }
-      ],
-      "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
-    },
-    {
-      "id": 12,
-      "type": "logs",
-      "title": "Pallas — forward trace errors (opaque MCP transport failures)",
-      "datasource": {"type": "loki", "uid": "${loki}"},
-      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "{service=\"pallas\", project=\"kottos\"} |= \"pallas.forward.trace\" | json | level=~\"ERROR|WARNING\"",
-          "maxLines": 25
-        }
-      ],
-      "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
-    },
-    {
-      "id": 13,
-      "type": "logs",
-      "title": "Pallas — last 25 ERROR lines (any agent)",
-      "datasource": {"type": "loki", "uid": "${loki}"},
-      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 22},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "{service=\"pallas\", project=\"kottos\"} | json | level=\"ERROR\"",
-          "maxLines": 25
-        }
-      ],
-      "options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
-    },
-
-    {
-      "id": 20,
-      "type": "row",
-      "title": "Daedalus",
-      "collapsed": false,
-      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 30}
-    },
-    {
-      "id": 21,
-      "type": "timeseries",
-      "title": "Daedalus — log rate by level",
-      "datasource": {"type": "loki", "uid": "${loki}"},
-      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 31},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "sum by (level) (rate({service=\"daedalus\"} | json [5m]))",
-          "legendFormat": "{{level}}"
-        }
-      ],
-      "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
-    },
-    {
-      "id": 22,
-      "type": "stat",
-      "title": "Daedalus — HTTP 5xx rate",
-      "datasource": {"type": "prometheus", "uid": "${prom}"},
-      "gridPos": {"h": 4, "w": 6, "x": 12, "y": 31},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)"
-        }
-      ],
-      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
-      "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
-    },
-    {
-      "id": 23,
-      "type": "stat",
-      "title": "Daedalus — MCP p95 latency",
-      "datasource": {"type": "prometheus", "uid": "${prom}"},
-      "gridPos": {"h": 4, "w": 6, "x": 18, "y": 31},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m]))"
-        }
-      ],
-      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
-      "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 5}, {"color": "red", "value": 30}]}}}
-    },
-    {
-      "id": 24,
-      "type": "logs",
-      "title": "Daedalus — errors (last 25)",
-      "datasource": {"type": "loki", "uid": "${loki}"},
-      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 39},
-      "targets": [
-        {
-          "refId": "A",
-          "expr": "{service=\"daedalus\"} | json | level=\"ERROR\"",
-          "maxLines": 25
-        }
-      ],
-      "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
-    }
-  ]
-}