feat(observability): add SearXNG, Argos, and Pallas monitoring

- Add SearXNG syslog ingestion and blackbox health probes on miranda and rosalind for per-host attributable failure detection - Scrape Argos MCP application metrics from miranda - Add Pallas dashboard panels for downstream availability and turn error ratios
2026-05-24 23:52:53 -04:00
parent 43fae203d1
commit 3c2f8c57ca
24 changed files with 1968 additions and 938 deletions
--- a/ansible/alloy/miranda/config.alloy.j2
+++ b/ansible/alloy/miranda/config.alloy.j2
@@ -93,6 +93,20 @@ loki.source.syslog "gitea_mcp_logs" {
  forward_to = [loki.write.default.receiver]
 }

+loki.source.syslog "searxng_logs" {
+  listener {
+    address  = "127.0.0.1:{{searxng_syslog_port}}"
+    protocol = "tcp"
+    syslog_format = "{{ syslog_format }}"
+    labels = {
+      job = "searxng",
+      hostname = "{{inventory_hostname}}",
+      environment = "{{deployment_environment}}",
+    }
+  }
+  forward_to = [loki.write.default.receiver]
+}
+
 prometheus.exporter.unix "default" {
  include_exporter_metrics = true
  disable_collectors = ["mdadm"]
@@ -104,6 +118,45 @@ prometheus.scrape "default" {
  job_name = "mcp_docker_host"
 }

+// Argos MCP application metrics (/metrics is exposed by argos itself; see
+// argos/argos_searxng/metrics.py).
+prometheus.scrape "argos" {
+  targets = [{
+    __address__ = "127.0.0.1:{{argos_port}}",
+    job         = "argos",
+    instance    = "{{inventory_hostname}}",
+    hostname    = "{{inventory_hostname}}",
+    environment = "{{deployment_environment}}",
+  }]
+  forward_to      = [prometheus.remote_write.default.receiver]
+  scrape_interval = "30s"
+  metrics_path    = "/metrics"
+}
+
+// Independent verification that this host's SearXNG instance answers /healthz
+// (Argos's own per-instance gauge can lie — argos itself could be sick).
+prometheus.exporter.blackbox "searxng" {
+  config = "{ modules: { http_2xx: { prober: http, timeout: 5s, http: { valid_status_codes: [200] } } } }"
+
+  target {
+    name    = "{{inventory_hostname}}"
+    address = "http://127.0.0.1:{{searxng_port}}/healthz"
+    module  = "http_2xx"
+    labels  = {
+      service     = "searxng",
+      hostname    = "{{inventory_hostname}}",
+      environment = "{{deployment_environment}}",
+    }
+  }
+}
+
+prometheus.scrape "searxng_blackbox" {
+  targets         = prometheus.exporter.blackbox.searxng.targets
+  forward_to      = [prometheus.remote_write.default.receiver]
+  scrape_interval = "30s"
+  job_name        = "searxng_blackbox"
+}
+
 prometheus.remote_write "default" {
  endpoint {
    url = "{{prometheus_remote_write_url}}"
--- a/ansible/alloy/rosalind/config.alloy.j2
+++ b/ansible/alloy/rosalind/config.alloy.j2
@@ -190,6 +190,31 @@ prometheus.scrape "gitea" {
  bearer_token = "{{gitea_metrics_token}}"
 }

+// Independent verification that this host's SearXNG instance answers /healthz.
+// Argos (on miranda) load-balances across this instance and miranda's own;
+// each host's Alloy probes its local SearXNG so failures are attributable.
+prometheus.exporter.blackbox "searxng" {
+  config = "{ modules: { http_2xx: { prober: http, timeout: 5s, http: { valid_status_codes: [200] } } } }"
+
+  target {
+    name    = "{{inventory_hostname}}"
+    address = "http://127.0.0.1:{{searxng_port}}/healthz"
+    module  = "http_2xx"
+    labels  = {
+      service     = "searxng",
+      hostname    = "{{inventory_hostname}}",
+      environment = "{{deployment_environment}}",
+    }
+  }
+}
+
+prometheus.scrape "searxng_blackbox" {
+  targets         = prometheus.exporter.blackbox.searxng.targets
+  forward_to      = [prometheus.remote_write.default.receiver]
+  scrape_interval = "30s"
+  job_name        = "searxng_blackbox"
+}
+
 // Prometheus remote write endpoint
 prometheus.remote_write "default" {
  endpoint {