feat(observability): add SearXNG, Argos, and Pallas monitoring

- Add SearXNG syslog ingestion and blackbox health probes on miranda
  and rosalind so that failures can be attributed to a specific host
- Scrape Argos MCP application metrics from miranda
- Add Pallas dashboard panels for downstream availability and turn
  error ratios
This commit is contained in:
2026-05-24 23:52:53 -04:00
parent 43fae203d1
commit 3c2f8c57ca
24 changed files with 1968 additions and 938 deletions

View File

@@ -15,8 +15,7 @@ neo4j_syslog_port: 51414
neo4j_user: neo4j
neo4j_group: neo4j
neo4j_directory: /srv/neo4j
neo4j_auth_user: neo4j
neo4j_auth_password: "{{ vault_neo4j_auth_password }}"
neo4j_password: "{{ vault_neo4j_cypher_password }}"
neo4j_http_port: 22084
neo4j_bolt_port: 22074
neo4j_metrics_port: 22094

View File

@@ -10,21 +10,23 @@ services:
- grafana_mcp
- mcpo
- neo4j_mcp
- searxng
# Alloy
alloy_log_level: "warn"
argos_syslog_port: 51434
neo4j_cypher_syslog_port: 51431
grafana_mcp_syslog_port: 51433
gitea_mcp_syslog_port: 51435
argos_syslog_port: 51418
neo4j_cypher_syslog_port: 51414
grafana_mcp_syslog_port: 51413
gitea_mcp_syslog_port: 51412
searxng_syslog_port: 51419
# Argos MCP Configuration
argos_user: argos
argos_group: argos
argos_directory: /srv/argos
argos_port: 25534
argos_port: 20861
argos_log_level: INFO
argos_searxng_instances: http://rosalind.incus:22089/
argos_searxng_instances: http://miranda.incus:22089/,http://rosalind.incus:22089/
argos_cache_ttl: 300
argos_max_results: 10
argos_request_timeout: 30.0
@@ -48,7 +50,7 @@ neo4j_mcp_directory: /srv/neo4j_mcp
grafana_mcp_user: grafana_mcp
grafana_mcp_group: grafana_mcp
grafana_mcp_directory: /srv/grafana_mcp
grafana_mcp_port: 25533
grafana_mcp_port: 22063
grafana_mcp_grafana_host: prospero.incus
grafana_mcp_grafana_port: 3000
grafana_service_account_token: "{{ vault_grafana_service_account_token }}"
@@ -57,21 +59,28 @@ grafana_service_account_token: "{{ vault_grafana_service_account_token }}"
gitea_mcp_user: gitea_mcp
gitea_mcp_group: gitea_mcp
gitea_mcp_directory: /srv/gitea_mcp
gitea_mcp_port: 25535
gitea_mcp_port: 22062
gitea_mcp_host: https://gitea.ouranos.helu.ca
gitea_mcp_access_token: "{{ vault_gitea_mcp_access_token }}"
# Neo4j Cypher MCP
neo4j_host: ariel.incus
neo4j_bolt_port: 7687
neo4j_auth_password: "{{ vault_neo4j_auth_password }}"
neo4j_cypher_mcp_port: 25531
# Nike MCP
nike_mcp_url: http://puck.incus:25576/mcp
neo4j_bolt_port: 22074
neo4j_cypher_password: "{{ vault_neo4j_cypher_password }}"
neo4j_cypher_mcp_port: 22064
neo4j_mcp_server_allowed_hosts: localhost,127.0.0.1,miranda.incus
# MCPO Config
mcpo_user: mcpo
mcpo_group: mcpo
mcpo_directory: /srv/mcpo
mcpo_port: 25530
# SearXNG Configuration
searxng_user: searxng
searxng_group: searxng
searxng_directory: /srv/searxng
searxng_port: 22089
searxng_base_url: http://miranda.incus:22089/
searxng_instance_name: "Ouranos Search"
searxng_secret_key: "{{ vault_searxng_secret_key }}"

View File

@@ -74,6 +74,22 @@ prometheus_targets:
- 'rosalind.incus:9100'
- 'umbriel.incus:9100'
# Neo4j scrape targets (neo4j-apoc-exporter sidecar on each Neo4j host)
neo4j_metrics_targets:
- 'ariel.incus:22094'
- 'umbriel.incus:22094'
# Pallas scrape targets — one entry per Pallas deployment; each target
# port is that deployment's registry port. The `instance` label
# distinguishes deployments; the `agent` dimension comes from labels on
# the metrics themselves.
pallas_metrics_targets:
- targets: ['caliban.incus:24000']
labels: {instance: iolaus}
- targets: ['caliban.incus:24100']
labels: {instance: kottos}
- targets: ['caliban.incus:24200']
labels: {instance: mentor}
# Prometheus OAuth2-Proxy Sidecar
prometheus_proxy_port: 9091
prometheus_oauth2_proxy_dir: /etc/oauth2-proxy-prometheus
@@ -127,10 +143,23 @@ pgadmin_oauth_client_id: "{{ vault_pgadmin_oauth_client_id }}"
pgadmin_oauth_client_secret: "{{ vault_pgadmin_oauth_client_secret }}"
# ============================================================================
# Casdoor Metrics (for Prometheus scraping)
# Prometheus Metrics Scraping
# ============================================================================
casdoor_metrics_host: "titania.incus"
# Casdoor
casdoor_metrics_host: titania.incus
casdoor_metrics_port: 22081
casdoor_prometheus_access_key: "{{ vault_casdoor_prometheus_access_key }}"
casdoor_prometheus_access_secret: "{{ vault_casdoor_prometheus_access_secret }}"
# Daedalus Metrics
daedalus_metrics_host: caliban.incus
daedalus_metrics_port: 23081
# Mnemosyne — two scrape targets:
# app: Django /metrics via nginx (django-prometheus + custom pipeline/MCP counters)
# web: nginx-prometheus-exporter sidecar (nginx stub_status → Prometheus format)
mnemosyne_app_metrics_host: caliban.incus
mnemosyne_app_metrics_port: 23181
mnemosyne_web_metrics_host: caliban.incus
mnemosyne_web_metrics_port: 23191

View File

@@ -37,14 +37,17 @@ daedalus_syslog_port: 51430
# Daedalus (FastAPI on puck, behind nginx)
daedalus_metrics_host: "puck.incus"
daedalus_metrics_port: 22181
daedalus_metrics_port: 23081
# Mnemosyne — /metrics is served by nginx (mnemosyne-web:23181) and
# proxied to the Django app container, which owns the single
# prometheus_client process registry that both django-prometheus
# (HTTP / Celery) and the MCP server's tool-call counters write to.
mnemosyne_metrics_host: "puck.incus"
mnemosyne_metrics_port: 23181
# Mnemosyne — two metrics surfaces:
# app (23181): /metrics served by nginx → Django app container, which owns
# the single prometheus_client process registry that both django-prometheus
# (HTTP / Celery) and the MCP server's tool-call counters write to.
# web (23191): nginx-prometheus-exporter sidecar scraping nginx stub_status.
mnemosyne_app_metrics_host: "puck.incus"
mnemosyne_app_metrics_port: 23181
mnemosyne_web_metrics_host: "puck.incus"
mnemosyne_web_metrics_port: 23191
# =============================================================================
# Kottos Configuration (Pallas FastAgent runtime)

View File

@@ -122,8 +122,8 @@ haproxy_backends:
health_path: "/api/healthz"
- subdomain: "daedalus"
backend_host: "puck.incus"
backend_port: 20080
backend_host: "caliban.incus"
backend_port: 20081
health_path: "/ready/"
timeout_server: 120s
@@ -133,8 +133,8 @@ haproxy_backends:
health_path: "/chat"
- subdomain: "mnemosyne"
backend_host: "puck.incus"
backend_port: 23181
backend_host: "caliban.incus"
backend_port: 23081
health_path: "/ready/"
- subdomain: "nextcloud"

View File

@@ -19,8 +19,7 @@ neo4j_syslog_port: 51414
neo4j_user: neo4j
neo4j_group: neo4j
neo4j_directory: /srv/neo4j
neo4j_auth_user: neo4j
neo4j_auth_password: "{{ vault_mnemosyne_neo4j_auth_password }}"
neo4j_password: "{{ vault_neo4j_mnemosyne_password }}"
neo4j_http_port: 22084
neo4j_bolt_port: 22074
neo4j_metrics_port: 22094