Files
ouranos/ansible/inventory/host_vars/prospero.incus.yml
Robert Helewka 343b0e13d6 fix(certbot): harden renewal hook and fix permission errors
The renewal deploy-hook ran as the certbot user but lacked permissions to
write the combined PEM to /etc/haproxy/certs and to reload HAProxy,
causing silent failures that left a stale certificate in production until
expiry.

- Add certbot user to the haproxy group so it can write the combined PEM
- Grant certbot NOPASSWD sudo for `systemctl reload haproxy` only
- Make the Prometheus textfile directory group-owned by certbot (0775)
  so cert-metrics.sh can atomically update ssl_cert.prom
- Refactor renewal-hook.sh to always refresh cert metrics on exit via a
  trap, ensuring expiry alerts fire when the hook itself is broken
- Replace `set -e` with explicit error handling and structured logging
2026-06-17 09:58:46 -04:00

174 lines
6.0 KiB
YAML

---
# Prospero Configuration - PPLG Observability & Admin Stack
# Services: pplg (PgAdmin, Prometheus, Loki, Grafana + OAuth2-Proxy)
services:
- alloy
- pplg
# Alloy
alloy_log_level: "warn"
# ============================================================================
# PPLG Domain (TLS termination handled by Titania HAProxy)
# ============================================================================
pplg_domain: "ouranos.helu.ca"
# ============================================================================
# Grafana
# ============================================================================
# Grafana Datasources
prometheus_datasource_name: Prospero-Prometheus
prometheus_host: prospero.incus
prometheus_port: 9090
prometheus_datasource_uid: prospero-prometheus
loki_datasource_name: Prospero-Loki
loki_host: prospero.incus
loki_port: 3100
loki_datasource_uid: prospero-loki
# Grafana Users
grafana_admin_name: "{{ vault_grafana_admin_name }}"
grafana_admin_login: "{{ vault_grafana_admin_login }}"
grafana_admin_password: "{{ vault_grafana_admin_password }}"
grafana_viewer_name: "{{ vault_grafana_viewer_name }}"
grafana_viewer_login: "{{ vault_grafana_viewer_login }}"
grafana_viewer_password: "{{ vault_grafana_viewer_password }}"
# Grafana OAuth (Casdoor SSO)
grafana_oauth_enabled: true
grafana_oauth_name: "Casdoor"
grafana_oauth_client_id: "{{ vault_grafana_oauth_client_id }}"
grafana_oauth_client_secret: "{{ vault_grafana_oauth_client_secret }}"
grafana_oauth_auth_url: "https://id.ouranos.helu.ca/login/oauth/authorize"
grafana_oauth_token_url: "https://id.ouranos.helu.ca/api/login/oauth/access_token"
grafana_oauth_api_url: "https://id.ouranos.helu.ca/api/userinfo"
grafana_oauth_scopes: "openid profile email"
grafana_root_url: "https://grafana.ouranos.helu.ca"
grafana_oauth_allow_sign_up: true
grafana_oauth_skip_tls_verify: false
# ============================================================================
# Prometheus
# ============================================================================
prometheus_user: prometheus
prometheus_group: prometheus
prometheus_scrape_interval: 15s
prometheus_evaluation_interval: 15s
alertmanager_host: prospero.incus
alertmanager_port: 9093
loki_metrics_port: 3100
prometheus_targets:
- 'oberon.incus:9100'
- 'portia.incus:9100'
- 'ariel.incus:9100'
- 'puck.incus:9100'
- 'puck.incus:25571'
- 'miranda.incus:9100'
- 'sycorax.incus:9100'
- 'prospero.incus:9100'
- 'rosalind.incus:9100'
- 'umbriel.incus:9100'
# Neo4j scrape targets (neo4j-apoc-exporter sidecar on each Neo4j host)
neo4j_metrics_targets:
- 'ariel.incus:22094'
- 'umbriel.incus:22094'
# Pallas scrape targets — one entry per Pallas deployment (registry
# port). The `instance` label distinguishes deployments; the `agent`
# dimension comes from labels on the metrics themselves.
pallas_metrics_targets:
- targets: ['caliban.incus:24000']
labels: {instance: iolaus}
- targets: ['caliban.incus:24100']
labels: {instance: kottos}
- targets: ['caliban.incus:24200']
labels: {instance: mentor}
# Prometheus OAuth2-Proxy Sidecar
prometheus_proxy_port: 9091
prometheus_oauth2_proxy_dir: /etc/oauth2-proxy-prometheus
prometheus_oauth2_proxy_version: "7.6.0"
prometheus_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca"
prometheus_oauth2_client_id: "{{ vault_prometheus_oauth_client_id }}"
prometheus_oauth2_client_secret: "{{ vault_prometheus_oauth_client_secret }}"
prometheus_oauth2_cookie_secret: "{{ vault_prometheus_oauth_cookie_secret }}"
# ============================================================================
# Alertmanager
# ============================================================================
alertmanager_user: prometheus
alertmanager_group: prometheus
alertmanager_resolve_timeout: 5m
alertmanager_group_wait: 30s
alertmanager_group_interval: 5m
alertmanager_repeat_interval: 4h
pushover_user_key: "{{ vault_pushover_user_key }}"
pushover_api_token: "{{ vault_pushover_api_token }}"
pushover_priority: 1
pushover_retry: 30
pushover_expire: 3600
# ============================================================================
# Loki
# ============================================================================
loki_user: loki
loki_group: loki
loki_data_dir: /var/lib/loki
loki_config_dir: /etc/loki
loki_config_file: config.yml
loki_grpc_port: 9096
# ============================================================================
# PgAdmin (Gunicorn - no Apache)
# ============================================================================
pgadmin_user: pgadmin
pgadmin_group: pgadmin
pgadmin_port: 5050
pgadmin_data_dir: /var/lib/pgadmin
pgadmin_log_dir: /var/log/pgadmin
pgadmin_email: "{{ vault_pgadmin_email }}"
pgadmin_password: "{{ vault_pgadmin_password }}"
# PgAdmin OAuth (Casdoor SSO)
pgadmin_oauth_client_id: "{{ vault_pgadmin_oauth_client_id }}"
pgadmin_oauth_client_secret: "{{ vault_pgadmin_oauth_client_secret }}"
# ============================================================================
# Prometheus Metrics Scraping
# ============================================================================
# Casdoor
casdoor_metrics_host: titania.incus
casdoor_metrics_port: 22081
casdoor_prometheus_access_key: "{{ vault_casdoor_prometheus_access_key }}"
casdoor_prometheus_access_secret: "{{ vault_casdoor_prometheus_access_secret }}"
# Daedalus Metrics
daedalus_metrics_host: caliban.incus
daedalus_metrics_port: 23081
# Mnemosyne — two scrape targets:
# app: Django /metrics via nginx (django-prometheus + custom pipeline/MCP counters)
# web: nginx-prometheus-exporter sidecar (nginx stub_status → Prometheus format)
mnemosyne_app_metrics_host: caliban.incus
mnemosyne_app_metrics_port: 23181
mnemosyne_web_metrics_host: caliban.incus
mnemosyne_web_metrics_port: 23191
# Athena — two scrape targets (same shape as Mnemosyne):
# app: Django /metrics via nginx (django-prometheus)
# web: nginx-prometheus-exporter sidecar (nginx stub_status → Prometheus format)
athena_app_metrics_host: puck.incus
athena_app_metrics_port: 22481
athena_web_metrics_host: puck.incus
athena_web_metrics_port: 22491