The renewal deploy-hook ran as the certbot user but lacked permissions to write the combined PEM to /etc/haproxy/certs and to reload HAProxy, causing silent failures that left a stale certificate in production until expiry.

- Add certbot user to the haproxy group so it can write the combined PEM
- Grant certbot NOPASSWD sudo for `systemctl reload haproxy` only
- Make the Prometheus textfile directory group-owned by certbot (0775) so cert-metrics.sh can atomically update ssl_cert.prom
- Refactor renewal-hook.sh to always refresh cert metrics on exit via a trap, ensuring expiry alerts fire even when the hook itself is broken
- Replace `set -e` with explicit error handling and structured logging
174 lines
6.0 KiB
YAML
174 lines
6.0 KiB
YAML
---
# Prospero Configuration - PPLG Observability & Admin Stack
# Services: pplg (PgAdmin, Prometheus, Loki, Grafana + OAuth2-Proxy)

# Services enabled for this host (one list item per service name).
services:
  - alloy
  - pplg

# Alloy
alloy_log_level: "warn"

# ============================================================================
# PPLG Domain (TLS termination handled by Titania HAProxy)
# ============================================================================

pplg_domain: "ouranos.helu.ca"

# ============================================================================
# Grafana
# ============================================================================

# Grafana Datasources
# Each datasource is described by a display name, host, port and UID.
prometheus_datasource_name: Prospero-Prometheus
prometheus_host: prospero.incus
prometheus_port: 9090
prometheus_datasource_uid: prospero-prometheus
loki_datasource_name: Prospero-Loki
loki_host: prospero.incus
loki_port: 3100
loki_datasource_uid: prospero-loki

# Grafana Users
# All values are taken from vault_* variables; the templates stay quoted so
# YAML does not read the leading `{` as a flow mapping.
grafana_admin_name: "{{ vault_grafana_admin_name }}"
grafana_admin_login: "{{ vault_grafana_admin_login }}"
grafana_admin_password: "{{ vault_grafana_admin_password }}"
grafana_viewer_name: "{{ vault_grafana_viewer_name }}"
grafana_viewer_login: "{{ vault_grafana_viewer_login }}"
grafana_viewer_password: "{{ vault_grafana_viewer_password }}"

# Grafana OAuth (Casdoor SSO)
# The authorize, token and userinfo endpoints all live on id.ouranos.helu.ca.
grafana_oauth_enabled: true
grafana_oauth_name: "Casdoor"
grafana_oauth_client_id: "{{ vault_grafana_oauth_client_id }}"
grafana_oauth_client_secret: "{{ vault_grafana_oauth_client_secret }}"
grafana_oauth_auth_url: "https://id.ouranos.helu.ca/login/oauth/authorize"
grafana_oauth_token_url: "https://id.ouranos.helu.ca/api/login/oauth/access_token"
grafana_oauth_api_url: "https://id.ouranos.helu.ca/api/userinfo"
grafana_oauth_scopes: "openid profile email"
grafana_root_url: "https://grafana.ouranos.helu.ca"
grafana_oauth_allow_sign_up: true
grafana_oauth_skip_tls_verify: false

# ============================================================================
# Prometheus
# ============================================================================

prometheus_user: prometheus
prometheus_group: prometheus
prometheus_scrape_interval: 15s
prometheus_evaluation_interval: 15s
alertmanager_host: prospero.incus
alertmanager_port: 9093
# Same port number as loki_port (3100) in the Grafana datasource settings.
loki_metrics_port: 3100
# host:port scrape targets, one per list item.
# NOTE(review): 'puck.incus:25571' is the only entry not on port 9100 —
# confirm it belongs in this list rather than in a dedicated target group.
prometheus_targets:
  - 'oberon.incus:9100'
  - 'portia.incus:9100'
  - 'ariel.incus:9100'
  - 'puck.incus:9100'
  - 'puck.incus:25571'
  - 'miranda.incus:9100'
  - 'sycorax.incus:9100'
  - 'prospero.incus:9100'
  - 'rosalind.incus:9100'
  - 'umbriel.incus:9100'

# Neo4j scrape targets (neo4j-apoc-exporter sidecar on each Neo4j host)
neo4j_metrics_targets:
  - 'ariel.incus:22094'
  - 'umbriel.incus:22094'

# Pallas scrape targets — one entry per Pallas deployment (registry
# port). The `instance` label distinguishes deployments; the `agent`
# dimension comes from labels on the metrics themselves.
pallas_metrics_targets:
  # `labels` must be nested in the same list item as its `targets`; written
  # at column 0 it would end the list and become a separate top-level key.
  - targets: ['caliban.incus:24000']
    labels: {instance: iolaus}
  - targets: ['caliban.incus:24100']
    labels: {instance: kottos}
  - targets: ['caliban.incus:24200']
    labels: {instance: mentor}

# Prometheus OAuth2-Proxy Sidecar
prometheus_proxy_port: 9091
prometheus_oauth2_proxy_dir: /etc/oauth2-proxy-prometheus
# Quoted so the version stays a string.
prometheus_oauth2_proxy_version: "7.6.0"
prometheus_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca"
prometheus_oauth2_client_id: "{{ vault_prometheus_oauth_client_id }}"
prometheus_oauth2_client_secret: "{{ vault_prometheus_oauth_client_secret }}"
prometheus_oauth2_cookie_secret: "{{ vault_prometheus_oauth_cookie_secret }}"

# ============================================================================
# Alertmanager
# ============================================================================

# Alertmanager uses the same user/group values as Prometheus.
alertmanager_user: prometheus
alertmanager_group: prometheus
alertmanager_resolve_timeout: 5m
alertmanager_group_wait: 30s
alertmanager_group_interval: 5m
alertmanager_repeat_interval: 4h
pushover_user_key: "{{ vault_pushover_user_key }}"
pushover_api_token: "{{ vault_pushover_api_token }}"
# NOTE(review): Pushover appears to honor retry/expire only for emergency
# priority (2); with priority 1 they may have no effect — confirm intent.
pushover_priority: 1
pushover_retry: 30
pushover_expire: 3600

# ============================================================================
# Loki
# ============================================================================

loki_user: loki
loki_group: loki
loki_data_dir: /var/lib/loki
loki_config_dir: /etc/loki
# File name only; presumably resolved relative to loki_config_dir — verify
# against the consuming role/template.
loki_config_file: config.yml
loki_grpc_port: 9096

# ============================================================================
# PgAdmin (Gunicorn - no Apache)
# ============================================================================

pgadmin_user: pgadmin
pgadmin_group: pgadmin
pgadmin_port: 5050
pgadmin_data_dir: /var/lib/pgadmin
pgadmin_log_dir: /var/log/pgadmin
pgadmin_email: "{{ vault_pgadmin_email }}"
pgadmin_password: "{{ vault_pgadmin_password }}"

# PgAdmin OAuth (Casdoor SSO)
pgadmin_oauth_client_id: "{{ vault_pgadmin_oauth_client_id }}"
pgadmin_oauth_client_secret: "{{ vault_pgadmin_oauth_client_secret }}"

# ============================================================================
# Prometheus Metrics Scraping
# ============================================================================

# Casdoor
# The access key and secret are taken from vault_* variables.
casdoor_metrics_host: titania.incus
casdoor_metrics_port: 22081
casdoor_prometheus_access_key: "{{ vault_casdoor_prometheus_access_key }}"
casdoor_prometheus_access_secret: "{{ vault_casdoor_prometheus_access_secret }}"

# Daedalus Metrics
daedalus_metrics_host: caliban.incus
daedalus_metrics_port: 23081

# Mnemosyne — two scrape targets:
#   app: Django /metrics via nginx (django-prometheus + custom pipeline/MCP counters)
#   web: nginx-prometheus-exporter sidecar (nginx stub_status → Prometheus format)
mnemosyne_app_metrics_host: caliban.incus
mnemosyne_app_metrics_port: 23181
mnemosyne_web_metrics_host: caliban.incus
mnemosyne_web_metrics_port: 23191

# Athena — two scrape targets (same shape as Mnemosyne):
#   app: Django /metrics via nginx (django-prometheus)
#   web: nginx-prometheus-exporter sidecar (nginx stub_status → Prometheus format)
athena_app_metrics_host: puck.incus
athena_app_metrics_port: 22481
athena_web_metrics_host: puck.incus
athena_web_metrics_port: 22491