diff --git a/ansible/alloy/prospero/config.alloy.j2 b/ansible/alloy/prospero/config.alloy.j2 index b0c09b4..0f23c93 100644 --- a/ansible/alloy/prospero/config.alloy.j2 +++ b/ansible/alloy/prospero/config.alloy.j2 @@ -1,6 +1,6 @@ // Prospero Alloy Configuration // Red Panda Approved 🐼 -// Services: PPLG stack (Grafana, Prometheus, Loki, Alertmanager, PgAdmin, HAProxy, OAuth2-Proxy) +// Services: PPLG stack (Grafana, Prometheus, Loki, Alertmanager, PgAdmin, OAuth2-Proxy) logging { level = "{{alloy_log_level}}" @@ -19,20 +19,6 @@ loki.source.file "system_logs" { forward_to = [loki.write.default.receiver] } -// PPLG HAProxy syslog receiver (HAProxy syslog → Alloy → Loki) -loki.source.syslog "pplg_haproxy" { - listener { - address = "127.0.0.1:{{pplg_haproxy_syslog_port}}" - protocol = "tcp" - labels = { - job = "pplg-haproxy", - hostname = "{{inventory_hostname}}", - environment = "{{deployment_environment}}", - } - } - forward_to = [loki.write.default.receiver] -} - // Journal relabeling - assign dedicated job labels per systemd unit loki.relabel "journal" { forward_to = [] diff --git a/ansible/inventory/host_vars/prospero.incus.yml b/ansible/inventory/host_vars/prospero.incus.yml index 2745a42..37c7738 100644 --- a/ansible/inventory/host_vars/prospero.incus.yml +++ b/ansible/inventory/host_vars/prospero.incus.yml @@ -1,6 +1,6 @@ --- # Prospero Configuration - PPLG Observability & Admin Stack -# Services: pplg (PgAdmin, Prometheus, Loki, Grafana + HAProxy + OAuth2-Proxy) +# Services: pplg (PgAdmin, Prometheus, Loki, Grafana + OAuth2-Proxy) ansible_user: robert @@ -12,17 +12,10 @@ services: alloy_log_level: "warn" # ============================================================================ -# PPLG HAProxy Configuration +# PPLG Domain (TLS termination handled by Titania HAProxy) # ============================================================================ -pplg_haproxy_user: haproxy -pplg_haproxy_group: haproxy -pplg_haproxy_uid: 800 -pplg_haproxy_gid: 800 
-pplg_haproxy_domain: "ouranos.helu.ca" -pplg_haproxy_cert_path: /etc/haproxy/certs/ouranos.pem -pplg_haproxy_stats_port: 8404 -pplg_haproxy_syslog_port: 51405 +pplg_domain: "ouranos.helu.ca" # ============================================================================ # Grafana diff --git a/ansible/inventory/host_vars/titania.incus.yml b/ansible/inventory/host_vars/titania.incus.yml index 3d2a835..cc71120 100644 --- a/ansible/inventory/host_vars/titania.incus.yml +++ b/ansible/inventory/host_vars/titania.incus.yml @@ -89,31 +89,26 @@ haproxy_backends: backend_host: "prospero.incus" backend_port: 5050 health_path: "/misc/ping" - ssl_backend: true - subdomain: "grafana" backend_host: "prospero.incus" backend_port: 3000 health_path: "/api/health" - ssl_backend: true - subdomain: "prometheus" backend_host: "prospero.incus" - backend_port: 9090 + backend_port: 9091 # OAuth2-Proxy sidecar (skips auth for /api/v1/write and /ping) health_path: "/ping" - ssl_backend: true - subdomain: "loki" backend_host: "prospero.incus" backend_port: 3100 health_path: "/ready" - ssl_backend: true - subdomain: "alertmanager" backend_host: "prospero.incus" backend_port: 9093 health_path: "/-/healthy" - ssl_backend: true - subdomain: "gitea" backend_host: "rosalind.incus" diff --git a/ansible/pplg/deploy.yml b/ansible/pplg/deploy.yml index 17f84e0..b820c5f 100644 --- a/ansible/pplg/deploy.yml +++ b/ansible/pplg/deploy.yml @@ -1,6 +1,7 @@ --- # PPLG - Consolidated Observability & Admin Stack for Prospero -# PgAdmin, Prometheus, Loki, Grafana + HAProxy (TLS) + OAuth2-Proxy (Prometheus UI) +# PgAdmin, Prometheus, Loki, Grafana + OAuth2-Proxy (Prometheus UI) +# TLS termination handled by Titania HAProxy # Red Panda Approved - name: Deploy PPLG Stack @@ -47,7 +48,6 @@ ansible.builtin.apt: name: - acl - - haproxy - prometheus - loki - grafana @@ -372,83 +372,6 @@ state: started daemon_reload: true - # =========================================================================== - # SSL Certificate 
Distribution (from Titania) - # =========================================================================== - - - name: Create haproxy group - ansible.builtin.group: - name: "{{pplg_haproxy_group}}" - gid: "{{pplg_haproxy_gid}}" - system: true - - - name: Create haproxy user - ansible.builtin.user: - name: "{{pplg_haproxy_user}}" - comment: "PPLG HAProxy" - group: "{{pplg_haproxy_group}}" - uid: "{{pplg_haproxy_uid}}" - system: true - - - name: Create HAProxy directories - ansible.builtin.file: - path: "{{item}}" - state: directory - owner: "{{pplg_haproxy_user}}" - group: "{{pplg_haproxy_group}}" - mode: '750' - loop: - - /etc/haproxy - - /etc/haproxy/certs - - - name: Fetch wildcard certificate from Titania - ansible.builtin.fetch: - src: /etc/haproxy/certs/ouranos.pem - dest: /tmp/ouranos-haproxy.pem - flat: yes - delegate_to: titania.incus - when: "'titania.incus' in groups['ubuntu']" - - - name: Deploy wildcard certificate - ansible.builtin.copy: - src: /tmp/ouranos-haproxy.pem - dest: "{{pplg_haproxy_cert_path}}" - owner: "{{pplg_haproxy_user}}" - group: "{{pplg_haproxy_group}}" - mode: '0640' - when: "'titania.incus' in groups['ubuntu']" - - - name: Generate self-signed wildcard certificate (fallback) - command: > - openssl req -x509 -nodes -days 365 -newkey rsa:2048 - -keyout {{pplg_haproxy_cert_path}} - -out {{pplg_haproxy_cert_path}} - -subj "/C=US/ST=State/L=City/O=Ouranos/CN=*.{{pplg_haproxy_domain}}" - -addext "subjectAltName=DNS:*.{{pplg_haproxy_domain}},DNS:{{pplg_haproxy_domain}}" - when: "'titania.incus' not in groups['ubuntu']" - args: - creates: "{{pplg_haproxy_cert_path}}" - - # =========================================================================== - # HAProxy (TLS Termination) - # =========================================================================== - - - name: Template HAProxy configuration - ansible.builtin.template: - src: pplg-haproxy.cfg.j2 - dest: /etc/haproxy/haproxy.cfg - owner: "{{pplg_haproxy_user}}" - group: 
"{{pplg_haproxy_group}}" - mode: "640" - validate: haproxy -c -f %s - notify: restart haproxy - - - name: Enable and start HAProxy service - ansible.builtin.systemd: - name: haproxy - enabled: true - state: started - # =========================================================================== # Handlers # =========================================================================== @@ -484,11 +407,6 @@ ansible.builtin.systemd: daemon_reload: true - - name: restart haproxy - ansible.builtin.systemd: - name: haproxy - state: reloaded - - name: restart oauth2-proxy-prometheus ansible.builtin.systemd: name: oauth2-proxy-prometheus diff --git a/ansible/pplg/oauth2-proxy-prometheus.cfg.j2 b/ansible/pplg/oauth2-proxy-prometheus.cfg.j2 index caa894b..4081d24 100644 --- a/ansible/pplg/oauth2-proxy-prometheus.cfg.j2 +++ b/ansible/pplg/oauth2-proxy-prometheus.cfg.j2 @@ -10,7 +10,7 @@ client_id = "{{prometheus_oauth2_client_id}}" client_secret = "{{prometheus_oauth2_client_secret}}" # Redirect URL after authentication -redirect_url = "https://prometheus.{{pplg_haproxy_domain}}/oauth2/callback" +redirect_url = "https://prometheus.{{pplg_domain}}/oauth2/callback" # Upstream service (Prometheus) upstreams = [ @@ -24,7 +24,7 @@ cookie_secure = true cookie_httponly = true cookie_expire = "168h" cookie_refresh = "1h" -cookie_domains = ".{{pplg_haproxy_domain}}" +cookie_domains = ".{{pplg_domain}}" session_store_type = "cookie" # Authentication settings @@ -49,9 +49,10 @@ http_address = "0.0.0.0:{{prometheus_proxy_port}}" reverse_proxy = true real_client_ip_header = "X-Forwarded-For" -# Skip authentication for health check endpoints +# Skip authentication for health check and machine-to-machine endpoints skip_auth_routes = [ - "^/ping$" + "^/ping$", + "^/api/v1/write$" ] # OIDC specific settings diff --git a/ansible/pplg/pgadmin.service.j2 b/ansible/pplg/pgadmin.service.j2 index 7d299f5..c582690 100644 --- a/ansible/pplg/pgadmin.service.j2 +++ b/ansible/pplg/pgadmin.service.j2 @@ -9,7 
+9,7 @@ User={{pgadmin_user}} Group={{pgadmin_group}} WorkingDirectory=/usr/pgadmin4/web ExecStart=/usr/pgadmin4/venv/bin/python3 -m gunicorn pgAdmin4:app \ - --bind 127.0.0.1:{{pgadmin_port}} \ + --bind 0.0.0.0:{{pgadmin_port}} \ --workers 1 \ --threads 4 \ --timeout 120 \ diff --git a/ansible/pplg/pplg-haproxy.cfg.j2 b/ansible/pplg/pplg-haproxy.cfg.j2 deleted file mode 100644 index 4fb4a7f..0000000 --- a/ansible/pplg/pplg-haproxy.cfg.j2 +++ /dev/null @@ -1,127 +0,0 @@ -# PPLG HAProxy - Internal TLS Termination for Prospero -# Services: Grafana, PgAdmin, Prometheus (via OAuth2-Proxy), Loki, Alertmanager -# Managed by Ansible - Red Panda Approved - -global - log 127.0.0.1:{{pplg_haproxy_syslog_port}} local0 - stats timeout 30s - - # Default SSL material locations - ca-base /etc/ssl/certs - crt-base /etc/ssl/private - - # SSL/TLS configuration - ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384 - ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256 - ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets - -defaults - log global - mode http - option httplog - option dontlognull - log-format "%ci:%cp [%tr] %ft %b/%s %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r" - timeout connect 5s - timeout client 50s - timeout server 50s - -# Stats page with Prometheus metrics -listen stats - bind *:{{pplg_haproxy_stats_port}} - mode http - stats enable - stats uri /metrics - stats refresh 15s - stats show-legends - stats show-node - - # Prometheus metrics endpoint - http-request use-service prometheus-exporter if { path /metrics } - -# HTTP frontend - redirect all traffic to HTTPS -frontend http_frontend - bind *:80 - mode http - option httplog - http-request redirect scheme https code 301 - -# HTTPS frontend with subdomain-based routing -frontend https_frontend - bind *:443 ssl crt 
{{pplg_haproxy_cert_path}} - mode http - option httplog - option forwardfor - - # Forward original protocol and host - http-request set-header X-Forwarded-Proto https - http-request set-header X-Forwarded-Port %[dst_port] - http-request set-header X-Forwarded-Host %[req.hdr(Host)] - - # Security headers - http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains" - http-response set-header X-Frame-Options "SAMEORIGIN" - http-response set-header X-Content-Type-Options "nosniff" - http-response set-header X-XSS-Protection "1; mode=block" - - # Subdomain ACLs - acl host_grafana hdr_beg(host) -i grafana.{{pplg_haproxy_domain}} - acl host_pgadmin hdr_beg(host) -i pgadmin.{{pplg_haproxy_domain}} - acl host_prometheus hdr_beg(host) -i prometheus.{{pplg_haproxy_domain}} - acl host_loki hdr_beg(host) -i loki.{{pplg_haproxy_domain}} - acl host_alertmanager hdr_beg(host) -i alertmanager.{{pplg_haproxy_domain}} - - # Prometheus write API - bypass OAuth2-Proxy (machine-to-machine) - acl is_prometheus_write path_beg /api/v1/write - - use_backend backend_grafana if host_grafana - use_backend backend_pgadmin if host_pgadmin - use_backend backend_prometheus_direct if host_prometheus is_prometheus_write - use_backend backend_prometheus if host_prometheus - use_backend backend_loki if host_loki - use_backend backend_alertmanager if host_alertmanager - -# Grafana - Native Casdoor OAuth SSO -backend backend_grafana - mode http - balance roundrobin - option httpchk GET /api/health - http-check expect status 200 - server grafana_1 127.0.0.1:3000 check - -# PgAdmin - Native Casdoor OAuth SSO -backend backend_pgadmin - mode http - balance roundrobin - option httpchk GET /misc/ping - http-check expect status 200 - server pgadmin_1 127.0.0.1:{{pgadmin_port}} check - -# Prometheus UI - via OAuth2-Proxy sidecar -backend backend_prometheus - mode http - balance roundrobin - option httpchk GET /ping - http-check expect status 200 - server prometheus_1 
127.0.0.1:{{prometheus_proxy_port}} check - -# Prometheus Write API - direct (no auth, machine-to-machine) -backend backend_prometheus_direct - mode http - balance roundrobin - server prometheus_write_1 127.0.0.1:9090 check - -# Loki - no auth (machine-to-machine log ingestion) -backend backend_loki - mode http - balance roundrobin - option httpchk GET /ready - http-check expect status 200 - server loki_1 127.0.0.1:{{loki_port}} check - -# Alertmanager - internal only -backend backend_alertmanager - mode http - balance roundrobin - option httpchk GET /-/healthy - http-check expect status 200 - server alertmanager_1 127.0.0.1:{{alertmanager_port}} check diff --git a/docs/pplg.md b/docs/pplg.md index b6bb5c5..7a22a80 100644 --- a/docs/pplg.md +++ b/docs/pplg.md @@ -2,12 +2,11 @@ ## Overview -PPLG is the consolidated observability and administration stack running on **Prospero**. It bundles PgAdmin, Prometheus, Loki, and Grafana behind an internal HAProxy for TLS termination, with Casdoor SSO for user-facing services and OAuth2-Proxy as a sidecar for Prometheus UI authentication. +PPLG is the consolidated observability and administration stack running on **Prospero**. It bundles PgAdmin, Prometheus, Loki, and Grafana with Casdoor SSO for user-facing services and OAuth2-Proxy as a sidecar for Prometheus UI authentication. TLS termination is handled by Titania's HAProxy, which routes directly to each service on Prospero. 
**Host:** prospero.incus **Role:** Observability -**Incus Ports:** 25510 → 443 (HTTPS), 25511 → 80 (HTTP redirect) -**External Access:** Via Titania HAProxy → `prospero.incus:443` +**External Access:** Via Titania HAProxy → `prospero.incus` (direct to service ports) | Subdomain | Service | Auth Method | |-----------|---------|-------------| @@ -23,33 +22,23 @@ PPLG is the consolidated observability and administration stack running on **Pro ┌──────────┐ ┌────────────┐ ┌─────────────────────────────────────────────────┐ │ Client │─────▶│ HAProxy │─────▶│ Prospero (PPLG) │ │ │ │ (Titania) │ │ │ -└──────────┘ │ :443 → :443 │ ┌──────────────────────────────────────────┐ │ - └────────────┘ │ │ HAProxy (systemd, :443/:80) │ │ - │ │ TLS termination + subdomain routing │ │ -┌──────────┐ │ └───┬──────┬──────┬──────┬──────┬──────────┘ │ -│ Alloy │──push──────────────────────────▶│ │ │ │ │ -│ (agents) │ loki.ouranos.helu.ca │ │ │ │ │ │ -│ │ prometheus.ouranos.helu.ca │ │ │ │ │ -└──────────┘ │ ▼ ▼ ▼ ▼ ▼ │ - │ Grafana PgAdmin OAuth2 Loki Alertmanager │ - │ :3000 :5050 Proxy :3100 :9093 │ - │ :9091 │ - │ │ │ - │ ▼ │ - │ Prometheus │ - │ :9090 │ - └─────────────────────────────────────────────────┘ +└──────────┘ │ :443 TLS │ │ Grafana (:3000) — Casdoor OAuth │ + │ termination│ │ 
PgAdmin (:5050) — Casdoor OAuth │ +┌──────────┐ └────────────┘ │ OAuth2-Proxy (:9091) → Prometheus (:9090) │ +│ Alloy │─────────────────────────▶│ Loki (:3100) — no auth │ +│ (agents) │ │ Alertmanager (:9093) — no auth │ +└──────────┘ └─────────────────────────────────────────────────┘ ``` ### Traffic Flow | Source | Destination | Path | Auth | |--------|-------------|------|------| -| Browser → Grafana | Titania :443 → Prospero :443 → HAProxy → :3000 | Subdomain ACL | Casdoor OAuth | -| Browser → PgAdmin | Titania :443 → Prospero :443 → HAProxy → :5050 | Subdomain ACL | Casdoor OAuth | -| Browser → Prometheus | Titania :443 → Prospero :443 → HAProxy → OAuth2-Proxy :9091 → :9090 | Subdomain ACL | OAuth2-Proxy → Casdoor | -| Alloy → Loki | `https://loki.ouranos.helu.ca` → HAProxy :443 → :3100 | Subdomain ACL | None | -| Alloy → Prometheus | `https://prometheus.ouranos.helu.ca/api/v1/write` → HAProxy :443 → :9090 | `skip_auth_route` | None | +| Browser → Grafana | Titania :443 → Prospero :3000 | Subdomain ACL | Casdoor OAuth | +| Browser → PgAdmin | Titania :443 → Prospero :5050 | Subdomain ACL | Casdoor OAuth | +| Browser → Prometheus | Titania :443 → Prospero :9091 (OAuth2-Proxy) → :9090 | Subdomain ACL | OAuth2-Proxy → Casdoor | +| Alloy → Loki | Titania :443 → Prospero :3100 | Subdomain ACL | None | +| Alloy → Prometheus | Titania :443 → Prospero :9091 → :9090 | `skip_auth_routes` | None | ## Deployment @@ -72,7 +61,6 @@ ansible-playbook pplg/deploy.yml | File | Purpose | |------|---------| | `pplg/deploy.yml` | Main consolidated deployment playbook | -| `pplg/pplg-haproxy.cfg.j2` | HAProxy TLS termination config (5 backends) | | `pplg/prometheus.yml.j2` | Prometheus scrape configuration | | 
`pplg/alert_rules.yml.j2` | Prometheus alerting rules | | `pplg/alertmanager.yml.j2` | Alertmanager routing and Pushover notifications | @@ -88,15 +76,13 @@ ansible-playbook pplg/deploy.yml ### Deployment Steps 1. **APT Repositories**: Add Grafana and PgAdmin repos -2. **Install Packages**: haproxy, prometheus, loki, grafana, pgadmin4-web, gunicorn +2. **Install Packages**: prometheus, loki, grafana, pgadmin4-web 3. **Prometheus**: Config, alert rules, systemd override for remote write receiver 4. **Alertmanager**: Install, config with Pushover integration 5. **Loki**: Create user/dirs, template config 6. **Grafana**: Provisioning (datasources, users, dashboards), OAuth config 7. **PgAdmin**: Create user/dirs, gunicorn systemd service, Casdoor OAuth config 8. **OAuth2-Proxy**: Download binary (v7.6.0), config for Prometheus sidecar -9. **SSL Certificate**: Fetch Let's Encrypt wildcard cert from Titania (self-signed fallback) -10. **HAProxy**: Template config, enable and start systemd service ### Deployment Order @@ -298,35 +284,18 @@ Register in Casdoor Admin UI (`https://id.ouranos.helu.ca`) or add to `ansible/c | **Loki** | None | Machine-to-machine (Alloy agents push logs) | | **Alertmanager** | None | Internal only | -## HAProxy Configuration +## OAuth2-Proxy skip_auth_routes -### Backends +The Prometheus write API (`/api/v1/write`) and health check (`/ping`) are accessed by Alloy agents for machine-to-machine metric pushes. 
OAuth2-Proxy's `skip_auth_routes` config bypasses authentication for these paths: -| Backend | Upstream | Health Check | Auth | -|---------|----------|-------------|------| -| `backend_grafana` | `127.0.0.1:3000` | `GET /api/health` | Grafana OAuth | -| `backend_pgadmin` | `127.0.0.1:5050` | `GET /misc/ping` | PgAdmin OAuth | -| `backend_prometheus` | `127.0.0.1:9091` (OAuth2-Proxy) | `GET /ping` | OAuth2-Proxy | -| `backend_prometheus_direct` | `127.0.0.1:9090` | — | None (write API) | -| `backend_loki` | `127.0.0.1:3100` | `GET /ready` | None | -| `backend_alertmanager` | `127.0.0.1:9093` | `GET /-/healthy` | None | - -### skip_auth_route Pattern - -The Prometheus write API (`/api/v1/write`) is accessed by Alloy agents for machine-to-machine metric pushes. HAProxy uses an ACL to bypass OAuth2-Proxy: - -``` -acl is_prometheus_write path_beg /api/v1/write -use_backend backend_prometheus_direct if host_prometheus is_prometheus_write +```toml +skip_auth_routes = [ + "^/ping$", + "^/api/v1/write$" +] ``` -This routes `https://prometheus.ouranos.helu.ca/api/v1/write` directly to Prometheus on `:9090`, while all other Prometheus traffic goes through OAuth2-Proxy on `:9091`. - -### SSL Certificate - -- **Primary**: Let's Encrypt wildcard cert (`*.ouranos.helu.ca`) fetched from Titania -- **Fallback**: Self-signed cert generated on Prospero (if Titania unavailable) -- **Path**: `/etc/haproxy/certs/ouranos.pem` +This allows `https://prometheus.ouranos.helu.ca/api/v1/write` to reach Prometheus without OAuth, while all other Prometheus traffic requires Casdoor SSO authentication. 
## Host Variables @@ -340,7 +309,7 @@ services: ``` Key variable groups defined in `prospero.incus.yml`: -- PPLG HAProxy (user, group, uid/gid 800, syslog port) +- PPLG domain (`ouranos.helu.ca`) - Grafana (datasources, users, OAuth config) - Prometheus (scrape targets, OAuth2-Proxy sidecar config) - Alertmanager (Pushover integration) @@ -348,56 +317,36 @@ Key variable groups defined in `prospero.incus.yml`: - PgAdmin (user, data/log directories, OAuth config) - Casdoor Metrics (access key/secret for Prometheus scraping) -## Terraform +## Titania Backend Routing -### Prospero Port Mapping - -```hcl -devices = [ - { - name = "https_internal" - type = "proxy" - properties = { - listen = "tcp:0.0.0.0:25510" - connect = "tcp:127.0.0.1:443" - } - }, - { - name = "http_redirect" - type = "proxy" - properties = { - listen = "tcp:0.0.0.0:25511" - connect = "tcp:127.0.0.1:80" - } - } -] -``` - -Run `terraform apply` before deploying if port mappings changed. - -### Titania Backend Routing - -Titania's HAProxy routes external subdomains to Prospero's HTTPS port: +Titania's HAProxy routes external subdomains directly to Prospero service ports: ```yaml # In titania.incus.yml haproxy_backends - subdomain: "grafana" backend_host: "prospero.incus" - backend_port: 443 + backend_port: 3000 health_path: "/api/health" - ssl_backend: true - subdomain: "pgadmin" backend_host: "prospero.incus" - backend_port: 443 + backend_port: 5050 health_path: "/misc/ping" - ssl_backend: true - subdomain: "prometheus" backend_host: "prospero.incus" - backend_port: 443 + backend_port: 9091 # OAuth2-Proxy sidecar health_path: "/ping" - ssl_backend: true + +- subdomain: "loki" + backend_host: "prospero.incus" + backend_port: 3100 + health_path: "/ready" + +- subdomain: "alertmanager" + backend_host: "prospero.incus" + backend_port: 9093 + health_path: "/-/healthy" ``` ## Monitoring @@ -406,7 +355,6 @@ Titania's HAProxy routes external subdomains to Prospero's HTTPS port: **File:** 
`ansible/alloy/prospero/config.alloy.j2` -- **HAProxy Syslog**: `loki.source.syslog` on `127.0.0.1:51405` (TCP) receives Docker syslog from HAProxy container - **Journal Labels**: Dedicated job labels for `grafana-server`, `prometheus`, `loki`, `alertmanager`, `pgadmin`, `oauth2-proxy-prometheus` - **System Logs**: `/var/log/syslog`, `/var/log/auth.log` → Loki - **Metrics**: Node exporter + process exporter → Prometheus remote write @@ -477,22 +425,11 @@ ssh prospero.incus sudo systemctl status prometheus grafana-server loki prometheus-alertmanager pgadmin oauth2-proxy-prometheus ``` -### HAProxy Service - -```bash -ssh prospero.incus -sudo systemctl status haproxy -sudo journalctl -u haproxy -f -``` - ### View Logs ```bash # All PPLG services via journal sudo journalctl -u prometheus -u grafana-server -u loki -u prometheus-alertmanager -u pgadmin -u oauth2-proxy-prometheus -f - -# HAProxy logs (shipped via syslog to Alloy → Loki) -# Query in Grafana: {job="pplg-haproxy"} ``` ### Test Endpoints (from Prospero) @@ -512,18 +449,17 @@ curl -s http://127.0.0.1:3100/ready # Alertmanager curl -s http://127.0.0.1:9093/-/healthy - -# HAProxy stats -curl -s http://127.0.0.1:8404/metrics | head ``` -### Test TLS (from any host) +### Test External Access (from any host) ```bash -# Direct to Prospero container -curl -sk https://prospero.incus/api/health # Via Titania HAProxy curl -s https://grafana.ouranos.helu.ca/api/health +curl -s https://pgadmin.ouranos.helu.ca/misc/ping +curl -s https://prometheus.ouranos.helu.ca/ping +curl -s https://loki.ouranos.helu.ca/ready +curl -s https://alertmanager.ouranos.helu.ca/-/healthy ``` ### Common Errors