docs: rewrite README with structured overview and quick start guide

Replaces the minimal project description with a comprehensive README
including a component overview table, quick start instructions, common
Ansible operations, and links to detailed documentation. Aligns with
Red Panda Approval™ standards.
This commit is contained in:
2026-03-03 12:49:06 +00:00
parent c7be03a743
commit b4d60f2f38
219 changed files with 34586 additions and 2 deletions

View File

@@ -0,0 +1,249 @@
# Prometheus Alert Rules
# Red Panda Approved 🐼
# Deployed to: /etc/prometheus/alert_rules.yml
# The {% raw %} block stops Ansible's Jinja2 from touching the Go-template
# {{ ... }} expressions; Prometheus expands those at alert evaluation time.
{% raw %}
groups:
  # ============================================================================
  # Node/Infrastructure Alerts
  # ============================================================================
  - name: node_alerts
    rules:
      # Scrape target unreachable for 2 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
      # CPU: warning at >80% for 5m, critical at >95% for 2m
      # (100 minus mean per-instance idle rate).
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"
      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"
      # Memory: warning >80%, critical >95%, based on MemAvailable.
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"
      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"
      # Disk: tmpfs/overlay excluded; warning <20% free, critical <10% free.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf \"%.1f\" }}%)"
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf \"%.1f\" }}%)"
      # 15-minute load normalized by CPU count; >2x cores sustained 10m.
      - alert: HighLoadAverage
        expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          description: "15-minute load average is {{ $value | printf \"%.2f\" }} times the CPU count on {{ $labels.instance }}"
# ============================================================================
# Process-Level Alerts (puck.incus)
# ============================================================================
- name: puck_process_alerts
rules:
- alert: PuckHighCPUProcess
expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: "High CPU process on puck: {{ $labels.groupname }}"
description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU for more than 2 minutes"
- alert: PuckCriticalCPUProcess
expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95
for: 1m
labels:
severity: critical
annotations:
summary: "Critical CPU process on puck: {{ $labels.groupname }}"
description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU - immediate attention required"
- alert: PuckHighMemoryProcess
expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824
for: 2m
labels:
severity: warning
annotations:
summary: "High memory process on puck: {{ $labels.groupname }}"
description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory"
- alert: PuckCriticalMemoryProcess
expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648
for: 1m
labels:
severity: critical
annotations:
summary: "Critical memory process on puck: {{ $labels.groupname }}"
description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required"
- alert: PuckProcessCrashLoop
expr: increase(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1
for: 1m
labels:
severity: warning
annotations:
summary: "Process count dropped on puck: {{ $labels.groupname }}"
description: "Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart"
  # ============================================================================
  # Docker Container Alerts (puck.incus)
  # ============================================================================
  - name: puck_container_alerts
    rules:
      # More than 5 running containers (cAdvisor series with non-empty name).
      - alert: PuckHighContainerCount
        expr: count(container_last_seen{instance=~"puck.*", name!=""}) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High container count on puck"
          description: "puck.incus has {{ $value }} running containers, which exceeds the threshold of 5"
      # More than 2 containers sharing one image.
      - alert: PuckDuplicateContainers
        expr: count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Duplicate containers on puck: {{ $labels.image }}"
          description: "{{ $value }} containers running the same image {{ $labels.image }} on puck"
      # Containers whose name matches the auto-generated word_word pattern,
      # running longer than 1 hour.
      - alert: PuckOrphanedContainer
        expr: (time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Possible orphaned container on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}"
      # No comparison operator: any series matching the selector fires the
      # alert (value is the last-seen timestamp).
      - alert: PuckMCPContainerOnPuck
        expr: container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"}
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "MCP container detected on puck (WRONG HOST)"
          description: "Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus"
      - alert: PuckContainerHighCPU
        expr: sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU container on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value | printf \"%.1f\" }}% CPU"
      # 1 GiB resident threshold per container.
      - alert: PuckContainerHighMemory
        expr: container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory container on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value | humanize }} memory"
      # for: 0m — fire immediately on any OOM kill seen in the last 5 minutes.
      - alert: PuckContainerOOMKilled
        expr: increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container OOM killed on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} was killed by OOM killer"
  # ============================================================================
  # Service/Application Alerts
  # ============================================================================
  - name: service_alerts
    rules:
      # NOTE(review): identical expression to node_alerts/InstanceDown
      # (up == 0), differing only in hold time and severity. Both alerts will
      # fire for the same outage, and the inhibit rule (keyed on alertname)
      # will not suppress either — confirm the double notification is intended.
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus target missing: {{ $labels.instance }}"
          description: "A Prometheus target has been down for more than 5 minutes."
      # absent() fires when the job has no `up` series at all (removed from
      # service discovery), as opposed to a scraped-but-down target.
      - alert: PrometheusJobMissing
        expr: absent(up{job="node-exporter"})
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus job missing"
          description: "A Prometheus job has disappeared from target discovery."
      - alert: AlertmanagerDown
        expr: absent(up{job="alertmanager"})
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is down"
          description: "Alertmanager is not responding. Alerts may not be delivered."
  # ============================================================================
  # Loki/Logging Alerts
  # ============================================================================
  - name: loki_alerts
    rules:
      # Sustained ingest above 10 MiB/s (10485760 bytes) across all streams.
      - alert: LokiHighLogVolume
        expr: sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High log ingestion rate"
          description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging"

# Red Panda Seal of Approval 🐼
# "If the metrics aren't red, go back to bed"
{% endraw %}

View File

@@ -0,0 +1,148 @@
# Alertmanager configuration (Jinja2 template).
# The {{ "{{" }} / {{ "}}" }} pairs render literal {{ }} so Alertmanager's own
# Go templating receives them intact after Ansible renders this file.
global:
  resolve_timeout: 5m
  smtp_smarthost: '{{ smtp_host }}:{{ smtp_port }}'
  smtp_from: '{{ smtp_from }}'
  # smtp4dev test relay — no TLS.
  smtp_require_tls: false

# Route tree: everything falls through to the catch-all 'email' receiver;
# severity-specific routes match first and 'continue: true' keeps evaluating.
route:
  group_by: ['alertname', 'instance', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'email'
  routes:
    - match:
        severity: critical
      receiver: 'email-critical'
      continue: true
    - match:
        severity: warning
      receiver: 'email-warning'
      continue: true
    - match:
        severity: info
      receiver: 'email-info'
      repeat_interval: 24h

# Suppress warning-level duplicates while the critical alert for the same
# alertname/instance pair is firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']

# FIX: the original set `html: true` on each email_config. In Alertmanager,
# email_config `html` is a *template string* (the HTML body), not a boolean,
# so a bare bool fails config loading. Omitting the key keeps the default
# HTML email template, which is what `html: true` intended.
receivers:
  - name: 'email-critical'
    email_configs:
      - to: 'hostmaster+critical@ouranos.helu.ca'
        send_resolved: true
        headers:
          Subject: '🚨 [CRITICAL] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        text: |-
          {{ "{{" }} range .Alerts {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}
          Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
          {{ "{{" }} end {{ "}}" }}
  - name: 'email-warning'
    email_configs:
      - to: 'hostmaster+warning@ouranos.helu.ca'
        send_resolved: true
        headers:
          Subject: '⚠️ [WARNING] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        text: |-
          {{ "{{" }} range .Alerts {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}
          Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
          {{ "{{" }} end {{ "}}" }}
  - name: 'email-info'
    email_configs:
      - to: 'hostmaster+info@ouranos.helu.ca'
        send_resolved: false
        headers:
          Subject: '{{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        text: '{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}'
  - name: 'email'
    email_configs:
      - to: 'hostmaster+alerts@ouranos.helu.ca'
        send_resolved: true
        headers:
          Subject: '[{{ "{{" }} .GroupLabels.severity | default "ALERT" {{ "}}" }}] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        text: |-
          {{ "{{" }} range .Alerts {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}
          Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
          Severity: {{ "{{" }} .Labels.severity {{ "}}" }}
          {{ "{{" }} end {{ "}}" }}

# --- Pushover receivers (disabled for smtp4dev testing) ---
# To re-enable: uncomment these receivers and update the route receiver names
# from email-*/email back to pushover-*/pushover
#
# - name: 'pushover-critical'
#   pushover_configs:
#     - user_key: '{{ pushover_user_key }}'
#       token: '{{ pushover_api_token }}'
#       send_resolved: true
#       html: true
#       priority: '2'
#       retry: 30
#       expire: 3600
#       title: '🚨 [CRITICAL] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
#       message: |-
#         {{ "{{" }} range .Alerts {{ "}}" }}
#         {{ "{{" }} .Annotations.description {{ "}}" }}
#         Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
#         {{ "{{" }} end {{ "}}" }}
#
# - name: 'pushover-warning'
#   pushover_configs:
#     - user_key: '{{ pushover_user_key }}'
#       token: '{{ pushover_api_token }}'
#       send_resolved: true
#       html: true
#       priority: '1'
#       retry: 30
#       expire: 3600
#       title: '⚠️ [WARNING] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
#       message: |-
#         {{ "{{" }} range .Alerts {{ "}}" }}
#         {{ "{{" }} .Annotations.description {{ "}}" }}
#         Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
#         {{ "{{" }} end {{ "}}" }}
#
# - name: 'pushover-info'
#   pushover_configs:
#     - user_key: '{{ pushover_user_key }}'
#       token: '{{ pushover_api_token }}'
#       send_resolved: false
#       html: true
#       priority: '0'
#       title: '{{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
#       message: '{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}'
#
# - name: 'pushover'
#   pushover_configs:
#     - user_key: '{{ pushover_user_key }}'
#       token: '{{ pushover_api_token }}'
#       send_resolved: true
#       html: true
#       priority: '1'
#       retry: 30
#       expire: 3600
#       title: '[{{ "{{" }} .GroupLabels.severity | default "ALERT" {{ "}}" }}] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
#       message: |-
#         {{ "{{" }} range .Alerts {{ "}}" }}
#         {{ "{{" }} .Annotations.description {{ "}}" }}
#         Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
#         Severity: {{ "{{" }} .Labels.severity {{ "}}" }}
#         {{ "{{" }} end {{ "}}" }}

View File

@@ -0,0 +1,41 @@
# Loki single-binary configuration: filesystem storage, no multi-tenancy.
auth_enabled: false

server:
  http_listen_port: {{ loki_port }}
  grpc_listen_port: {{ loki_grpc_port }}

common:
  path_prefix: {{ loki_data_dir }}
  storage:
    filesystem:
      chunks_directory: {{ loki_data_dir }}/chunks
      rules_directory: {{ loki_data_dir }}/rules
  # Single-node deployment: one copy of each chunk, in-memory ring.
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

# TSDB index with v13 schema, one index file per day.
schema_config:
  configs:
    - from: 2024-04-01
      object_store: filesystem
      store: tsdb
      schema: v13
      index:
        prefix: index_
        period: 24h

# Ruler sends rule-evaluation notifications to Alertmanager.
ruler:
  alertmanager_url: http://{{ alertmanager_host }}:{{ alertmanager_port }}

# Red Panda Approved Configuration
analytics:
  reporting_enabled: false

View File

@@ -0,0 +1,55 @@
# PgAdmin4 Local Configuration - Managed by Ansible
# Gunicorn-based deployment (no Apache) with Casdoor OAuth SSO
# Red Panda Approved
# Jinja2 template: {{ ... }} placeholders are rendered by Ansible before this
# file is installed as /usr/pgadmin4/web/config_local.py.
import os

# Server settings
DEFAULT_SERVER = '0.0.0.0'
DEFAULT_SERVER_PORT = {{pgadmin_port}}

# Data directory
DATA_DIR = '{{pgadmin_data_dir}}'
SESSION_DB_PATH = os.path.join(DATA_DIR, 'sessions')
STORAGE_DIR = os.path.join(DATA_DIR, 'storage')
SQLITE_PATH = os.path.join(DATA_DIR, 'pgadmin4.db')

# Log settings
LOG_FILE = '{{pgadmin_log_dir}}/pgadmin4.log'

# Default admin credentials (for initial setup)
SETUP_EMAIL = '{{pgadmin_email}}'
SETUP_PASSWORD = '{{pgadmin_password}}'

# Authentication - OAuth2 (Casdoor) + internal fallback
AUTHENTICATION_SOURCES = ['oauth2', 'internal']

# Master password disabled (use OAuth)
MASTER_PASSWORD_REQUIRED = False

# Reverse proxy settings (Titania HAProxy -> Prospero HAProxy -> Gunicorn)
# The *_COUNT values declare how many X-Forwarded-* proxy hops to trust;
# 2 matches the two HAProxy layers named above.
ENHANCED_COOKIE_PROTECTION = False
PROXY_X_FOR_COUNT = 2
PROXY_X_PROTO_COUNT = 2
PROXY_X_HOST_COUNT = 2
X_FRAME_OPTIONS = 'SAMEORIGIN'
SESSION_COOKIE_SECURE = True
SESSION_COOKIE_SAMESITE = 'Lax'
# NOTE(review): CSRF SSL-strict disabled — presumably because TLS terminates
# at HAProxy upstream of Gunicorn; confirm.
WTF_CSRF_SSL_STRICT = False

# OAuth2 Configuration (Casdoor OIDC)
OAUTH2_AUTO_CREATE_USER = True
OAUTH2_CONFIG = [{
    'OAUTH2_NAME': 'Casdoor',
    'OAUTH2_DISPLAY_NAME': 'Casdoor SSO',
    'OAUTH2_CLIENT_ID': '{{pgadmin_oauth_client_id}}',
    'OAUTH2_CLIENT_SECRET': '{{pgadmin_oauth_client_secret}}',
    'OAUTH2_TOKEN_URL': 'https://id.ouranos.helu.ca/api/login/oauth/access_token',
    'OAUTH2_AUTHORIZATION_URL': 'https://id.ouranos.helu.ca/login/oauth/authorize',
    'OAUTH2_API_BASE_URL': 'https://id.ouranos.helu.ca/',
    'OAUTH2_USERINFO_ENDPOINT': 'api/userinfo',
    'OAUTH2_SERVER_METADATA_URL': 'https://id.ouranos.helu.ca/.well-known/openid-configuration',
    'OAUTH2_SCOPE': 'openid profile email',
    'OAUTH2_ICON': 'fa-openid',
    'OAUTH2_BUTTON_COLOR': '#2db7f5',
}]

View File

@@ -0,0 +1,15 @@
# Grafana datasource provisioning (Jinja2 template, rendered by Ansible).
# FIX: templated scalar values are now quoted — an unquoted {{ var }} that
# renders empty, boolean-looking, or with YAML specials would corrupt the
# provisioned YAML (Ansible/YAML best practice).
apiVersion: 1
datasources:
  # Prometheus — the default datasource; locked (editable: false).
  - name: "{{prometheus_datasource_name}}"
    type: prometheus
    access: proxy
    url: "http://{{prometheus_host}}:{{prometheus_port}}"
    isDefault: true
    editable: false
    uid: "{{prometheus_datasource_uid}}"
  # Loki — log datasource.
  - name: "{{loki_datasource_name}}"
    type: loki
    access: proxy
    url: "http://{{loki_host}}:{{loki_port}}"
    editable: false
    uid: "{{loki_datasource_uid}}"

495
ansible/pplg/deploy.yml Normal file
View File

@@ -0,0 +1,495 @@
---
# PPLG - Consolidated Observability & Admin Stack for Prospero
# PgAdmin, Prometheus, Loki, Grafana + HAProxy (TLS) + OAuth2-Proxy (Prometheus UI)
# Red Panda Approved
- name: Deploy PPLG Stack
  hosts: ubuntu
  become: true
  tasks:
    # Only hosts whose inventory 'services' list contains 'pplg' proceed;
    # all others end the play for that host immediately.
    - name: Check if host has pplg service
      ansible.builtin.set_fact:
        has_pplg_service: "{{'pplg' in services}}"
    - name: Skip hosts without pplg service
      ansible.builtin.meta: end_host
      when: not has_pplg_service
    # ===========================================================================
    # APT Repositories
    # ===========================================================================
    - name: Add Grafana APT repository (Grafana + Loki)
      ansible.builtin.deb822_repository:
        name: grafana
        types: [deb]
        uris: https://apt.grafana.com
        suites: [stable]
        components: [main]
        signed_by: https://apt.grafana.com/gpg.key
        state: present
    - name: Add PgAdmin APT repository
      ansible.builtin.deb822_repository:
        name: pgadmin4
        types: [deb]
        uris: https://ftp.postgresql.org/pub/pgadmin/pgadmin4/apt/{{ansible_distribution_release}}
        suites: [pgadmin4]
        components: [main]
        signed_by: https://www.pgadmin.org/static/packages_pgadmin_org.pub
        state: present
    # ===========================================================================
    # Package Installation
    # ===========================================================================
    - name: Install PPLG packages
      ansible.builtin.apt:
        name:
          - acl
          - haproxy
          - prometheus
          - loki
          - grafana
          - pgadmin4-web
        state: present
        update_cache: true
    # pgadmin4-web drags in Apache; this stack serves PgAdmin via Gunicorn
    # instead (see the PgAdmin section), so Apache is stopped and disabled.
    - name: Stop and disable Apache (pulled in by pgadmin4-web)
      ansible.builtin.systemd:
        name: apache2
        state: stopped
        enabled: false
    # ===========================================================================
    # Prometheus
    # ===========================================================================
    - name: Fix Prometheus directory permissions
      ansible.builtin.file:
        path: /var/lib/prometheus
        owner: prometheus
        group: prometheus
        mode: '750'
        recurse: true
    - name: Create textfile collector directory
      ansible.builtin.file:
        path: /var/lib/prometheus/node-exporter
        state: directory
        owner: prometheus
        group: prometheus
        mode: '750'
    - name: Template prometheus.yml
      ansible.builtin.template:
        src: prometheus.yml.j2
        dest: /etc/prometheus/prometheus.yml
        owner: prometheus
        group: prometheus
        mode: '640'
      notify: restart prometheus
    - name: Template alert_rules.yml
      ansible.builtin.template:
        src: alert_rules.yml.j2
        dest: /etc/prometheus/alert_rules.yml
        owner: prometheus
        group: prometheus
        mode: '640'
      notify: restart prometheus
    - name: Create Prometheus systemd override directory
      ansible.builtin.file:
        path: /etc/systemd/system/prometheus.service.d
        state: directory
        mode: '755'
    # Drop-in override: the empty ExecStart= first clears the packaged unit's
    # command, then the full command line re-adds it with
    # --web.enable-remote-write-receiver appended.
    - name: Enable remote write receiver
      ansible.builtin.copy:
        content: |
          [Service]
          ExecStart=
          ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/metrics2/ --web.console.templates=/etc/prometheus/consoles --web.console.libraries=/etc/prometheus/console_libraries --web.listen-address=0.0.0.0:9090 --web.external-url= --web.enable-remote-write-receiver
        dest: /etc/systemd/system/prometheus.service.d/override.conf
        mode: '644'
      notify: restart prometheus
    - name: Start and enable Prometheus service
      ansible.builtin.systemd:
        name: prometheus
        state: started
        enabled: true
        daemon_reload: true
    # ===========================================================================
    # Prometheus Alertmanager
    # ===========================================================================
    - name: Install Alertmanager
      ansible.builtin.apt:
        name: prometheus-alertmanager
        state: present
    - name: Create alertmanager configuration directory
      ansible.builtin.file:
        path: /etc/alertmanager
        state: directory
        owner: prometheus
        group: prometheus
        mode: '750'
    - name: Template alertmanager.yml
      ansible.builtin.template:
        src: alertmanager.yml.j2
        dest: /etc/alertmanager/alertmanager.yml
        owner: prometheus
        group: prometheus
        mode: '640'
      notify: restart alertmanager
    - name: Start and enable Alertmanager service
      ansible.builtin.systemd:
        name: prometheus-alertmanager
        state: started
        enabled: true
        daemon_reload: true
    # ===========================================================================
    # Loki
    # ===========================================================================
    - name: Create loki group
      ansible.builtin.group:
        name: "{{loki_group}}"
    - name: Create loki user
      ansible.builtin.user:
        name: "{{loki_user}}"
        comment: "{{loki_user}}"
        group: "{{loki_group}}"
        system: true
    - name: Create loki directories
      ansible.builtin.file:
        path: "{{item}}"
        owner: "{{loki_user}}"
        group: "{{loki_group}}"
        state: directory
        mode: '750'
      loop:
        - "{{loki_data_dir}}"
        - "{{loki_config_dir}}"
    # NOTE(review): mode '550' gives the config read+execute but not write —
    # confirm '640' wasn't intended for a plain YAML config file.
    - name: Template Loki configuration
      ansible.builtin.template:
        src: "{{loki_config_file}}.j2"
        dest: "{{loki_config_dir}}/{{loki_config_file}}"
        owner: "{{loki_user}}"
        group: "{{loki_group}}"
        mode: '550'
      notify: restart loki
    - name: Enable and start Loki service
      ansible.builtin.systemd:
        name: loki
        enabled: true
        state: started
    # ===========================================================================
    # Grafana
    # ===========================================================================
    - name: Create dashboards directory
      ansible.builtin.file:
        path: /var/lib/grafana/dashboards
        state: directory
        owner: grafana
        group: grafana
        mode: '750'
    # grafana.ini is only templated when OAuth is enabled; otherwise the
    # packaged default configuration is left untouched.
    - name: Template Grafana main configuration
      ansible.builtin.template:
        src: "grafana.ini.j2"
        dest: "/etc/grafana/grafana.ini"
        owner: grafana
        group: grafana
        mode: '640'
      when: grafana_oauth_enabled | default(false)
      notify: restart grafana
    - name: Enable and start Grafana service
      ansible.builtin.systemd:
        name: grafana-server
        enabled: true
        state: started
        daemon_reload: true
# ===========================================================================
# PgAdmin (Gunicorn - no Apache)
# ===========================================================================
- name: Create pgadmin group
ansible.builtin.group:
name: "{{pgadmin_group}}"
system: true
- name: Create pgadmin user
ansible.builtin.user:
name: "{{pgadmin_user}}"
comment: "PgAdmin Service"
group: "{{pgadmin_group}}"
system: true
create_home: false
shell: /usr/sbin/nologin
- name: Create PgAdmin directories
ansible.builtin.file:
path: "{{item}}"
state: directory
owner: "{{pgadmin_user}}"
group: "{{pgadmin_group}}"
mode: '750'
loop:
- "{{pgadmin_data_dir}}"
- "{{pgadmin_data_dir}}/sessions"
- "{{pgadmin_data_dir}}/storage"
- "{{pgadmin_data_dir}}/certs"
- "{{pgadmin_log_dir}}"
- name: Install gunicorn into PgAdmin venv
ansible.builtin.command:
cmd: /usr/pgadmin4/venv/bin/pip install gunicorn
register: pip_gunicorn
changed_when: "'Successfully installed' in pip_gunicorn.stdout"
- name: Initialize PgAdmin database
ansible.builtin.command:
cmd: /usr/pgadmin4/venv/bin/python3 /usr/pgadmin4/web/setup.py setup-db
creates: "{{pgadmin_data_dir}}/pgadmin4.db"
become_user: "{{pgadmin_user}}"
- name: Template PgAdmin local config
ansible.builtin.template:
src: config_local.py.j2
dest: /usr/pgadmin4/web/config_local.py
owner: "{{pgadmin_user}}"
group: "{{pgadmin_group}}"
mode: '640'
notify: restart pgadmin
- name: Fetch Titania PostgreSQL SSL cert
ansible.builtin.fetch:
src: /etc/postgresql/17/main/ssl/server.crt
dest: /tmp/titania-postgres-ca.crt
flat: yes
delegate_to: titania.incus
when: "'titania.incus' in groups['ubuntu']"
- name: Copy Titania PostgreSQL SSL cert to PgAdmin
ansible.builtin.copy:
src: /tmp/titania-postgres-ca.crt
dest: "{{pgadmin_data_dir}}/certs/titania-postgres-ca.crt"
owner: "{{pgadmin_user}}"
group: "{{pgadmin_group}}"
mode: '0644'
when: "'titania.incus' in groups['ubuntu']"
- name: Template PgAdmin systemd service
ansible.builtin.template:
src: pgadmin.service.j2
dest: /etc/systemd/system/pgadmin.service
owner: root
group: root
mode: '0644'
notify: restart pgadmin
- name: Enable and start PgAdmin service
ansible.builtin.systemd:
name: pgadmin
enabled: true
state: started
daemon_reload: true
    # ===========================================================================
    # OAuth2-Proxy Sidecar (Prometheus UI)
    # ===========================================================================
    - name: Create oauth2-proxy config directory
      ansible.builtin.file:
        path: "{{prometheus_oauth2_proxy_dir}}"
        owner: root
        group: root
        state: directory
        mode: '0755'
    # Binary install: download the release tarball, unpack under /tmp, then
    # copy the binary into /usr/local/bin. `creates:` keeps re-runs idempotent.
    - name: Download oauth2-proxy binary
      ansible.builtin.get_url:
        url: "https://github.com/oauth2-proxy/oauth2-proxy/releases/download/v{{prometheus_oauth2_proxy_version}}/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.linux-amd64.tar.gz"
        dest: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.tar.gz"
        mode: '0644'
    - name: Extract oauth2-proxy binary
      ansible.builtin.unarchive:
        src: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.tar.gz"
        dest: /tmp
        remote_src: true
        creates: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.linux-amd64/oauth2-proxy"
    - name: Install oauth2-proxy binary
      ansible.builtin.copy:
        src: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.linux-amd64/oauth2-proxy"
        dest: /usr/local/bin/oauth2-proxy
        owner: root
        group: root
        mode: '0755'
        remote_src: true
    # Config carries the OIDC client secret — root-only read (0600).
    - name: Template oauth2-proxy configuration for Prometheus
      ansible.builtin.template:
        src: oauth2-proxy-prometheus.cfg.j2
        dest: "{{prometheus_oauth2_proxy_dir}}/oauth2-proxy.cfg"
        owner: root
        group: root
        mode: '0600'
      notify: restart oauth2-proxy-prometheus
    - name: Template oauth2-proxy systemd service for Prometheus
      ansible.builtin.template:
        src: oauth2-proxy-prometheus.service.j2
        dest: /etc/systemd/system/oauth2-proxy-prometheus.service
        owner: root
        group: root
        mode: '0644'
      notify:
        - reload systemd
        - restart oauth2-proxy-prometheus
    - name: Enable and start OAuth2-Proxy for Prometheus
      ansible.builtin.systemd:
        name: oauth2-proxy-prometheus
        enabled: true
        state: started
        daemon_reload: true
# ===========================================================================
# SSL Certificate Distribution (from Titania)
# ===========================================================================
- name: Create haproxy group
ansible.builtin.group:
name: "{{pplg_haproxy_group}}"
gid: "{{pplg_haproxy_gid}}"
system: true
- name: Create haproxy user
ansible.builtin.user:
name: "{{pplg_haproxy_user}}"
comment: "PPLG HAProxy"
group: "{{pplg_haproxy_group}}"
uid: "{{pplg_haproxy_uid}}"
system: true
- name: Create HAProxy directories
ansible.builtin.file:
path: "{{item}}"
state: directory
owner: "{{pplg_haproxy_user}}"
group: "{{pplg_haproxy_group}}"
mode: '750'
loop:
- /etc/haproxy
- /etc/haproxy/certs
- name: Fetch wildcard certificate from Titania
ansible.builtin.fetch:
src: /etc/haproxy/certs/ouranos.pem
dest: /tmp/ouranos-haproxy.pem
flat: yes
delegate_to: titania.incus
when: "'titania.incus' in groups['ubuntu']"
- name: Deploy wildcard certificate
ansible.builtin.copy:
src: /tmp/ouranos-haproxy.pem
dest: "{{pplg_haproxy_cert_path}}"
owner: "{{pplg_haproxy_user}}"
group: "{{pplg_haproxy_group}}"
mode: '0640'
when: "'titania.incus' in groups['ubuntu']"
- name: Generate self-signed wildcard certificate (fallback)
command: >
openssl req -x509 -nodes -days 365 -newkey rsa:2048
-keyout {{pplg_haproxy_cert_path}}
-out {{pplg_haproxy_cert_path}}
-subj "/C=US/ST=State/L=City/O=Agathos/CN=*.{{pplg_haproxy_domain}}"
-addext "subjectAltName=DNS:*.{{pplg_haproxy_domain}},DNS:{{pplg_haproxy_domain}}"
when: "'titania.incus' not in groups['ubuntu']"
args:
creates: "{{pplg_haproxy_cert_path}}"
    # ===========================================================================
    # HAProxy (TLS Termination)
    # ===========================================================================
    # validate: runs a config check on the rendered file before it replaces
    # the live config, so a bad template never takes HAProxy down.
    - name: Template HAProxy configuration
      ansible.builtin.template:
        src: pplg-haproxy.cfg.j2
        dest: /etc/haproxy/haproxy.cfg
        owner: "{{pplg_haproxy_user}}"
        group: "{{pplg_haproxy_group}}"
        mode: "640"
        validate: haproxy -c -f %s
      notify: restart haproxy
    - name: Enable and start HAProxy service
      ansible.builtin.systemd:
        name: haproxy
        enabled: true
        state: started
  # ===========================================================================
  # Handlers
  # ===========================================================================
  handlers:
    - name: restart prometheus
      ansible.builtin.systemd:
        name: prometheus
        state: restarted
        daemon_reload: true
    - name: restart alertmanager
      ansible.builtin.systemd:
        name: prometheus-alertmanager
        state: restarted
    - name: restart loki
      ansible.builtin.systemd:
        name: loki
        state: restarted
    - name: restart grafana
      ansible.builtin.systemd:
        name: grafana-server
        state: restarted
    - name: restart pgadmin
      ansible.builtin.systemd:
        name: pgadmin
        state: restarted
        daemon_reload: true
    - name: reload systemd
      ansible.builtin.systemd:
        daemon_reload: true
    # Named "restart" so existing notify: lines keep working, but performs a
    # reload so in-flight connections are not dropped.
    - name: restart haproxy
      ansible.builtin.systemd:
        name: haproxy
        state: reloaded
    - name: restart oauth2-proxy-prometheus
      ansible.builtin.systemd:
        name: oauth2-proxy-prometheus
        state: restarted

View File

@@ -0,0 +1,36 @@
# Grafana Configuration - Managed by Ansible
# Do not edit manually - changes will be overwritten

[server]
# Public root URL as seen behind the reverse proxy.
root_url = {{ grafana_root_url }}

[auth]
# Disable login form for OAuth users (admins can still use local auth)
disable_login_form = false

# Generic OAuth against Casdoor; `| lower` renders Jinja booleans as
# Grafana-compatible true/false.
[auth.generic_oauth]
enabled = {{ grafana_oauth_enabled }}
name = {{ grafana_oauth_name | default('Casdoor') }}
allow_sign_up = {{ grafana_oauth_allow_sign_up | default(true) | lower }}
client_id = {{ grafana_oauth_client_id }}
client_secret = {{ grafana_oauth_client_secret }}
scopes = {{ grafana_oauth_scopes | default('openid profile email') }}
auth_url = {{ grafana_oauth_auth_url }}
token_url = {{ grafana_oauth_token_url }}
api_url = {{ grafana_oauth_api_url }}
# Map Casdoor user attributes to Grafana
email_attribute_path = email
login_attribute_path = preferred_username
name_attribute_path = name
# Default role for new OAuth users
role_attribute_path = contains(groups[*], 'grafana-admin') && 'Admin' || contains(groups[*], 'grafana-editor') && 'Editor' || 'Viewer'
# TLS settings for internal communication
tls_skip_verify_insecure = {{ grafana_oauth_skip_tls_verify | default(true) | lower }}

[log]
# Console-only logging — systemd journal captures output, Alloy ships to Loki
mode = console
level = {{ grafana_log_level | default('info') }}

[log.console]
format = text

View File

@@ -0,0 +1,62 @@
# OAuth2-Proxy Configuration for Prometheus UI
# Authenticates users via Casdoor OIDC before proxying to Prometheus
# Red Panda Approved

# Provider Configuration (Casdoor OIDC)
provider = "oidc"
provider_display_name = "Casdoor"
oidc_issuer_url = "{{prometheus_oauth2_oidc_issuer_url}}"
client_id = "{{prometheus_oauth2_client_id}}"
client_secret = "{{prometheus_oauth2_client_secret}}"

# Redirect URL after authentication
redirect_url = "https://prometheus.{{pplg_haproxy_domain}}/oauth2/callback"

# Upstream service (Prometheus)
upstreams = [
  "http://127.0.0.1:9090"
]

# Session/Cookie Configuration — 7-day (168h) cookie, refreshed hourly.
cookie_secret = "{{prometheus_oauth2_cookie_secret}}"
cookie_name = "_oauth2_proxy_prometheus"
cookie_secure = true
cookie_httponly = true
cookie_expire = "168h"
cookie_refresh = "1h"
cookie_domains = ".{{pplg_haproxy_domain}}"
session_store_type = "cookie"

# Authentication settings
# Any email domain is accepted; access control is delegated to the IdP.
email_domains = ["*"]
oidc_email_claim = "email"
oidc_groups_claim = "groups"
# NOTE(review): unverified emails are accepted — acceptable only if Casdoor
# is the sole identity source; confirm.
insecure_oidc_allow_unverified_email = true

# Request settings
pass_access_token = false
pass_authorization_header = false
set_authorization_header = false
set_xauthrequest = true

# Logging
request_logging = true
auth_logging = true
standard_logging = true

# Network settings
http_address = "0.0.0.0:{{prometheus_proxy_port}}"
reverse_proxy = true
real_client_ip_header = "X-Forwarded-For"

# Skip authentication for health check endpoints
skip_auth_routes = [
  "^/ping$"
]

# OIDC specific settings
skip_provider_button = true
oidc_extra_audiences = []

# SSL verification
ssl_insecure_skip_verify = false

View File

@@ -0,0 +1,18 @@
[Unit]
Description=OAuth2-Proxy for Prometheus UI
# oauth2-proxy performs OIDC discovery against Casdoor at startup, so it
# needs actual network connectivity (network-online.target), not merely a
# configured network device (network.target). Restart=on-failure below
# still covers a Casdoor that comes up late.
Wants=network-online.target prometheus.service
After=network-online.target prometheus.service

[Service]
Type=simple
ExecStart=/usr/local/bin/oauth2-proxy --config={{prometheus_oauth2_proxy_dir}}/oauth2-proxy.cfg
# Retry if Casdoor/Prometheus are briefly unavailable at start.
Restart=on-failure
RestartSec=5
# Basic sandboxing
NoNewPrivileges=true
PrivateTmp=true
# Log to the journal; Alloy ships the journal to Loki.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=oauth2-proxy-prometheus

[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,27 @@
[Unit]
Description=PgAdmin4 Web Interface (Gunicorn)
After=network.target
Wants=network.target
[Service]
Type=simple
# Runs unprivileged; templated account must own the pgAdmin data/log dirs.
User={{pgadmin_user}}
Group={{pgadmin_group}}
WorkingDirectory=/usr/pgadmin4/web
# Gunicorn bound to loopback only — exposed externally via HAProxy
# (backend_pgadmin). Single worker: pgAdmin keeps per-session state
# in-process, so threads (4) provide the concurrency instead.
# Access/error logs go to stdout/stderr ("-") and land in the journal.
ExecStart=/usr/pgadmin4/venv/bin/python3 -m gunicorn pgAdmin4:app \
--bind 127.0.0.1:{{pgadmin_port}} \
--workers 1 \
--threads 4 \
--timeout 120 \
--access-logfile - \
--error-logfile -
Restart=on-failure
RestartSec=5
# Basic sandboxing
NoNewPrivileges=true
PrivateTmp=true
# Journal output is shipped to Loki by Alloy.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=pgadmin
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,127 @@
# PPLG HAProxy - Internal TLS Termination for Prospero
# Services: Grafana, PgAdmin, Prometheus (via OAuth2-Proxy), Loki, Alertmanager
# Managed by Ansible - Red Panda Approved
global
    log 127.0.0.1:{{pplg_haproxy_syslog_port}} local0
    stats timeout 30s
    # Default SSL material locations
    ca-base /etc/ssl/certs
    crt-base /etc/ssl/private
    # SSL/TLS configuration
    ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384
    ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
    ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets
defaults
    log global
    mode http
    option httplog
    option dontlognull
    log-format "%ci:%cp [%tr] %ft %b/%s %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r"
    timeout connect 5s
    timeout client 50s
    timeout server 50s
# Stats page with Prometheus metrics
listen stats
    bind *:{{pplg_haproxy_stats_port}}
    mode http
    stats enable
    # FIX: the stats page previously shared /metrics with the
    # prometheus-exporter service below; the http-request rule is evaluated
    # first and always won, leaving the human stats page unreachable.
    # Serve the stats UI on /stats instead; /metrics stays the scrape path
    # (prometheus.yml 'haproxy' job targets :8404/metrics).
    stats uri /stats
    stats refresh 15s
    stats show-legends
    stats show-node
    # Prometheus metrics endpoint
    http-request use-service prometheus-exporter if { path /metrics }
# HTTP frontend - redirect all traffic to HTTPS
frontend http_frontend
    bind *:80
    mode http
    option httplog
    http-request redirect scheme https code 301
# HTTPS frontend with subdomain-based routing
frontend https_frontend
    bind *:443 ssl crt {{pplg_haproxy_cert_path}}
    mode http
    option httplog
    option forwardfor
    # Forward original protocol and host
    http-request set-header X-Forwarded-Proto https
    http-request set-header X-Forwarded-Port %[dst_port]
    http-request set-header X-Forwarded-Host %[req.hdr(Host)]
    # Security headers
    http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains"
    http-response set-header X-Frame-Options "SAMEORIGIN"
    http-response set-header X-Content-Type-Options "nosniff"
    http-response set-header X-XSS-Protection "1; mode=block"
    # Subdomain ACLs — exact Host match. FIX: hdr_beg() did prefix matching,
    # so e.g. "grafana.<domain>.anything" also matched. Exact hdr() assumes
    # clients send no explicit :443 port in the Host header (default for
    # browsers and standard HTTP clients on the default port).
    acl host_grafana hdr(host) -i grafana.{{pplg_haproxy_domain}}
    acl host_pgadmin hdr(host) -i pgadmin.{{pplg_haproxy_domain}}
    acl host_prometheus hdr(host) -i prometheus.{{pplg_haproxy_domain}}
    acl host_loki hdr(host) -i loki.{{pplg_haproxy_domain}}
    acl host_alertmanager hdr(host) -i alertmanager.{{pplg_haproxy_domain}}
    # Prometheus write API - bypass OAuth2-Proxy (machine-to-machine)
    acl is_prometheus_write path_beg /api/v1/write
    use_backend backend_grafana if host_grafana
    use_backend backend_pgadmin if host_pgadmin
    use_backend backend_prometheus_direct if host_prometheus is_prometheus_write
    use_backend backend_prometheus if host_prometheus
    use_backend backend_loki if host_loki
    use_backend backend_alertmanager if host_alertmanager
# Grafana - Native Casdoor OAuth SSO
backend backend_grafana
    mode http
    balance roundrobin
    option httpchk GET /api/health
    http-check expect status 200
    server grafana_1 127.0.0.1:3000 check
# PgAdmin - Native Casdoor OAuth SSO
backend backend_pgadmin
    mode http
    balance roundrobin
    option httpchk GET /misc/ping
    http-check expect status 200
    server pgadmin_1 127.0.0.1:{{pgadmin_port}} check
# Prometheus UI - via OAuth2-Proxy sidecar (/ping is unauthenticated there)
backend backend_prometheus
    mode http
    balance roundrobin
    option httpchk GET /ping
    http-check expect status 200
    server prometheus_1 127.0.0.1:{{prometheus_proxy_port}} check
# Prometheus Write API - direct (no auth, machine-to-machine)
backend backend_prometheus_direct
    mode http
    balance roundrobin
    server prometheus_write_1 127.0.0.1:9090 check
# Loki - no auth (machine-to-machine log ingestion)
backend backend_loki
    mode http
    balance roundrobin
    option httpchk GET /ready
    http-check expect status 200
    server loki_1 127.0.0.1:{{loki_port}} check
# Alertmanager - internal only
backend backend_alertmanager
    mode http
    balance roundrobin
    option httpchk GET /-/healthy
    http-check expect status 200
    server alertmanager_1 127.0.0.1:{{alertmanager_port}} check

View File

@@ -0,0 +1,48 @@
---
# Red Panda Approved Prometheus Configuration
# Rendered by Ansible/Jinja2. All templated scalars are quoted so the YAML
# parser can never re-type a rendered value (durations, host:port strings,
# numeric-looking tokens).
global:
  scrape_interval: "{{ prometheus_scrape_interval }}"
  evaluation_interval: "{{ prometheus_evaluation_interval }}"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "{{ alertmanager_host }}:{{ alertmanager_port }}"

# Alert definitions live alongside this file (see alert_rules.yml template).
rule_files:
  - "alert_rules.yml"

scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Host metrics for every managed node; the Jinja to_json filter emits a
  # JSON array, which is valid YAML flow-sequence syntax.
  - job_name: 'node-exporter'
    static_configs:
      - targets: {{ prometheus_targets | to_json }}

  - job_name: 'alertmanager'
    static_configs:
      - targets: ['{{ alertmanager_host }}:{{ alertmanager_port }}']

  # HAProxy built-in prometheus-exporter on the stats listener
  - job_name: 'haproxy'
    static_configs:
      - targets: ['titania.incus:8404']
    metrics_path: '/metrics'

  # Gitea metrics, protected by a bearer token from Ansible Vault
  - job_name: 'gitea'
    static_configs:
      - targets: ['oberon.incus:22084']
    metrics_path: '/metrics'
    authorization:
      type: Bearer
      credentials: '{{ vault_gitea_metrics_token }}'

  # Casdoor metrics. NOTE(review): credentials are passed as URL query
  # parameters (Casdoor's API contract) and can surface in access logs —
  # confirm this is acceptable on the internal network.
  - job_name: 'casdoor'
    static_configs:
      - targets: ['{{ casdoor_metrics_host }}:{{ casdoor_metrics_port }}']
    metrics_path: '/api/metrics'
    params:
      accessKey: ['{{ casdoor_prometheus_access_key }}']
      accessSecret: ['{{ casdoor_prometheus_access_secret }}']