docs: rewrite README with structured overview and quick start guide
Replaces the minimal project description with a comprehensive README including a component overview table, quick start instructions, common Ansible operations, and links to detailed documentation. Aligns with Red Panda Approval™ standards.
This commit is contained in:
249
ansible/pplg/alert_rules.yml.j2
Normal file
249
ansible/pplg/alert_rules.yml.j2
Normal file
@@ -0,0 +1,249 @@
|
||||
# Prometheus Alert Rules
|
||||
# Red Panda Approved 🐼
|
||||
# Deployed to: /etc/prometheus/alert_rules.yml
|
||||
{% raw %}
|
||||
groups:
|
||||
# ============================================================================
|
||||
# Node/Infrastructure Alerts
|
||||
# ============================================================================
|
||||
- name: node_alerts
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} is down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
|
||||
|
||||
- alert: HighCPUUsage
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage on {{ $labels.instance }}"
|
||||
description: "CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"
|
||||
|
||||
- alert: CriticalCPUUsage
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Critical CPU usage on {{ $labels.instance }}"
|
||||
description: "CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage on {{ $labels.instance }}"
|
||||
description: "Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"
|
||||
|
||||
- alert: CriticalMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Critical memory usage on {{ $labels.instance }}"
|
||||
description: "Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"
|
||||
|
||||
- alert: DiskSpaceLow
|
||||
expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Low disk space on {{ $labels.instance }}"
|
||||
description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf \"%.1f\" }}%)"
|
||||
|
||||
- alert: DiskSpaceCritical
|
||||
expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Critical disk space on {{ $labels.instance }}"
|
||||
description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf \"%.1f\" }}%)"
|
||||
|
||||
- alert: HighLoadAverage
|
||||
expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High load average on {{ $labels.instance }}"
|
||||
description: "15-minute load average is {{ $value | printf \"%.2f\" }} times the CPU count on {{ $labels.instance }}"
|
||||
|
||||
# ============================================================================
|
||||
# Process-Level Alerts (puck.incus)
|
||||
# ============================================================================
|
||||
- name: puck_process_alerts
|
||||
rules:
|
||||
- alert: PuckHighCPUProcess
|
||||
expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU process on puck: {{ $labels.groupname }}"
|
||||
description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU for more than 2 minutes"
|
||||
|
||||
- alert: PuckCriticalCPUProcess
|
||||
expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Critical CPU process on puck: {{ $labels.groupname }}"
|
||||
description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU - immediate attention required"
|
||||
|
||||
- alert: PuckHighMemoryProcess
|
||||
expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory process on puck: {{ $labels.groupname }}"
|
||||
description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory"
|
||||
|
||||
- alert: PuckCriticalMemoryProcess
|
||||
expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Critical memory process on puck: {{ $labels.groupname }}"
|
||||
description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required"
|
||||
|
||||
- alert: PuckProcessCrashLoop
  # The process count is a gauge. increase() is meant for counters and treats
  # every drop as a counter reset, so it never yields a negative value and the
  # "< -1" comparison could never match. delta() reports the signed change.
  expr: delta(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Process count dropped on puck: {{ $labels.groupname }}"
    description: "Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart"
|
||||
|
||||
# ============================================================================
|
||||
# Docker Container Alerts (puck.incus)
|
||||
# ============================================================================
|
||||
- name: puck_container_alerts
|
||||
rules:
|
||||
- alert: PuckHighContainerCount
|
||||
expr: count(container_last_seen{instance=~"puck.*", name!=""}) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High container count on puck"
|
||||
description: "puck.incus has {{ $value }} running containers, which exceeds the threshold of 5"
|
||||
|
||||
- alert: PuckDuplicateContainers
|
||||
expr: count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Duplicate containers on puck: {{ $labels.image }}"
|
||||
description: "{{ $value }} containers running the same image {{ $labels.image }} on puck"
|
||||
|
||||
- alert: PuckOrphanedContainer
|
||||
expr: (time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Possible orphaned container on puck: {{ $labels.name }}"
|
||||
description: "Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}"
|
||||
|
||||
- alert: PuckMCPContainerOnPuck
|
||||
expr: container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"}
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "MCP container detected on puck (WRONG HOST)"
|
||||
description: "Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus"
|
||||
|
||||
- alert: PuckContainerHighCPU
|
||||
expr: sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU container on puck: {{ $labels.name }}"
|
||||
description: "Container {{ $labels.name }} is using {{ $value | printf \"%.1f\" }}% CPU"
|
||||
|
||||
- alert: PuckContainerHighMemory
|
||||
expr: container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory container on puck: {{ $labels.name }}"
|
||||
description: "Container {{ $labels.name }} is using {{ $value | humanize }} memory"
|
||||
|
||||
- alert: PuckContainerOOMKilled
|
||||
expr: increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Container OOM killed on puck: {{ $labels.name }}"
|
||||
description: "Container {{ $labels.name }} was killed by OOM killer"
|
||||
|
||||
# ============================================================================
|
||||
# Service/Application Alerts
|
||||
# ============================================================================
|
||||
- name: service_alerts
|
||||
rules:
|
||||
- alert: PrometheusTargetMissing
|
||||
expr: up == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Prometheus target missing: {{ $labels.instance }}"
|
||||
description: "A Prometheus target has been down for more than 5 minutes."
|
||||
|
||||
- alert: PrometheusJobMissing
|
||||
expr: absent(up{job="node-exporter"})
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Prometheus job missing"
|
||||
description: "A Prometheus job has disappeared from target discovery."
|
||||
|
||||
- alert: AlertmanagerDown
  # absent(up{...}) alone only fires when the series is missing altogether.
  # A target that is still scraped but failing reports up == 0, which is not
  # "absent". Filtering on "== 1" first makes the alert fire in both cases.
  expr: absent(up{job="alertmanager"} == 1)
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Alertmanager is down"
    description: "Alertmanager is not responding. Alerts may not be delivered."
|
||||
|
||||
# ============================================================================
|
||||
# Loki/Logging Alerts
|
||||
# ============================================================================
|
||||
- name: loki_alerts
|
||||
rules:
|
||||
- alert: LokiHighLogVolume
|
||||
expr: sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High log ingestion rate"
|
||||
description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging"
|
||||
|
||||
# Red Panda Seal of Approval 🐼
|
||||
# "If the metrics aren't red, go back to bed"
|
||||
{% endraw %}
|
||||
148
ansible/pplg/alertmanager.yml.j2
Normal file
148
ansible/pplg/alertmanager.yml.j2
Normal file
@@ -0,0 +1,148 @@
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
smtp_smarthost: '{{ smtp_host }}:{{ smtp_port }}'
|
||||
smtp_from: '{{ smtp_from }}'
|
||||
smtp_require_tls: false
|
||||
|
||||
route:
|
||||
group_by: ['alertname', 'instance', 'severity']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
receiver: 'email'
|
||||
routes:
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'email-critical'
|
||||
continue: true
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'email-warning'
|
||||
continue: true
|
||||
- match:
|
||||
severity: info
|
||||
receiver: 'email-info'
|
||||
repeat_interval: 24h
|
||||
|
||||
inhibit_rules:
  # Mute a warning-severity alert while a critical-severity alert is firing
  # with the same value for every label listed under `equal`.
  # NOTE(review): `alertname` is part of `equal`, so a critical alert only
  # inhibits a warning that carries the same alertname. Confirm the rule files
  # define warning/critical pairs under a shared name; otherwise this rule
  # never matches and `equal` should probably be reduced to ['instance'].
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
|
||||
|
||||
receivers:
|
||||
# Receiver for critical alerts: plain-text mail, also notified on resolve.
- name: 'email-critical'
  email_configs:
    - to: 'hostmaster+critical@ouranos.helu.ca'
      send_resolved: true
      # In email_configs `html` is a template string, not a boolean (that is
      # the pushover_configs meaning). `html: true` would send the literal
      # text "true" as the HTML part. An empty string disables the HTML part
      # so the custom `text` body below is what recipients see.
      html: ''
      headers:
        Subject: '🚨 [CRITICAL] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
      text: |-
        {{ "{{" }} range .Alerts {{ "}}" }}
        {{ "{{" }} .Annotations.description {{ "}}" }}

        Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
        {{ "{{" }} end {{ "}}" }}
|
||||
|
||||
# Receiver for warning alerts: plain-text mail, also notified on resolve.
- name: 'email-warning'
  email_configs:
    - to: 'hostmaster+warning@ouranos.helu.ca'
      send_resolved: true
      # In email_configs `html` is a template string, not a boolean (that is
      # the pushover_configs meaning). `html: true` would send the literal
      # text "true" as the HTML part. An empty string disables the HTML part
      # so the custom `text` body below is what recipients see.
      html: ''
      headers:
        Subject: '⚠️ [WARNING] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
      text: |-
        {{ "{{" }} range .Alerts {{ "}}" }}
        {{ "{{" }} .Annotations.description {{ "}}" }}

        Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
        {{ "{{" }} end {{ "}}" }}
|
||||
|
||||
# Receiver for informational alerts: plain-text mail, no resolve notice.
- name: 'email-info'
  email_configs:
    - to: 'hostmaster+info@ouranos.helu.ca'
      send_resolved: false
      # In email_configs `html` is a template string, not a boolean (that is
      # the pushover_configs meaning). `html: true` would send the literal
      # text "true" as the HTML part. An empty string disables the HTML part
      # so the custom `text` body below is what recipients see.
      html: ''
      headers:
        Subject: '{{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
      text: '{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}'
|
||||
|
||||
# Fallback receiver used by the top-level route.
- name: 'email'
  email_configs:
    - to: 'hostmaster+alerts@ouranos.helu.ca'
      send_resolved: true
      # In email_configs `html` is a template string, not a boolean (that is
      # the pushover_configs meaning). `html: true` would send the literal
      # text "true" as the HTML part. An empty string disables the HTML part
      # so the custom `text` body below is what recipients see.
      html: ''
      headers:
        # Alertmanager templates provide no `default` function (that is a
        # Sprig/Helm helper), so the fallback is written as if/else instead.
        Subject: '[{{ "{{" }} if .GroupLabels.severity {{ "}}" }}{{ "{{" }} .GroupLabels.severity {{ "}}" }}{{ "{{" }} else {{ "}}" }}ALERT{{ "{{" }} end {{ "}}" }}] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
      text: |-
        {{ "{{" }} range .Alerts {{ "}}" }}
        {{ "{{" }} .Annotations.description {{ "}}" }}

        Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
        Severity: {{ "{{" }} .Labels.severity {{ "}}" }}
        {{ "{{" }} end {{ "}}" }}
|
||||
|
||||
{#
# --- Pushover receivers (disabled for smtp4dev testing) ---
# This section is wrapped in a Jinja comment on purpose: Jinja evaluates
# expressions even inside YAML comments, so without the wrapper the
# pushover_user_key / pushover_api_token values would be written into the
# deployed file (or the template would fail when they are undefined).
#
# To re-enable: remove the surrounding Jinja comment markers, uncomment these
# receivers and update the route receiver names
# from email-*/email back to pushover-*/pushover
#
#   - name: 'pushover-critical'
#     pushover_configs:
#       - user_key: '{{ pushover_user_key }}'
#         token: '{{ pushover_api_token }}'
#         send_resolved: true
#         html: true
#         priority: '2'
#         retry: 30
#         expire: 3600
#         title: '🚨 [CRITICAL] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
#         message: |-
#           {{ "{{" }} range .Alerts {{ "}}" }}
#           {{ "{{" }} .Annotations.description {{ "}}" }}
#           Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
#           {{ "{{" }} end {{ "}}" }}
#
#   - name: 'pushover-warning'
#     pushover_configs:
#       - user_key: '{{ pushover_user_key }}'
#         token: '{{ pushover_api_token }}'
#         send_resolved: true
#         html: true
#         priority: '1'
#         retry: 30
#         expire: 3600
#         title: '⚠️ [WARNING] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
#         message: |-
#           {{ "{{" }} range .Alerts {{ "}}" }}
#           {{ "{{" }} .Annotations.description {{ "}}" }}
#           Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
#           {{ "{{" }} end {{ "}}" }}
#
#   - name: 'pushover-info'
#     pushover_configs:
#       - user_key: '{{ pushover_user_key }}'
#         token: '{{ pushover_api_token }}'
#         send_resolved: false
#         html: true
#         priority: '0'
#         title: '{{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
#         message: '{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}'
#
#   - name: 'pushover'
#     pushover_configs:
#       - user_key: '{{ pushover_user_key }}'
#         token: '{{ pushover_api_token }}'
#         send_resolved: true
#         html: true
#         priority: '1'
#         retry: 30
#         expire: 3600
#         title: '[{{ "{{" }} if .GroupLabels.severity {{ "}}" }}{{ "{{" }} .GroupLabels.severity {{ "}}" }}{{ "{{" }} else {{ "}}" }}ALERT{{ "{{" }} end {{ "}}" }}] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
#         message: |-
#           {{ "{{" }} range .Alerts {{ "}}" }}
#           {{ "{{" }} .Annotations.description {{ "}}" }}
#           Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
#           Severity: {{ "{{" }} .Labels.severity {{ "}}" }}
#           {{ "{{" }} end {{ "}}" }}
#}
|
||||
41
ansible/pplg/config.yml.j2
Normal file
41
ansible/pplg/config.yml.j2
Normal file
@@ -0,0 +1,41 @@
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: {{ loki_port }}
|
||||
grpc_listen_port: {{ loki_grpc_port }}
|
||||
|
||||
common:
|
||||
path_prefix: {{ loki_data_dir }}
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: {{ loki_data_dir }}/chunks
|
||||
rules_directory: {{ loki_data_dir }}/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
instance_addr: 127.0.0.1
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2024-04-01
|
||||
object_store: filesystem
|
||||
store: tsdb
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://{{ alertmanager_host }}:{{ alertmanager_port }}
|
||||
|
||||
# Red Panda Approved Configuration
|
||||
analytics:
|
||||
reporting_enabled: false
|
||||
55
ansible/pplg/config_local.py.j2
Normal file
55
ansible/pplg/config_local.py.j2
Normal file
@@ -0,0 +1,55 @@
|
||||
# PgAdmin4 Local Configuration - Managed by Ansible
|
||||
# Gunicorn-based deployment (no Apache) with Casdoor OAuth SSO
|
||||
# Red Panda Approved
|
||||
|
||||
import os
|
||||
|
||||
# Server settings
|
||||
DEFAULT_SERVER = '0.0.0.0'
|
||||
DEFAULT_SERVER_PORT = {{pgadmin_port}}
|
||||
|
||||
# Data directory
|
||||
DATA_DIR = '{{pgadmin_data_dir}}'
|
||||
SESSION_DB_PATH = os.path.join(DATA_DIR, 'sessions')
|
||||
STORAGE_DIR = os.path.join(DATA_DIR, 'storage')
|
||||
SQLITE_PATH = os.path.join(DATA_DIR, 'pgadmin4.db')
|
||||
|
||||
# Log settings
|
||||
LOG_FILE = '{{pgadmin_log_dir}}/pgadmin4.log'
|
||||
|
||||
# Default admin credentials (for initial setup)
|
||||
SETUP_EMAIL = '{{pgadmin_email}}'
|
||||
SETUP_PASSWORD = '{{pgadmin_password}}'
|
||||
|
||||
# Authentication - OAuth2 (Casdoor) + internal fallback
|
||||
AUTHENTICATION_SOURCES = ['oauth2', 'internal']
|
||||
|
||||
# Master password disabled (use OAuth)
|
||||
MASTER_PASSWORD_REQUIRED = False
|
||||
|
||||
# Reverse proxy settings (Titania HAProxy -> Prospero HAProxy -> Gunicorn)
|
||||
ENHANCED_COOKIE_PROTECTION = False
|
||||
PROXY_X_FOR_COUNT = 2
|
||||
PROXY_X_PROTO_COUNT = 2
|
||||
PROXY_X_HOST_COUNT = 2
|
||||
X_FRAME_OPTIONS = 'SAMEORIGIN'
|
||||
SESSION_COOKIE_SECURE = True
|
||||
SESSION_COOKIE_SAMESITE = 'Lax'
|
||||
WTF_CSRF_SSL_STRICT = False
|
||||
|
||||
# OAuth2 Configuration (Casdoor OIDC)
|
||||
OAUTH2_AUTO_CREATE_USER = True
|
||||
OAUTH2_CONFIG = [{
|
||||
'OAUTH2_NAME': 'Casdoor',
|
||||
'OAUTH2_DISPLAY_NAME': 'Casdoor SSO',
|
||||
'OAUTH2_CLIENT_ID': '{{pgadmin_oauth_client_id}}',
|
||||
'OAUTH2_CLIENT_SECRET': '{{pgadmin_oauth_client_secret}}',
|
||||
'OAUTH2_TOKEN_URL': 'https://id.ouranos.helu.ca/api/login/oauth/access_token',
|
||||
'OAUTH2_AUTHORIZATION_URL': 'https://id.ouranos.helu.ca/login/oauth/authorize',
|
||||
'OAUTH2_API_BASE_URL': 'https://id.ouranos.helu.ca/',
|
||||
'OAUTH2_USERINFO_ENDPOINT': 'api/userinfo',
|
||||
'OAUTH2_SERVER_METADATA_URL': 'https://id.ouranos.helu.ca/.well-known/openid-configuration',
|
||||
'OAUTH2_SCOPE': 'openid profile email',
|
||||
'OAUTH2_ICON': 'fa-openid',
|
||||
'OAUTH2_BUTTON_COLOR': '#2db7f5',
|
||||
}]
|
||||
15
ansible/pplg/datasource.yml.j2
Normal file
15
ansible/pplg/datasource.yml.j2
Normal file
@@ -0,0 +1,15 @@
|
||||
# Grafana datasource provisioning: one Prometheus and one Loki datasource.
# Templated values are quoted so that an empty, numeric-looking or
# boolean-looking expansion still reaches Grafana as a string.
apiVersion: 1
datasources:
  - name: "{{ prometheus_datasource_name }}"
    type: prometheus
    access: proxy
    url: "http://{{ prometheus_host }}:{{ prometheus_port }}"
    isDefault: true
    editable: false
    uid: "{{ prometheus_datasource_uid }}"
  - name: "{{ loki_datasource_name }}"
    type: loki
    access: proxy
    url: "http://{{ loki_host }}:{{ loki_port }}"
    editable: false
    uid: "{{ loki_datasource_uid }}"
|
||||
495
ansible/pplg/deploy.yml
Normal file
495
ansible/pplg/deploy.yml
Normal file
@@ -0,0 +1,495 @@
|
||||
---
|
||||
# PPLG - Consolidated Observability & Admin Stack for Prospero
|
||||
# PgAdmin, Prometheus, Loki, Grafana + HAProxy (TLS) + OAuth2-Proxy (Prometheus UI)
|
||||
# Red Panda Approved
|
||||
|
||||
- name: Deploy PPLG Stack
|
||||
hosts: ubuntu
|
||||
become: true
|
||||
tasks:
|
||||
- name: Check if host has pplg service
  ansible.builtin.set_fact:
    # A host that defines no `services` variable is treated as not running
    # pplg, instead of aborting the play with an undefined-variable error.
    has_pplg_service: "{{ 'pplg' in (services | default([])) }}"
|
||||
|
||||
- name: Skip hosts without pplg service
|
||||
ansible.builtin.meta: end_host
|
||||
when: not has_pplg_service
|
||||
|
||||
# ===========================================================================
|
||||
# APT Repositories
|
||||
# ===========================================================================
|
||||
|
||||
- name: Add Grafana APT repository (Grafana + Loki)
|
||||
ansible.builtin.deb822_repository:
|
||||
name: grafana
|
||||
types: [deb]
|
||||
uris: https://apt.grafana.com
|
||||
suites: [stable]
|
||||
components: [main]
|
||||
signed_by: https://apt.grafana.com/gpg.key
|
||||
state: present
|
||||
|
||||
- name: Add PgAdmin APT repository
|
||||
ansible.builtin.deb822_repository:
|
||||
name: pgadmin4
|
||||
types: [deb]
|
||||
uris: https://ftp.postgresql.org/pub/pgadmin/pgadmin4/apt/{{ansible_distribution_release}}
|
||||
suites: [pgadmin4]
|
||||
components: [main]
|
||||
signed_by: https://www.pgadmin.org/static/packages_pgadmin_org.pub
|
||||
state: present
|
||||
|
||||
# ===========================================================================
|
||||
# Package Installation
|
||||
# ===========================================================================
|
||||
|
||||
- name: Install PPLG packages
|
||||
ansible.builtin.apt:
|
||||
name:
|
||||
- acl
|
||||
- haproxy
|
||||
- prometheus
|
||||
- loki
|
||||
- grafana
|
||||
- pgadmin4-web
|
||||
state: present
|
||||
update_cache: true
|
||||
|
||||
- name: Stop and disable Apache (pulled in by pgadmin4-web)
|
||||
ansible.builtin.systemd:
|
||||
name: apache2
|
||||
state: stopped
|
||||
enabled: false
|
||||
|
||||
# ===========================================================================
|
||||
# Prometheus
|
||||
# ===========================================================================
|
||||
|
||||
- name: Fix Prometheus directory permissions
  ansible.builtin.file:
    path: /var/lib/prometheus
    state: directory
    owner: prometheus
    group: prometheus
    # Symbolic mode with capital X: directories become traversable without
    # adding the execute bit to regular data files, which a recursive
    # numeric 750 would do.
    mode: 'u=rwX,g=rX,o='
    recurse: true
|
||||
|
||||
- name: Create textfile collector directory
|
||||
ansible.builtin.file:
|
||||
path: /var/lib/prometheus/node-exporter
|
||||
state: directory
|
||||
owner: prometheus
|
||||
group: prometheus
|
||||
mode: '750'
|
||||
|
||||
- name: Template prometheus.yml
|
||||
ansible.builtin.template:
|
||||
src: prometheus.yml.j2
|
||||
dest: /etc/prometheus/prometheus.yml
|
||||
owner: prometheus
|
||||
group: prometheus
|
||||
mode: '640'
|
||||
notify: restart prometheus
|
||||
|
||||
- name: Template alert_rules.yml
|
||||
ansible.builtin.template:
|
||||
src: alert_rules.yml.j2
|
||||
dest: /etc/prometheus/alert_rules.yml
|
||||
owner: prometheus
|
||||
group: prometheus
|
||||
mode: '640'
|
||||
notify: restart prometheus
|
||||
|
||||
- name: Create Prometheus systemd override directory
|
||||
ansible.builtin.file:
|
||||
path: /etc/systemd/system/prometheus.service.d
|
||||
state: directory
|
||||
mode: '755'
|
||||
|
||||
- name: Enable remote write receiver
|
||||
ansible.builtin.copy:
|
||||
content: |
|
||||
[Service]
|
||||
ExecStart=
|
||||
ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/metrics2/ --web.console.templates=/etc/prometheus/consoles --web.console.libraries=/etc/prometheus/console_libraries --web.listen-address=0.0.0.0:9090 --web.external-url= --web.enable-remote-write-receiver
|
||||
dest: /etc/systemd/system/prometheus.service.d/override.conf
|
||||
mode: '644'
|
||||
notify: restart prometheus
|
||||
|
||||
- name: Start and enable Prometheus service
|
||||
ansible.builtin.systemd:
|
||||
name: prometheus
|
||||
state: started
|
||||
enabled: true
|
||||
daemon_reload: true
|
||||
|
||||
# ===========================================================================
|
||||
# Prometheus Alertmanager
|
||||
# ===========================================================================
|
||||
|
||||
- name: Install Alertmanager
|
||||
ansible.builtin.apt:
|
||||
name: prometheus-alertmanager
|
||||
state: present
|
||||
|
||||
- name: Create alertmanager configuration directory
|
||||
ansible.builtin.file:
|
||||
path: /etc/alertmanager
|
||||
state: directory
|
||||
owner: prometheus
|
||||
group: prometheus
|
||||
mode: '750'
|
||||
|
||||
- name: Template alertmanager.yml
|
||||
ansible.builtin.template:
|
||||
src: alertmanager.yml.j2
|
||||
dest: /etc/alertmanager/alertmanager.yml
|
||||
owner: prometheus
|
||||
group: prometheus
|
||||
mode: '640'
|
||||
notify: restart alertmanager
|
||||
|
||||
- name: Start and enable Alertmanager service
|
||||
ansible.builtin.systemd:
|
||||
name: prometheus-alertmanager
|
||||
state: started
|
||||
enabled: true
|
||||
daemon_reload: true
|
||||
|
||||
# ===========================================================================
|
||||
# Loki
|
||||
# ===========================================================================
|
||||
|
||||
- name: Create loki group
|
||||
ansible.builtin.group:
|
||||
name: "{{loki_group}}"
|
||||
|
||||
- name: Create loki user
|
||||
ansible.builtin.user:
|
||||
name: "{{loki_user}}"
|
||||
comment: "{{loki_user}}"
|
||||
group: "{{loki_group}}"
|
||||
system: true
|
||||
|
||||
- name: Create loki directories
|
||||
ansible.builtin.file:
|
||||
path: "{{item}}"
|
||||
owner: "{{loki_user}}"
|
||||
group: "{{loki_group}}"
|
||||
state: directory
|
||||
mode: '750'
|
||||
loop:
|
||||
- "{{loki_data_dir}}"
|
||||
- "{{loki_config_dir}}"
|
||||
|
||||
- name: Template Loki configuration
  ansible.builtin.template:
    src: "{{ loki_config_file }}.j2"
    dest: "{{ loki_config_dir }}/{{ loki_config_file }}"
    owner: "{{ loki_user }}"
    group: "{{ loki_group }}"
    # A config file needs no execute bit. Owner read/write plus group read
    # matches the other service configs templated by this play.
    mode: '0640'
  notify: restart loki
|
||||
|
||||
- name: Enable and start Loki service
|
||||
ansible.builtin.systemd:
|
||||
name: loki
|
||||
enabled: true
|
||||
state: started
|
||||
|
||||
# ===========================================================================
|
||||
# Grafana
|
||||
# ===========================================================================
|
||||
|
||||
- name: Create dashboards directory
|
||||
ansible.builtin.file:
|
||||
path: /var/lib/grafana/dashboards
|
||||
state: directory
|
||||
owner: grafana
|
||||
group: grafana
|
||||
mode: '750'
|
||||
|
||||
- name: Template Grafana main configuration
|
||||
ansible.builtin.template:
|
||||
src: "grafana.ini.j2"
|
||||
dest: "/etc/grafana/grafana.ini"
|
||||
owner: grafana
|
||||
group: grafana
|
||||
mode: '640'
|
||||
when: grafana_oauth_enabled | default(false)
|
||||
notify: restart grafana
|
||||
|
||||
- name: Enable and start Grafana service
|
||||
ansible.builtin.systemd:
|
||||
name: grafana-server
|
||||
enabled: true
|
||||
state: started
|
||||
daemon_reload: true
|
||||
|
||||
# ===========================================================================
|
||||
# PgAdmin (Gunicorn - no Apache)
|
||||
# ===========================================================================
|
||||
|
||||
- name: Create pgadmin group
|
||||
ansible.builtin.group:
|
||||
name: "{{pgadmin_group}}"
|
||||
system: true
|
||||
|
||||
- name: Create pgadmin user
|
||||
ansible.builtin.user:
|
||||
name: "{{pgadmin_user}}"
|
||||
comment: "PgAdmin Service"
|
||||
group: "{{pgadmin_group}}"
|
||||
system: true
|
||||
create_home: false
|
||||
shell: /usr/sbin/nologin
|
||||
|
||||
- name: Create PgAdmin directories
|
||||
ansible.builtin.file:
|
||||
path: "{{item}}"
|
||||
state: directory
|
||||
owner: "{{pgadmin_user}}"
|
||||
group: "{{pgadmin_group}}"
|
||||
mode: '750'
|
||||
loop:
|
||||
- "{{pgadmin_data_dir}}"
|
||||
- "{{pgadmin_data_dir}}/sessions"
|
||||
- "{{pgadmin_data_dir}}/storage"
|
||||
- "{{pgadmin_data_dir}}/certs"
|
||||
- "{{pgadmin_log_dir}}"
|
||||
|
||||
- name: Install gunicorn into PgAdmin venv
|
||||
ansible.builtin.command:
|
||||
cmd: /usr/pgadmin4/venv/bin/pip install gunicorn
|
||||
register: pip_gunicorn
|
||||
changed_when: "'Successfully installed' in pip_gunicorn.stdout"
|
||||
|
||||
# config_local.py has to be in place before setup-db runs: it is the file that
# sets DATA_DIR / SQLITE_PATH, so the database is created at the path the
# `creates:` guard below checks for.
- name: Template PgAdmin local config
  ansible.builtin.template:
    src: config_local.py.j2
    dest: /usr/pgadmin4/web/config_local.py
    owner: "{{ pgadmin_user }}"
    group: "{{ pgadmin_group }}"
    mode: '640'
  notify: restart pgadmin

- name: Initialize PgAdmin database
  ansible.builtin.command:
    cmd: /usr/pgadmin4/venv/bin/python3 /usr/pgadmin4/web/setup.py setup-db
    # Skipped once the SQLite file exists, which keeps the task idempotent.
    creates: "{{ pgadmin_data_dir }}/pgadmin4.db"
  become_user: "{{ pgadmin_user }}"
|
||||
|
||||
# Pulls the PostgreSQL server certificate from titania.incus to the control
# node. `flat` stores it at exactly `dest` instead of under a per-host
# directory tree.
- name: Fetch Titania PostgreSQL SSL cert
  ansible.builtin.fetch:
    src: /etc/postgresql/17/main/ssl/server.crt
    dest: /tmp/titania-postgres-ca.crt
    flat: true
  delegate_to: titania.incus
  when: "'titania.incus' in groups['ubuntu']"
|
||||
|
||||
- name: Copy Titania PostgreSQL SSL cert to PgAdmin
|
||||
ansible.builtin.copy:
|
||||
src: /tmp/titania-postgres-ca.crt
|
||||
dest: "{{pgadmin_data_dir}}/certs/titania-postgres-ca.crt"
|
||||
owner: "{{pgadmin_user}}"
|
||||
group: "{{pgadmin_group}}"
|
||||
mode: '0644'
|
||||
when: "'titania.incus' in groups['ubuntu']"
|
||||
|
||||
- name: Template PgAdmin systemd service
|
||||
ansible.builtin.template:
|
||||
src: pgadmin.service.j2
|
||||
dest: /etc/systemd/system/pgadmin.service
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
notify: restart pgadmin
|
||||
|
||||
- name: Enable and start PgAdmin service
|
||||
ansible.builtin.systemd:
|
||||
name: pgadmin
|
||||
enabled: true
|
||||
state: started
|
||||
daemon_reload: true
|
||||
|
||||
# ===========================================================================
|
||||
# OAuth2-Proxy Sidecar (Prometheus UI)
|
||||
# ===========================================================================
|
||||
|
||||
- name: Create oauth2-proxy config directory
|
||||
ansible.builtin.file:
|
||||
path: "{{prometheus_oauth2_proxy_dir}}"
|
||||
owner: root
|
||||
group: root
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Download oauth2-proxy binary
|
||||
ansible.builtin.get_url:
|
||||
url: "https://github.com/oauth2-proxy/oauth2-proxy/releases/download/v{{prometheus_oauth2_proxy_version}}/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.linux-amd64.tar.gz"
|
||||
dest: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.tar.gz"
|
||||
mode: '0644'
|
||||
|
||||
- name: Extract oauth2-proxy binary
|
||||
ansible.builtin.unarchive:
|
||||
src: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.tar.gz"
|
||||
dest: /tmp
|
||||
remote_src: true
|
||||
creates: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.linux-amd64/oauth2-proxy"
|
||||
|
||||
- name: Install oauth2-proxy binary
|
||||
ansible.builtin.copy:
|
||||
src: "/tmp/oauth2-proxy-v{{prometheus_oauth2_proxy_version}}.linux-amd64/oauth2-proxy"
|
||||
dest: /usr/local/bin/oauth2-proxy
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
remote_src: true
|
||||
|
||||
# Render the oauth2-proxy config. Mode 0600 keeps the file readable by root
# only; the template contains the client and cookie secrets.
- name: Template oauth2-proxy configuration for Prometheus
  ansible.builtin.template:
    dest: "{{prometheus_oauth2_proxy_dir}}/oauth2-proxy.cfg"
    src: oauth2-proxy-prometheus.cfg.j2
    mode: "0600"
    owner: root
    group: root
  notify: restart oauth2-proxy-prometheus
|
||||
|
||||
# Install the systemd unit for the Prometheus oauth2-proxy sidecar. A change
# notifies both a systemd reload and a service restart.
- name: Template oauth2-proxy systemd service for Prometheus
  ansible.builtin.template:
    dest: /etc/systemd/system/oauth2-proxy-prometheus.service
    src: oauth2-proxy-prometheus.service.j2
    mode: "0644"
    owner: root
    group: root
  notify:
    - reload systemd
    - restart oauth2-proxy-prometheus
|
||||
|
||||
# Reload unit definitions, then ensure the sidecar is enabled at boot and running.
- name: Enable and start OAuth2-Proxy for Prometheus
  ansible.builtin.systemd:
    daemon_reload: true
    name: oauth2-proxy-prometheus
    state: started
    enabled: true
|
||||
|
||||
# ===========================================================================
|
||||
# SSL Certificate Distribution (from Titania)
|
||||
# ===========================================================================
|
||||
|
||||
# System group for HAProxy with a pinned GID taken from inventory variables.
- name: Create haproxy group
  ansible.builtin.group:
    system: true
    name: "{{pplg_haproxy_group}}"
    gid: "{{pplg_haproxy_gid}}"
|
||||
|
||||
# System account for HAProxy with a pinned UID; its primary group is the
# HAProxy group.
- name: Create haproxy user
  ansible.builtin.user:
    system: true
    name: "{{pplg_haproxy_user}}"
    uid: "{{pplg_haproxy_uid}}"
    group: "{{pplg_haproxy_group}}"
    comment: "PPLG HAProxy"
|
||||
|
||||
# Create the HAProxy config and certificate directories, owned by the HAProxy
# account. The mode is written as a quoted four-digit octal string to match
# every other task in this play.
- name: Create HAProxy directories
  ansible.builtin.file:
    path: "{{ item }}"
    state: directory
    owner: "{{ pplg_haproxy_user }}"
    group: "{{ pplg_haproxy_group }}"
    mode: '0750'
  loop:
    - /etc/haproxy
    - /etc/haproxy/certs
|
||||
|
||||
# Pull the wildcard PEM from Titania onto the Ansible controller so the next
# task can push it to this host. Only runs when Titania is in the 'ubuntu' group.
# NOTE(review): the PEM lands at a fixed path under /tmp on the controller;
# confirm that location is acceptable if the file contains the private key.
- name: Fetch wildcard certificate from Titania
  ansible.builtin.fetch:
    src: /etc/haproxy/certs/ouranos.pem
    dest: /tmp/ouranos-haproxy.pem
    flat: true
  delegate_to: titania.incus
  when: "'titania.incus' in groups['ubuntu']"
|
||||
|
||||
# Push the PEM fetched from Titania (a controller-side file) to the HAProxy
# certificate path, readable by the HAProxy user and group only.
- name: Deploy wildcard certificate
  when: "'titania.incus' in groups['ubuntu']"
  ansible.builtin.copy:
    dest: "{{pplg_haproxy_cert_path}}"
    src: /tmp/ouranos-haproxy.pem
    mode: "0640"
    owner: "{{pplg_haproxy_user}}"
    group: "{{pplg_haproxy_group}}"
|
||||
|
||||
# Fallback when Titania is not in the 'ubuntu' group: create a one-year
# self-signed wildcard certificate. Key and certificate are written to the same
# file (-keyout and -out share a path). `creates` makes the task a no-op once
# that file exists.
# The command is passed as an argv list so no shell-style quoting is needed,
# and the module is referenced by its fully qualified name like the rest of
# this play.
- name: Generate self-signed wildcard certificate (fallback)
  ansible.builtin.command:
    argv:
      - openssl
      - req
      - "-x509"
      - "-nodes"
      - "-days"
      - "365"
      - "-newkey"
      - "rsa:2048"
      - "-keyout"
      - "{{ pplg_haproxy_cert_path }}"
      - "-out"
      - "{{ pplg_haproxy_cert_path }}"
      - "-subj"
      - "/C=US/ST=State/L=City/O=Agathos/CN=*.{{ pplg_haproxy_domain }}"
      - "-addext"
      - "subjectAltName=DNS:*.{{ pplg_haproxy_domain }},DNS:{{ pplg_haproxy_domain }}"
    creates: "{{ pplg_haproxy_cert_path }}"
  when: "'titania.incus' not in groups['ubuntu']"
|
||||
|
||||
# ===========================================================================
|
||||
# HAProxy (TLS Termination)
|
||||
# ===========================================================================
|
||||
|
||||
# Render haproxy.cfg. `validate` runs `haproxy -c` against the rendered
# temporary file (%s) before it replaces the live config. A change notifies the
# "restart haproxy" handler. The mode is a quoted four-digit octal string,
# consistent with the other tasks in this play.
- name: Template HAProxy configuration
  ansible.builtin.template:
    src: pplg-haproxy.cfg.j2
    dest: /etc/haproxy/haproxy.cfg
    owner: "{{ pplg_haproxy_user }}"
    group: "{{ pplg_haproxy_group }}"
    mode: '0640'
    validate: haproxy -c -f %s
  notify: restart haproxy
|
||||
|
||||
# Ensure HAProxy is enabled at boot and currently running.
- name: Enable and start HAProxy service
  ansible.builtin.systemd:
    name: haproxy
    state: started
    enabled: true
|
||||
|
||||
# ===========================================================================
|
||||
# Handlers
|
||||
# ===========================================================================
|
||||
# Handlers are triggered by `notify` from the tasks above. Names must match the
# notify strings exactly, and the listing order is kept as-is.
handlers:
  # Restart Prometheus, reloading systemd unit definitions first.
  - name: restart prometheus
    ansible.builtin.systemd:
      daemon_reload: true
      name: prometheus
      state: restarted

  - name: restart alertmanager
    ansible.builtin.systemd:
      state: restarted
      name: prometheus-alertmanager

  - name: restart loki
    ansible.builtin.systemd:
      state: restarted
      name: loki

  - name: restart grafana
    ansible.builtin.systemd:
      state: restarted
      name: grafana-server

  # Restart pgAdmin, reloading systemd unit definitions first.
  - name: restart pgadmin
    ansible.builtin.systemd:
      daemon_reload: true
      name: pgadmin
      state: restarted

  # Reload unit definitions only; no service is touched.
  - name: reload systemd
    ansible.builtin.systemd:
      daemon_reload: true

  # Despite its name, this handler issues a reload rather than a restart.
  - name: restart haproxy
    ansible.builtin.systemd:
      state: reloaded
      name: haproxy

  - name: restart oauth2-proxy-prometheus
    ansible.builtin.systemd:
      state: restarted
      name: oauth2-proxy-prometheus
|
||||
36
ansible/pplg/grafana.ini.j2
Normal file
36
ansible/pplg/grafana.ini.j2
Normal file
@@ -0,0 +1,36 @@
|
||||
# Grafana Configuration - Managed by Ansible
# Do not edit manually - changes will be overwritten

[server]
root_url = {{ grafana_root_url }}

[auth]
# Keep the local login form visible alongside OAuth so admins can still use
# local auth
disable_login_form = false

[auth.generic_oauth]
# Rendered lowercase, like the other boolean settings in this section
enabled = {{ grafana_oauth_enabled | lower }}
name = {{ grafana_oauth_name | default('Casdoor') }}
allow_sign_up = {{ grafana_oauth_allow_sign_up | default(true) | lower }}
client_id = {{ grafana_oauth_client_id }}
client_secret = {{ grafana_oauth_client_secret }}
scopes = {{ grafana_oauth_scopes | default('openid profile email') }}
auth_url = {{ grafana_oauth_auth_url }}
token_url = {{ grafana_oauth_token_url }}
api_url = {{ grafana_oauth_api_url }}
# Map Casdoor user attributes to Grafana
email_attribute_path = email
login_attribute_path = preferred_username
name_attribute_path = name
# Role mapping from the `groups` attribute: 'grafana-admin' -> Admin,
# 'grafana-editor' -> Editor, anything else -> Viewer
role_attribute_path = contains(groups[*], 'grafana-admin') && 'Admin' || contains(groups[*], 'grafana-editor') && 'Editor' || 'Viewer'
# TLS settings for internal communication
# NOTE(review): certificate verification is skipped unless
# grafana_oauth_skip_tls_verify is explicitly set to false - confirm this
# default is intended.
tls_skip_verify_insecure = {{ grafana_oauth_skip_tls_verify | default(true) | lower }}

[log]
# Console-only logging — systemd journal captures output, Alloy ships to Loki
mode = console
level = {{ grafana_log_level | default('info') }}

[log.console]
format = text
|
||||
62
ansible/pplg/oauth2-proxy-prometheus.cfg.j2
Normal file
62
ansible/pplg/oauth2-proxy-prometheus.cfg.j2
Normal file
@@ -0,0 +1,62 @@
|
||||
# OAuth2-Proxy Configuration for Prometheus UI
# Users authenticate against Casdoor (OIDC) before requests reach Prometheus
# Red Panda Approved

# --- Identity provider: Casdoor via OIDC ---
provider = "oidc"
provider_display_name = "Casdoor"
oidc_issuer_url = "{{prometheus_oauth2_oidc_issuer_url}}"
client_id = "{{prometheus_oauth2_client_id}}"
client_secret = "{{prometheus_oauth2_client_secret}}"

# --- Callback URL the provider redirects to after login ---
redirect_url = "https://prometheus.{{pplg_haproxy_domain}}/oauth2/callback"

# --- Upstream: the local Prometheus instance ---
upstreams = ["http://127.0.0.1:9090"]

# --- Session cookie ---
cookie_secret = "{{prometheus_oauth2_cookie_secret}}"
cookie_name = "_oauth2_proxy_prometheus"
cookie_domains = ".{{pplg_haproxy_domain}}"
cookie_secure = true
cookie_httponly = true
cookie_expire = "168h"
cookie_refresh = "1h"
session_store_type = "cookie"

# --- Who may sign in ---
# Any email domain is accepted, and unverified email claims are allowed.
email_domains = ["*"]
oidc_email_claim = "email"
oidc_groups_claim = "groups"
insecure_oidc_allow_unverified_email = true

# --- Headers: no tokens are passed on; only X-Auth-Request-* headers are set ---
pass_access_token = false
pass_authorization_header = false
set_authorization_header = false
set_xauthrequest = true

# --- Logging ---
request_logging = true
auth_logging = true
standard_logging = true

# --- Listener ---
# NOTE(review): the proxy listens on all interfaces, while the HAProxy backend
# in this role connects over 127.0.0.1 - confirm whether loopback-only binding
# would be sufficient.
http_address = "0.0.0.0:{{prometheus_proxy_port}}"
reverse_proxy = true
real_client_ip_header = "X-Forwarded-For"

# --- Unauthenticated routes: health check endpoint only ---
skip_auth_routes = ["^/ping$"]

# --- OIDC specifics ---
skip_provider_button = true
oidc_extra_audiences = []

# --- TLS verification towards the provider stays enabled ---
ssl_insecure_skip_verify = false
|
||||
18
ansible/pplg/oauth2-proxy-prometheus.service.j2
Normal file
18
ansible/pplg/oauth2-proxy-prometheus.service.j2
Normal file
@@ -0,0 +1,18 @@
|
||||
# systemd unit for the oauth2-proxy sidecar in front of the Prometheus UI.
[Unit]
Description=OAuth2-Proxy for Prometheus UI
# Ordered after, and pulling in, the Prometheus service it proxies to
Wants=prometheus.service
After=network.target prometheus.service

[Service]
Type=simple
ExecStart=/usr/local/bin/oauth2-proxy --config={{prometheus_oauth2_proxy_dir}}/oauth2-proxy.cfg
# Restart after a 5 second pause on failure
RestartSec=5
Restart=on-failure
# Sandboxing
PrivateTmp=true
NoNewPrivileges=true
# Both output streams go to the journal under a dedicated identifier
SyslogIdentifier=oauth2-proxy-prometheus
StandardError=journal
StandardOutput=journal

[Install]
WantedBy=multi-user.target
|
||||
27
ansible/pplg/pgadmin.service.j2
Normal file
27
ansible/pplg/pgadmin.service.j2
Normal file
@@ -0,0 +1,27 @@
|
||||
# systemd unit that runs pgAdmin4 under Gunicorn as a dedicated user.
[Unit]
Description=PgAdmin4 Web Interface (Gunicorn)
Wants=network.target
After=network.target

[Service]
Type=simple
Group={{pgadmin_group}}
User={{pgadmin_user}}
WorkingDirectory=/usr/pgadmin4/web
# Gunicorn binds to loopback only, with one worker and four threads; "-" is
# given as the access and error log file.
ExecStart=/usr/pgadmin4/venv/bin/python3 -m gunicorn pgAdmin4:app \
    --bind 127.0.0.1:{{pgadmin_port}} \
    --workers 1 \
    --threads 4 \
    --timeout 120 \
    --access-logfile - \
    --error-logfile -
# Restart after a 5 second pause on failure
RestartSec=5
Restart=on-failure
# Sandboxing
PrivateTmp=true
NoNewPrivileges=true
# Both output streams go to the journal under a dedicated identifier
SyslogIdentifier=pgadmin
StandardError=journal
StandardOutput=journal

[Install]
WantedBy=multi-user.target
|
||||
127
ansible/pplg/pplg-haproxy.cfg.j2
Normal file
127
ansible/pplg/pplg-haproxy.cfg.j2
Normal file
@@ -0,0 +1,127 @@
|
||||
# PPLG HAProxy - Internal TLS Termination for Prospero
# Services: Grafana, PgAdmin, Prometheus (via OAuth2-Proxy), Loki, Alertmanager
# Managed by Ansible - Red Panda Approved

global
    # Log to the local syslog listener on the templated port, facility local0
    log 127.0.0.1:{{pplg_haproxy_syslog_port}} local0
    stats timeout 30s

    # Base directories for CA certificates and certificate/key material
    ca-base /etc/ssl/certs
    crt-base /etc/ssl/private

    # TLS policy for all binds: AES-GCM ECDHE ciphers for TLS 1.2, the three
    # listed TLS 1.3 suites, TLS 1.2 as the minimum version, no session tickets
    ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384
    ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
    ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets
|
||||
|
||||
# Settings inherited by every proxy section below unless overridden there.
defaults
    mode http
    log global
    option dontlognull
    # NOTE(review): log-format is declared after `option httplog` here, and the
    # frontends declare `option httplog` again - confirm which format is
    # intended to apply where.
    option httplog
    log-format "%ci:%cp [%tr] %ft %b/%s %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r"
    # Timeouts: 5s to connect to a server, 50s of client or server inactivity
    timeout connect 5s
    timeout client 50s
    timeout server 50s
|
||||
|
||||
# Stats listener: Prometheus metrics exporter plus the HTML stats page
listen stats
    bind *:{{pplg_haproxy_stats_port}}
    mode http

    # Prometheus metrics endpoint (scrape target)
    http-request use-service prometheus-exporter if { path /metrics }

    # HTML stats page. It is served on its own URI because /metrics is already
    # claimed by the exporter rule above; sharing that path would leave the
    # stats page unreachable.
    stats enable
    stats uri /stats
    stats refresh 15s
    stats show-legends
    stats show-node
|
||||
|
||||
# Plain-HTTP frontend: every request receives a 301 redirect to its https:// equivalent
frontend http_frontend
    mode http
    bind *:80
    option httplog
    http-request redirect scheme https code 301
|
||||
|
||||
# TLS-terminating frontend; the Host header selects the backend
frontend https_frontend
    mode http
    bind *:443 ssl crt {{pplg_haproxy_cert_path}}
    option httplog
    # Adds X-Forwarded-For with the client address
    option forwardfor

    # Tell backends the original scheme, port and host
    http-request set-header X-Forwarded-Proto https
    http-request set-header X-Forwarded-Port %[dst_port]
    http-request set-header X-Forwarded-Host %[req.hdr(Host)]

    # Response hardening headers
    http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains"
    http-response set-header X-Frame-Options "SAMEORIGIN"
    http-response set-header X-Content-Type-Options "nosniff"
    http-response set-header X-XSS-Protection "1; mode=block"

    # One ACL per service subdomain (case-insensitive prefix match on Host)
    acl host_grafana hdr_beg(host) -i grafana.{{pplg_haproxy_domain}}
    acl host_pgadmin hdr_beg(host) -i pgadmin.{{pplg_haproxy_domain}}
    acl host_prometheus hdr_beg(host) -i prometheus.{{pplg_haproxy_domain}}
    acl host_loki hdr_beg(host) -i loki.{{pplg_haproxy_domain}}
    acl host_alertmanager hdr_beg(host) -i alertmanager.{{pplg_haproxy_domain}}

    # Prometheus write API - bypass OAuth2-Proxy (machine-to-machine)
    acl is_prometheus_write path_beg /api/v1/write

    # Routing. The write-API rule is listed before the general Prometheus rule
    # so that it is matched first.
    use_backend backend_grafana if host_grafana
    use_backend backend_pgadmin if host_pgadmin
    use_backend backend_prometheus_direct if host_prometheus is_prometheus_write
    use_backend backend_prometheus if host_prometheus
    use_backend backend_loki if host_loki
    use_backend backend_alertmanager if host_alertmanager
|
||||
|
||||
# Grafana - Native Casdoor OAuth SSO
# Health check: GET /api/health must return 200
backend backend_grafana
    mode http
    balance roundrobin
    option httpchk GET /api/health
    http-check expect status 200
    server grafana_1 127.0.0.1:3000 check

# PgAdmin - Native Casdoor OAuth SSO
# Health check: GET /misc/ping must return 200
backend backend_pgadmin
    mode http
    balance roundrobin
    option httpchk GET /misc/ping
    http-check expect status 200
    server pgadmin_1 127.0.0.1:{{pgadmin_port}} check

# Prometheus UI - via OAuth2-Proxy sidecar
# Health check: GET /ping (on the sidecar's port) must return 200
backend backend_prometheus
    mode http
    balance roundrobin
    option httpchk GET /ping
    http-check expect status 200
    server prometheus_1 127.0.0.1:{{prometheus_proxy_port}} check

# Prometheus Write API - direct (no auth, machine-to-machine)
# No HTTP health check is configured for this backend
backend backend_prometheus_direct
    mode http
    balance roundrobin
    server prometheus_write_1 127.0.0.1:9090 check

# Loki - no auth (machine-to-machine log ingestion)
# Health check: GET /ready must return 200
backend backend_loki
    mode http
    balance roundrobin
    option httpchk GET /ready
    http-check expect status 200
    server loki_1 127.0.0.1:{{loki_port}} check

# Alertmanager - internal only
# Health check: GET /-/healthy must return 200
backend backend_alertmanager
    mode http
    balance roundrobin
    option httpchk GET /-/healthy
    http-check expect status 200
    server alertmanager_1 127.0.0.1:{{alertmanager_port}} check
|
||||
48
ansible/pplg/prometheus.yml.j2
Normal file
48
ansible/pplg/prometheus.yml.j2
Normal file
@@ -0,0 +1,48 @@
|
||||
# Prometheus server configuration (Jinja2 template rendered by Ansible).
global:
  scrape_interval: {{ prometheus_scrape_interval }}
  evaluation_interval: {{ prometheus_evaluation_interval }}

# Where fired alerts are sent. The target is quoted, as in the 'alertmanager'
# scrape job below, so the rendered host:port is always read as a string.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '{{ alertmanager_host }}:{{ alertmanager_port }}'

rule_files:
  - "alert_rules.yml"

scrape_configs:
  # Prometheus scraping itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Target list is rendered from the prometheus_targets variable as a JSON array
  - job_name: 'node-exporter'
    static_configs:
      - targets: {{ prometheus_targets | to_json }}

  - job_name: 'alertmanager'
    static_configs:
      - targets: ['{{ alertmanager_host }}:{{ alertmanager_port }}']

  - job_name: 'haproxy'
    static_configs:
      - targets: ['titania.incus:8404']
    metrics_path: '/metrics'

  # Gitea metrics are scraped with a bearer token
  - job_name: 'gitea'
    static_configs:
      - targets: ['oberon.incus:22084']
    metrics_path: '/metrics'
    authorization:
      type: Bearer
      credentials: '{{ vault_gitea_metrics_token }}'

  # Casdoor metrics; the access key and secret are sent as URL query parameters
  - job_name: 'casdoor'
    static_configs:
      - targets: ['{{ casdoor_metrics_host }}:{{ casdoor_metrics_port }}']
    metrics_path: '/api/metrics'
    params:
      accessKey: ['{{ casdoor_prometheus_access_key }}']
      accessSecret: ['{{ casdoor_prometheus_access_secret }}']

# Red Panda Approved Prometheus Configuration
|
||||
Reference in New Issue
Block a user