docs: rewrite README with structured overview and quick start guide

Replaces the minimal project description with a comprehensive README
including a component overview table, quick start instructions, common
Ansible operations, and links to detailed documentation. Aligns with
Red Panda Approval™ standards.
2026-03-03 12:49:06 +00:00
parent c7be03a743
commit b4d60f2f38
219 changed files with 34586 additions and 2 deletions

View File

@@ -0,0 +1,249 @@
# Prometheus Alert Rules
# Red Panda Approved 🐼
# Deployed to: /etc/prometheus/alert_rules.yml
{% raw %}
groups:
  # ==========================================================================
  # Node/Infrastructure Alerts
  # ==========================================================================
  - name: node_alerts
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."

      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"

      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"

      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"

      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf \"%.1f\" }}%)"

      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf \"%.1f\" }}%)"

      - alert: HighLoadAverage
        expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          description: "15-minute load average is {{ $value | printf \"%.2f\" }} times the CPU count on {{ $labels.instance }}"

  # ==========================================================================
  # Process-Level Alerts (puck.incus)
  # ==========================================================================
  - name: puck_process_alerts
    rules:
      - alert: PuckHighCPUProcess
        expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU process on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU for more than 2 minutes"

      - alert: PuckCriticalCPUProcess
        expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU process on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU - immediate attention required"

      - alert: PuckHighMemoryProcess
        expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824  # 1 GiB
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory process on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory"

      - alert: PuckCriticalMemoryProcess
        expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648  # 2 GiB
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory process on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required"

      - alert: PuckProcessCrashLoop
        # num_procs is a gauge; delta() can return a negative change, whereas
        # increase() treats a decrease as a counter reset and never goes negative
        expr: delta(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Process count dropped on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart"

  # ==========================================================================
  # Docker Container Alerts (puck.incus)
  # ==========================================================================
  - name: puck_container_alerts
    rules:
      - alert: PuckHighContainerCount
        expr: count(container_last_seen{instance=~"puck.*", name!=""}) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High container count on puck"
          description: "puck.incus has {{ $value }} running containers, which exceeds the threshold of 5"

      - alert: PuckDuplicateContainers
        expr: count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Duplicate containers on puck: {{ $labels.image }}"
          description: "{{ $value }} containers running the same image {{ $labels.image }} on puck"

      - alert: PuckOrphanedContainer
        expr: (time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Possible orphaned container on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}"

      - alert: PuckMCPContainerOnPuck
        expr: container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"}
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "MCP container detected on puck (WRONG HOST)"
          description: "Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus"

      - alert: PuckContainerHighCPU
        expr: sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU container on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value | printf \"%.1f\" }}% CPU"

      - alert: PuckContainerHighMemory
        expr: container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824  # 1 GiB
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory container on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value | humanize }} memory"

      - alert: PuckContainerOOMKilled
        expr: increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container OOM killed on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} was killed by OOM killer"

  # ==========================================================================
  # Service/Application Alerts
  # ==========================================================================
  - name: service_alerts
    rules:
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus target missing: {{ $labels.instance }}"
          description: "A Prometheus target has been down for more than 5 minutes."

      - alert: PrometheusJobMissing
        expr: absent(up{job="node-exporter"})
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus job missing"
          description: "A Prometheus job has disappeared from target discovery."

      - alert: AlertmanagerDown
        expr: absent(up{job="alertmanager"})
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is down"
          description: "Alertmanager is not responding. Alerts may not be delivered."

  # ==========================================================================
  # Loki/Logging Alerts
  # ==========================================================================
  - name: loki_alerts
    rules:
      - alert: LokiHighLogVolume
        expr: sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760  # 10 MiB/s
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High log ingestion rate"
          description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging"

# Red Panda Seal of Approval 🐼
# "If the metrics aren't red, go back to bed"
{% endraw %}
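
Because this template is Jinja-rendered, validation has to run against the deployed file on the target host. promtool, which ships in the Debian/Ubuntu prometheus package, checks both the YAML structure and every PromQL expression:

    promtool check rules /etc/prometheus/alert_rules.yml

A clean run reports the number of rules found per group; any parse error exits nonzero, which makes this easy to wire into a pre-restart check.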

View File

@@ -0,0 +1,92 @@
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'instance', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'pushover'
  routes:
    - match:
        severity: critical
      receiver: 'pushover-critical'
      continue: true
    - match:
        severity: warning
      receiver: 'pushover-warning'
      continue: true
    - match:
        severity: info
      receiver: 'pushover-info'
      repeat_interval: 24h

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']

receivers:
  - name: 'pushover-critical'
    pushover_configs:
      - user_key: '{{ pushover_user_key }}'
        token: '{{ pushover_api_token }}'
        send_resolved: true
        html: true
        priority: '2'  # Emergency priority
        retry: 30s     # Alertmanager durations take a unit suffix
        expire: 1h
        title: '🚨 [CRITICAL] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        message: |-
          {{ "{{" }} range .Alerts {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}
          Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
          {{ "{{" }} end {{ "}}" }}
  - name: 'pushover-warning'
    pushover_configs:
      - user_key: '{{ pushover_user_key }}'
        token: '{{ pushover_api_token }}'
        send_resolved: true
        html: true
        priority: '1'  # High priority
        retry: 30s
        expire: 1h
        title: '⚠️ [WARNING] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        message: |-
          {{ "{{" }} range .Alerts {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}
          Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
          {{ "{{" }} end {{ "}}" }}
  - name: 'pushover-info'
    pushover_configs:
      - user_key: '{{ pushover_user_key }}'
        token: '{{ pushover_api_token }}'
        send_resolved: false
        html: true
        priority: '0'  # Normal priority
        title: '{{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        message: '{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}'
  - name: 'pushover'
    pushover_configs:
      - user_key: '{{ pushover_user_key }}'
        token: '{{ pushover_api_token }}'
        send_resolved: true
        html: true
        priority: '1'
        retry: 30s
        expire: 1h
        title: '[{{ "{{" }} .GroupLabels.severity | default "ALERT" {{ "}}" }}] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        message: |-
          {{ "{{" }} range .Alerts {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}
          Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
          Severity: {{ "{{" }} .Labels.severity {{ "}}" }}
          {{ "{{" }} end {{ "}}" }}

View File

@@ -0,0 +1,48 @@
---
- name: Deploy Prometheus Alertmanager with Pushover
  hosts: ubuntu
  become: true
  tasks:
    - name: Check if host has alertmanager service
      ansible.builtin.set_fact:
        has_alertmanager_service: "{{ 'alertmanager' in services }}"

    - name: Skip hosts without alertmanager service
      ansible.builtin.meta: end_host
      when: not has_alertmanager_service

    - name: Install Alertmanager
      ansible.builtin.apt:
        name: prometheus-alertmanager
        state: present
        update_cache: true

    - name: Create alertmanager config directory
      ansible.builtin.file:
        path: /etc/alertmanager
        state: directory
        owner: prometheus
        group: prometheus
        mode: '750'

    - name: Template alertmanager configuration
      ansible.builtin.template:
        src: prometheus/alertmanager.yml
        dest: /etc/alertmanager/alertmanager.yml
        owner: prometheus
        group: prometheus
        mode: '550'
      notify: restart alertmanager

    - name: Start and enable Alertmanager service
      ansible.builtin.systemd:
        name: prometheus-alertmanager
        state: started
        enabled: true
        daemon_reload: true

  handlers:
    - name: restart alertmanager
      ansible.builtin.systemd:
        name: prometheus-alertmanager
        state: restarted
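
A check-mode dry run is a safe way to preview what this play would change; the playbook and inventory file names here are placeholders for wherever the repo keeps them:

    ansible-playbook -i inventory.ini deploy-alertmanager.yml --check --diff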

View File

@@ -0,0 +1,82 @@
---
- name: Deploy Prometheus
  hosts: ubuntu
  become: true
  tasks:
    - name: Check if host has prometheus service
      ansible.builtin.set_fact:
        has_prometheus_service: "{{ 'prometheus' in services }}"

    - name: Skip hosts without prometheus service
      ansible.builtin.meta: end_host
      when: not has_prometheus_service

    - name: Install Prometheus
      ansible.builtin.apt:
        name: prometheus
        state: present
        update_cache: true

    - name: Fix Prometheus directory permissions
      ansible.builtin.file:
        path: /var/lib/prometheus
        owner: prometheus
        group: prometheus
        mode: '750'
        recurse: true

    - name: Create textfile collector directory
      ansible.builtin.file:
        path: /var/lib/prometheus/node-exporter
        state: directory
        owner: prometheus
        group: prometheus
        mode: '750'

    - name: Template prometheus.yml to Prospero
      ansible.builtin.template:
        src: prometheus.yml.j2
        dest: /etc/prometheus/prometheus.yml
        owner: prometheus
        group: prometheus
        mode: '640'
      notify: restart prometheus

    - name: Template alert_rules.yml to Prospero
      ansible.builtin.template:
        src: alert_rules.yml.j2
        dest: /etc/prometheus/alert_rules.yml
        owner: prometheus
        group: prometheus
        mode: '640'
      notify: restart prometheus

    - name: Create Prometheus systemd override directory
      ansible.builtin.file:
        path: /etc/systemd/system/prometheus.service.d
        state: directory
        mode: '755'

    - name: Enable remote write receiver
      ansible.builtin.copy:
        content: |
          [Service]
          ExecStart=
          ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/metrics2/ --web.console.templates=/etc/prometheus/consoles --web.console.libraries=/etc/prometheus/console_libraries --web.listen-address=0.0.0.0:9090 --web.external-url= --web.enable-remote-write-receiver
        dest: /etc/systemd/system/prometheus.service.d/override.conf
        mode: '644'
      notify: restart prometheus

    - name: Start and enable Prometheus service
      ansible.builtin.systemd:
        name: prometheus
        state: started
        enabled: true
        daemon_reload: true

  handlers:
    - name: restart prometheus
      ansible.builtin.systemd:
        name: prometheus
        state: restarted
        daemon_reload: true
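
The same check-mode pattern works here; the file names are again placeholders, and --limit narrows the run to the monitoring host (assuming it is named prospero.incus, in line with the other .incus hosts):

    ansible-playbook -i inventory.ini deploy-prometheus.yml --check --diff --limit prospero.incus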

View File

@@ -0,0 +1,14 @@
---
- hosts: ubuntu
  become: true
  tasks:
    - name: Update all apt packages
      ansible.builtin.apt:
        name: "*"
        state: latest
        update_cache: true

    - name: Install Prometheus Node Exporter
      ansible.builtin.apt:
        name: prometheus-node-exporter
        state: present
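
Once installed, node exporter listens on its default port 9100; a quick probe from the host confirms metrics are being served:

    curl -s http://localhost:9100/metrics | grep '^node_load'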

View File

@@ -0,0 +1,48 @@
global:
  scrape_interval: {{ prometheus_scrape_interval }}
  evaluation_interval: {{ prometheus_evaluation_interval }}

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - {{ alertmanager_host }}:{{ alertmanager_port }}

rule_files:
  - "alert_rules.yml"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node-exporter'
    static_configs:
      - targets: {{ prometheus_targets | to_json }}

  - job_name: 'alertmanager'
    static_configs:
      - targets: ['{{ alertmanager_host }}:{{ alertmanager_port }}']

  - job_name: 'haproxy'
    static_configs:
      - targets: ['titania.incus:8404']
    metrics_path: '/metrics'

  - job_name: 'gitea'
    static_configs:
      - targets: ['oberon.incus:22084']
    metrics_path: '/metrics'
    authorization:
      type: Bearer
      credentials: '{{ vault_gitea_metrics_token }}'

  - job_name: 'casdoor'
    static_configs:
      - targets: ['{{ casdoor_metrics_host }}:{{ casdoor_metrics_port }}']
    metrics_path: '/api/metrics'
    params:
      accessKey: ['{{ casdoor_prometheus_access_key }}']
      accessSecret: ['{{ casdoor_prometheus_access_secret }}']

# Red Panda Approved Prometheus Configuration
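
promtool validates this file too, once rendered on Prospero, and it follows rule_files, so the alert rules above are checked in the same pass:

    promtool check config /etc/prometheus/prometheus.yml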