docs: rewrite README with structured overview and quick start guide
Replaces the minimal project description with a comprehensive README including a component overview table, quick start instructions, common Ansible operations, and links to detailed documentation. Aligns with Red Panda Approval™ standards.
This commit is contained in:
249
ansible/prometheus/alert_rules.yml.j2
Normal file
249
ansible/prometheus/alert_rules.yml.j2
Normal file
@@ -0,0 +1,249 @@
# Prometheus Alert Rules
# Red Panda Approved 🐼
# Deployed to: /etc/prometheus/alert_rules.yml
#
# NOTE: the whole file is wrapped in {% raw %} so Ansible's Jinja2 pass leaves
# the Go-template expressions ({{ $labels.* }}, {{ $value }}) for Prometheus.
{% raw %}
groups:
  # ============================================================================
  # Node/Infrastructure Alerts
  # ============================================================================
  - name: node_alerts
    rules:
      # Any scrape target unreachable for 2 minutes.
      # NOTE(review): overlaps with PrometheusTargetMissing below (same expr,
      # longer "for") — consider consolidating; the inhibit rule will not
      # suppress the warning because the alertnames differ.
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."

      # CPU busy fraction derived from the idle counter, averaged per instance.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"

      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"

      # Memory pressure based on MemAvailable (accounts for reclaimable cache).
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 80% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"

      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 95% on {{ $labels.instance }} (current value: {{ $value | printf \"%.1f\" }}%)"

      # Per-mountpoint free space; tmpfs/overlay excluded to avoid noise from
      # ephemeral filesystems.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 20% free space (current value: {{ $value | printf \"%.1f\" }}%)"

      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has less than 10% free space (current value: {{ $value | printf \"%.1f\" }}%)"

      # 15-minute load normalized by CPU count (counted from idle-mode series).
      - alert: HighLoadAverage
        expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          description: "15-minute load average is {{ $value | printf \"%.2f\" }} times the CPU count on {{ $labels.instance }}"

  # ============================================================================
  # Process-Level Alerts (puck.incus)
  # ============================================================================
  # Sourced from process-exporter (namedprocess_* metrics), scoped to puck.*
  - name: puck_process_alerts
    rules:
      - alert: PuckHighCPUProcess
        expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[2m])) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU process on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU for more than 2 minutes"

      - alert: PuckCriticalCPUProcess
        expr: sum by (groupname, instance) (rate(namedprocess_namegroup_cpu_seconds_total{instance=~"puck.*"}[1m])) * 100 > 95
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Critical CPU process on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} is using {{ $value | printf \"%.1f\" }}% CPU - immediate attention required"

      # 1073741824 bytes = 1 GiB resident memory.
      - alert: PuckHighMemoryProcess
        expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 1073741824
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory process on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory"

      # 2147483648 bytes = 2 GiB resident memory.
      - alert: PuckCriticalMemoryProcess
        expr: namedprocess_namegroup_memory_bytes{memtype="resident", instance=~"puck.*"} > 2147483648
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Critical memory process on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} is using {{ $value | humanize }} resident memory - immediate attention required"

      # A drop of more than one process in the group within 5m suggests a
      # crash/restart loop rather than a normal exit.
      - alert: PuckProcessCrashLoop
        expr: increase(namedprocess_namegroup_num_procs{instance=~"puck.*"}[5m]) < -1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Process count dropped on puck: {{ $labels.groupname }}"
          description: "Process {{ $labels.groupname }} count has decreased, indicating possible crash or restart"

  # ============================================================================
  # Docker Container Alerts (puck.incus)
  # ============================================================================
  # Sourced from cAdvisor (container_* metrics); name!="" filters out the
  # pause/root cgroup series.
  - name: puck_container_alerts
    rules:
      - alert: PuckHighContainerCount
        expr: count(container_last_seen{instance=~"puck.*", name!=""}) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High container count on puck"
          description: "puck.incus has {{ $value }} running containers, which exceeds the threshold of 5"

      - alert: PuckDuplicateContainers
        expr: count by (image, instance) (container_last_seen{instance=~"puck.*", name!=""}) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Duplicate containers on puck: {{ $labels.image }}"
          description: "{{ $value }} containers running the same image {{ $labels.image }} on puck"

      # name=~".*_.*" matches Docker's auto-generated adjective_noun names;
      # a long-lived one is likely an orphan from a manual `docker run`.
      - alert: PuckOrphanedContainer
        expr: (time() - container_start_time_seconds{instance=~"puck.*", name=~".*_.*"}) > 3600
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Possible orphaned container on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} with auto-generated name has been running for {{ $value | humanizeDuration }}"

      # Placement guard: fires whenever the series exists at all (no
      # comparison operator — presence of the metric is the condition).
      - alert: PuckMCPContainerOnPuck
        expr: container_last_seen{instance=~"puck.*", image=~".*mcp-server.*|.*mcp_server.*"}
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "MCP container detected on puck (WRONG HOST)"
          description: "Container {{ $labels.name }} ({{ $labels.image }}) is running on puck but MCP servers should run on miranda.incus"

      - alert: PuckContainerHighCPU
        expr: sum by (name, instance) (rate(container_cpu_usage_seconds_total{instance=~"puck.*", name!=""}[2m])) * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU container on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value | printf \"%.1f\" }}% CPU"

      # 1073741824 bytes = 1 GiB.
      - alert: PuckContainerHighMemory
        expr: container_memory_usage_bytes{instance=~"puck.*", name!=""} > 1073741824
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory container on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using {{ $value | humanize }} memory"

      # for: 0m — page immediately on any OOM kill.
      - alert: PuckContainerOOMKilled
        expr: increase(container_oom_events_total{instance=~"puck.*", name!=""}[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container OOM killed on puck: {{ $labels.name }}"
          description: "Container {{ $labels.name }} was killed by OOM killer"

  # ============================================================================
  # Service/Application Alerts
  # ============================================================================
  - name: service_alerts
    rules:
      # NOTE(review): same expression as InstanceDown above with a longer
      # "for" and lower severity — both will fire for a sustained outage.
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus target missing: {{ $labels.instance }}"
          description: "A Prometheus target has been down for more than 5 minutes."

      # absent() catches the job disappearing entirely from service discovery,
      # which up == 0 cannot (no series exists to compare).
      - alert: PrometheusJobMissing
        expr: absent(up{job="node-exporter"})
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus job missing"
          description: "A Prometheus job has disappeared from target discovery."

      - alert: AlertmanagerDown
        expr: absent(up{job="alertmanager"})
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager is down"
          description: "Alertmanager is not responding. Alerts may not be delivered."

  # ============================================================================
  # Loki/Logging Alerts
  # ============================================================================
  - name: loki_alerts
    rules:
      # 10485760 bytes/s = 10 MiB/s ingestion across all distributors.
      - alert: LokiHighLogVolume
        expr: sum(rate(loki_distributor_bytes_received_total[5m])) > 10485760
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High log ingestion rate"
          description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging"

# Red Panda Seal of Approval 🐼
# "If the metrics aren't red, go back to bed"
{% endraw %}
92
ansible/prometheus/alertmanager.yml
Normal file
92
ansible/prometheus/alertmanager.yml
Normal file
@@ -0,0 +1,92 @@
# Alertmanager configuration (templated by Ansible; {{ "{{" }} ... {{ "}}" }}
# escapes produce literal Go-template braces for Alertmanager's notifier).
global:  # was truncated to "lobal:" — Alertmanager would reject the config
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'instance', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'pushover'
  routes:
    # NOTE(review): `match` is deprecated in Alertmanager >= 0.22 in favour of
    # `matchers`; kept here for compatibility with the packaged version.
    - match:
        severity: critical
      receiver: 'pushover-critical'
      continue: true
    - match:
        severity: warning
      receiver: 'pushover-warning'
      continue: true
    - match:
        severity: info
      receiver: 'pushover-info'
      repeat_interval: 24h

# Suppress the warning-level duplicate while the critical alert for the same
# alertname/instance is firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']

receivers:
  - name: 'pushover-critical'
    pushover_configs:
      - user_key: '{{ pushover_user_key }}'
        token: '{{ pushover_api_token }}'
        send_resolved: true
        html: true
        priority: '2'  # Emergency priority
        # retry/expire are Prometheus durations and require a unit;
        # bare "30"/"3600" fail config parsing.
        retry: 30s
        expire: 1h
        title: '🚨 [CRITICAL] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        message: |-
          {{ "{{" }} range .Alerts {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}

          Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
          {{ "{{" }} end {{ "}}" }}

  - name: 'pushover-warning'
    pushover_configs:
      - user_key: '{{ pushover_user_key }}'
        token: '{{ pushover_api_token }}'
        send_resolved: true
        html: true
        priority: '1'  # High priority
        retry: 30s
        expire: 1h
        title: '⚠️ [WARNING] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        message: |-
          {{ "{{" }} range .Alerts {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}

          Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
          {{ "{{" }} end {{ "}}" }}

  - name: 'pushover-info'
    pushover_configs:
      - user_key: '{{ pushover_user_key }}'
        token: '{{ pushover_api_token }}'
        send_resolved: false
        html: true
        priority: '0'  # Normal priority
        title: '{{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        message: '{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}'

  # Default receiver for anything not matched by a severity-specific route.
  - name: 'pushover'
    pushover_configs:
      - user_key: '{{ pushover_user_key }}'
        token: '{{ pushover_api_token }}'
        send_resolved: true
        html: true
        priority: '1'
        retry: 30s
        expire: 1h
        title: '[{{ "{{" }} .GroupLabels.severity | default "ALERT" {{ "}}" }}] {{ "{{" }} .GroupLabels.alertname {{ "}}" }}'
        message: |-
          {{ "{{" }} range .Alerts {{ "}}" }}
          {{ "{{" }} .Annotations.description {{ "}}" }}

          Instance: {{ "{{" }} .Labels.instance {{ "}}" }}
          Severity: {{ "{{" }} .Labels.severity {{ "}}" }}
          {{ "{{" }} end {{ "}}" }}
48
ansible/prometheus/alertmanager_deploy.yml
Normal file
48
ansible/prometheus/alertmanager_deploy.yml
Normal file
@@ -0,0 +1,48 @@
---
# Installs prometheus-alertmanager from apt and deploys the Pushover-enabled
# configuration. Hosts opt in via the `services` list.
- name: Deploy Prometheus Alertmanager with Pushover
  hosts: ubuntu
  become: true
  tasks:
    # NOTE(review): assumes `services` is defined per host (inventory var or a
    # prior service_facts run) — confirm, otherwise this is undefined.
    - name: Check if host has alertmanager service
      ansible.builtin.set_fact:
        has_alertmanager_service: "{{ 'alertmanager' in services }}"

    - name: Skip hosts without alertmanager service
      ansible.builtin.meta: end_host
      when: not has_alertmanager_service

    - name: Install Alertmanager
      ansible.builtin.apt:
        name: prometheus-alertmanager
        state: present
        update_cache: true

    - name: Create alertmanager config directory
      ansible.builtin.file:
        path: /etc/alertmanager
        state: directory
        owner: prometheus
        group: prometheus
        mode: '0750'

    - name: Template alertmanager configuration
      ansible.builtin.template:
        src: prometheus/alertmanager.yml
        dest: /etc/alertmanager/alertmanager.yml
        owner: prometheus
        group: prometheus
        # 0640, not 550: a config file needs no execute bit, and the service's
        # group should be able to read it.
        mode: '0640'
      notify: restart alertmanager

    - name: Start and enable Alertmanager service
      ansible.builtin.systemd:
        name: prometheus-alertmanager
        state: started
        enabled: true
        daemon_reload: true

  handlers:
    - name: restart alertmanager
      ansible.builtin.systemd:
        name: prometheus-alertmanager
        state: restarted
82
ansible/prometheus/deploy.yml
Normal file
82
ansible/prometheus/deploy.yml
Normal file
@@ -0,0 +1,82 @@
---
# Installs Prometheus from apt, templates its config and alert rules, and
# enables the remote-write receiver via a systemd override.
- name: Deploy Prometheus
  hosts: ubuntu
  become: true
  tasks:
    # NOTE(review): assumes `services` is defined per host (inventory var or a
    # prior service_facts run) — confirm, otherwise this is undefined.
    - name: Check if host has prometheus service
      ansible.builtin.set_fact:
        has_prometheus_service: "{{ 'prometheus' in services }}"

    - name: Skip hosts without prometheus service
      ansible.builtin.meta: end_host
      when: not has_prometheus_service

    - name: Install Prometheus
      ansible.builtin.apt:
        name: prometheus
        state: present
        update_cache: true

    - name: Fix Prometheus directory permissions
      ansible.builtin.file:
        path: /var/lib/prometheus
        # state: directory is required — the file module rejects
        # recurse: true without it.
        state: directory
        owner: prometheus
        group: prometheus
        mode: '0750'
        recurse: true

    - name: Create textfile collector directory
      ansible.builtin.file:
        path: /var/lib/prometheus/node-exporter
        state: directory
        owner: prometheus
        group: prometheus
        mode: '0750'

    - name: Template prometheus.yml to Prospero
      ansible.builtin.template:
        src: prometheus.yml.j2
        dest: /etc/prometheus/prometheus.yml
        owner: prometheus
        group: prometheus
        mode: '0640'
      notify: restart prometheus

    - name: Template alert_rules.yml to Prospero
      ansible.builtin.template:
        src: alert_rules.yml.j2
        dest: /etc/prometheus/alert_rules.yml
        owner: prometheus
        group: prometheus
        mode: '0640'
      notify: restart prometheus

    - name: Create Prometheus systemd override directory
      ansible.builtin.file:
        path: /etc/systemd/system/prometheus.service.d
        state: directory
        mode: '0755'

    # Replaces the packaged ExecStart to add --web.enable-remote-write-receiver
    # (the empty ExecStart= line clears the original).
    - name: Enable remote write receiver
      ansible.builtin.copy:
        content: |
          [Service]
          ExecStart=
          ExecStart=/usr/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/metrics2/ --web.console.templates=/etc/prometheus/consoles --web.console.libraries=/etc/prometheus/console_libraries --web.listen-address=0.0.0.0:9090 --web.external-url= --web.enable-remote-write-receiver
        dest: /etc/systemd/system/prometheus.service.d/override.conf
        mode: '0644'
      notify: restart prometheus

    - name: Start and enable Prometheus service
      ansible.builtin.systemd:
        name: prometheus
        state: started
        enabled: true
        daemon_reload: true

  handlers:
    - name: restart prometheus
      ansible.builtin.systemd:
        name: prometheus
        state: restarted
        daemon_reload: true
14
ansible/prometheus/node_deploy.yml
Normal file
14
ansible/prometheus/node_deploy.yml
Normal file
@@ -0,0 +1,14 @@
---
# Installs the Prometheus Node Exporter on all ubuntu hosts.
- name: Deploy Prometheus Node Exporter
  hosts: ubuntu
  become: true
  tasks:
    # NOTE(review): this upgrades *every* package on the host as a side effect
    # of deploying the exporter — confirm that is intentional.
    - name: Aptitude Update
      ansible.builtin.apt:
        name: "*"
        state: latest
        update_cache: true

    - name: Install Prometheus Node Exporter
      ansible.builtin.apt:
        name: prometheus-node-exporter
        state: present
48
ansible/prometheus/prometheus.yml.j2
Normal file
48
ansible/prometheus/prometheus.yml.j2
Normal file
@@ -0,0 +1,48 @@
# Prometheus server configuration (Jinja2 template rendered by Ansible).
global:
  scrape_interval: {{ prometheus_scrape_interval }}
  evaluation_interval: {{ prometheus_evaluation_interval }}

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Quoted so the rendered host:port is always a YAML string,
            # matching the quoting used in the alertmanager scrape job below.
            - '{{ alertmanager_host }}:{{ alertmanager_port }}'

rule_files:
  - "alert_rules.yml"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Target list supplied by the inventory; to_json renders a valid YAML/JSON
  # flow sequence.
  - job_name: 'node-exporter'
    static_configs:
      - targets: {{ prometheus_targets | to_json }}

  - job_name: 'alertmanager'
    static_configs:
      - targets: ['{{ alertmanager_host }}:{{ alertmanager_port }}']

  - job_name: 'haproxy'
    static_configs:
      - targets: ['titania.incus:8404']
    metrics_path: '/metrics'

  - job_name: 'gitea'
    static_configs:
      - targets: ['oberon.incus:22084']
    metrics_path: '/metrics'
    authorization:
      type: Bearer
      credentials: '{{ vault_gitea_metrics_token }}'

  # Casdoor exposes metrics behind access-key/secret query parameters.
  - job_name: 'casdoor'
    static_configs:
      - targets: ['{{ casdoor_metrics_host }}:{{ casdoor_metrics_port }}']
    metrics_path: '/api/metrics'
    params:
      accessKey: ['{{ casdoor_prometheus_access_key }}']
      accessSecret: ['{{ casdoor_prometheus_access_secret }}']

# Red Panda Approved Prometheus Configuration
Reference in New Issue
Block a user