Files
ouranos/ansible/validate_puck_monitoring.yml
Robert Helewka b4d60f2f38 docs: rewrite README with structured overview and quick start guide
Replaces the minimal project description with a comprehensive README
including a component overview table, quick start instructions, common
Ansible operations, and links to detailed documentation. Aligns with
Red Panda Approval™ standards.
2026-03-03 12:49:06 +00:00

171 lines
6.9 KiB
YAML

---
# Red Panda Approved Validation Playbook 🐼
# Validates process and container monitoring deployment
#
# Usage: ansible-playbook validate_puck_monitoring.yml
# Play 1 (puck.incus): confirm the Alloy agent is active, that its service
# account belongs to the docker group, and report Alloy's readiness endpoint.
- name: Validate Puck Process & Container Monitoring
  hosts: puck.incus
  gather_facts: false
  tasks:
    # check_mode keeps this task read-only: the module reports the unit's
    # status (registered below) without actually starting the service.
    - name: "🐼 Check Alloy service is running"
      become: true
      ansible.builtin.systemd:
        name: alloy
        state: started
      check_mode: true
      register: alloy_status

    - name: "🐼 Verify Alloy is active"
      ansible.builtin.assert:
        that:
          - alloy_status.status.ActiveState == "active"
        fail_msg: "Alloy service is not running on puck"
        success_msg: "✅ Alloy service is active"

    # `id -nG` prints only the group names, space separated, so the
    # assertion below can test exact membership instead of a substring of
    # the full `id` output (which would also match e.g. "docker-users").
    - name: "🐼 Check Alloy can access Docker socket"
      become: true
      ansible.builtin.command: id -nG alloy
      register: alloy_groups
      changed_when: false

    - name: "🐼 Verify alloy is in docker group"
      ansible.builtin.assert:
        that:
          - "'docker' in alloy_groups.stdout.split()"
        fail_msg: "Alloy user is not in docker group - cAdvisor won't work"
        success_msg: "✅ Alloy user is in docker group"

    # NOTE(review): the "Verify Alloy is active" assert above fails whenever
    # the service is not active, so this condition is probably never true
    # when this task is reached - confirm before removing.
    - name: "🐼 Wait for metrics to be available (30s)"
      ansible.builtin.pause:
        seconds: 30
      when: alloy_status.changed | default(false)

    # Informational only: failed_when is disabled, so an unreachable or
    # not-ready endpoint is reported by the next task rather than failing.
    - name: "🐼 Check Alloy health endpoint"
      ansible.builtin.uri:
        url: "http://localhost:12345/-/ready"
        return_content: true
      register: alloy_health
      failed_when: false

    - name: "🐼 Report Alloy health"
      ansible.builtin.debug:
        msg: "Alloy health: {{ alloy_health.status }}"
# Play 2 (prospero.incus): confirm Prometheus is active, that it holds
# process and container series for puck, that the puck_process_alerts rule
# group is loaded, and that Alertmanager answers its health endpoint.
- name: Validate Prometheus on Prospero
  hosts: prospero.incus
  gather_facts: false
  tasks:
    # check_mode keeps this task read-only: the module reports the unit's
    # status (registered below) without actually starting the service.
    - name: "🐼 Check Prometheus service is running"
      become: true
      ansible.builtin.systemd:
        name: prometheus
        state: started
      check_mode: true
      register: prometheus_status

    - name: "🐼 Verify Prometheus is active"
      ansible.builtin.assert:
        that:
          - prometheus_status.status.ActiveState == "active"
        fail_msg: "Prometheus service is not running on prospero"
        success_msg: "✅ Prometheus service is active"

    # The PromQL selector contains characters that are not URL-safe
    # ({ } " * and =~), so it is percent-encoded before being placed in the
    # query string. A non-200 answer fails this task directly; previously
    # it was ignored and the verification below was skipped, letting the
    # validation pass without checking anything.
    - name: "🐼 Check Prometheus can query puck process metrics"
      vars:
        promql: 'namedprocess_namegroup_num_procs{instance=~"puck.*"}'
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/query?query={{ promql | urlencode }}"
        status_code: 200
        return_content: true
      register: process_metrics

    - name: "🐼 Verify process metrics are available"
      ansible.builtin.assert:
        that:
          - process_metrics.status == 200
          - process_metrics.json.status == "success"
          - process_metrics.json.data.result | length > 0
        fail_msg: "No process metrics found from puck - check Alloy remote_write config"
        success_msg: "✅ Process metrics are being received from puck ({{ process_metrics.json.data.result | length }} series)"

    # Same pattern as the process-metrics query above.
    - name: "🐼 Check Prometheus can query puck container metrics"
      vars:
        promql: 'container_last_seen{instance=~"puck.*"}'
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/query?query={{ promql | urlencode }}"
        status_code: 200
        return_content: true
      register: container_metrics

    # An empty result set is a failure here too: a successful query that
    # returns no series means no container metrics are arriving from puck.
    - name: "🐼 Verify container metrics are available"
      ansible.builtin.assert:
        that:
          - container_metrics.status == 200
          - container_metrics.json.status == "success"
          - container_metrics.json.data.result | length > 0
        fail_msg: "No container metrics found from puck - check cAdvisor config"
        success_msg: "✅ Container metrics are being received from puck"

    - name: "🐼 Check alert rules are loaded"
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/rules"
        return_content: true
      register: alert_rules

    - name: "🐼 Verify alert rules are present"
      ansible.builtin.assert:
        that:
          - alert_rules.status == 200
          - alert_rules.json.data.groups | length > 0
        fail_msg: "No alert rules loaded in Prometheus"
        success_msg: "✅ Alert rules are loaded ({{ alert_rules.json.data.groups | length }} groups)"

    # True when a rule group named exactly "puck_process_alerts" is present.
    - name: "🐼 Check for puck process alerts"
      ansible.builtin.set_fact:
        has_puck_alerts: "{{ alert_rules.json.data.groups | selectattr('name', 'equalto', 'puck_process_alerts') | list | length > 0 }}"

    - name: "🐼 Verify puck process alert group exists"
      ansible.builtin.assert:
        that:
          - has_puck_alerts
        fail_msg: "puck_process_alerts group not found in Prometheus rules"
        success_msg: "✅ puck_process_alerts group is loaded"

    # failed_when is disabled so the assert below can emit the friendlier
    # message; that assert now always runs, treating a missing status as
    # unhealthy instead of skipping the check.
    - name: "🐼 Check Alertmanager is reachable"
      ansible.builtin.uri:
        url: "http://localhost:9093/-/healthy"
        return_content: true
      register: alertmanager_health
      failed_when: false

    - name: "🐼 Verify Alertmanager is healthy"
      ansible.builtin.assert:
        that:
          - alertmanager_health.status | default(-1) == 200
        fail_msg: "Alertmanager is not healthy"
        success_msg: "✅ Alertmanager is healthy"
# Play 3 (localhost): prints a closing banner listing the manual follow-up
# steps. It performs no checks of its own.
- name: Summary
  gather_facts: false
  hosts: localhost
  tasks:
    # Literal block scalar: the banner is emitted line for line.
    - name: "🐼 Validation Complete"
      ansible.builtin.debug:
        msg: |
          ╔═══════════════════════════════════════════════════════════════╗
          ║ 🐼 RED PANDA MONITORING VALIDATION COMPLETE 🐼 ║
          ╠═══════════════════════════════════════════════════════════════╣
          ║ ║
          ║ Next Steps: ║
          ║ 1. Import dashboards to Grafana: ║
          ║ - ansible/grafana/dashboards/puck_processes.json ║
          ║ - ansible/grafana/dashboards/puck_containers.json ║
          ║ ║
          ║ 2. Verify alerts in Prometheus UI: ║
          ║ http://prospero.incus:9090/alerts ║
          ║ ║
          ║ 3. Test alert routing: ║
          ║ http://prospero.incus:9093/#/alerts ║
          ║ ║
          ╚═══════════════════════════════════════════════════════════════╝