---
# Red Panda Approved Validation Playbook 🐼
# Validates process and container monitoring deployment
#
# Usage: ansible-playbook validate_puck_monitoring.yml

# Play 1: verify the Grafana Alloy agent on puck is running, has Docker
# socket access (required for cAdvisor container metrics), and reports ready.
- name: Validate Puck Process & Container Monitoring
  hosts: puck.incus
  gather_facts: false
  tasks:
    - name: "🐼 Check Alloy service is running"
      become: true
      ansible.builtin.systemd:
        name: alloy
        state: started
      # check_mode ensures we only observe service state, never start it.
      check_mode: true
      register: alloy_status

    - name: "🐼 Verify Alloy is active"
      ansible.builtin.assert:
        that:
          - alloy_status.status.ActiveState == "active"
        fail_msg: "Alloy service is not running on puck"
        success_msg: "✅ Alloy service is active"

    - name: "🐼 Check Alloy can access Docker socket"
      become: true
      ansible.builtin.command: id alloy
      register: alloy_groups
      # `id` is read-only; never report a change for it.
      changed_when: false

    - name: "🐼 Verify alloy is in docker group"
      ansible.builtin.assert:
        that:
          - "'docker' in alloy_groups.stdout"
        fail_msg: "Alloy user is not in docker group - cAdvisor won't work"
        success_msg: "✅ Alloy user is in docker group"

    # Only pause when the check-mode systemd task reported it WOULD have
    # changed state (i.e. the service was not already running).
    - name: "🐼 Wait for metrics to be available (30s)"
      ansible.builtin.pause:
        seconds: 30
      when: alloy_status.changed | default(false)

    - name: "🐼 Check Alloy health endpoint"
      ansible.builtin.uri:
        url: "http://localhost:12345/-/ready"
        return_content: true
      register: alloy_health
      # Best-effort probe: record the status, report it in the next task.
      failed_when: false

    - name: "🐼 Report Alloy health"
      ansible.builtin.debug:
        msg: "{{ 'Alloy health: ' + alloy_health.status | string }}"

# Play 2: verify Prometheus on prospero is up, receiving puck's process and
# container metrics via remote_write, and has the expected alert rules loaded.
- name: Validate Prometheus on Prospero
  hosts: prospero.incus
  gather_facts: false
  tasks:
    - name: "🐼 Check Prometheus service is running"
      become: true
      ansible.builtin.systemd:
        name: prometheus
        state: started
      # Observe-only, as with the Alloy check above.
      check_mode: true
      register: prometheus_status

    - name: "🐼 Verify Prometheus is active"
      ansible.builtin.assert:
        that:
          - prometheus_status.status.ActiveState == "active"
        fail_msg: "Prometheus service is not running on prospero"
        success_msg: "✅ Prometheus service is active"

    - name: "🐼 Check Prometheus can query puck process metrics"
      ansible.builtin.uri:
        # process-exporter series scoped to puck instances.
        url: 'http://localhost:9090/api/v1/query?query=namedprocess_namegroup_num_procs{instance=~"puck.*"}'
        return_content: true
      register: process_metrics
      failed_when: false

    # NOTE(review): the `when` guard skips this assert entirely if the query
    # did not return 200 (e.g. Prometheus unreachable → status -1), so a
    # connection failure passes silently — confirm this soft-pass is intended.
    - name: "🐼 Verify process metrics are available"
      ansible.builtin.assert:
        that:
          - process_metrics.status == 200
          - process_metrics.json.status == "success"
          - process_metrics.json.data.result | length > 0
        fail_msg: "No process metrics found from puck - check Alloy remote_write config"
        success_msg: "✅ Process metrics are being received from puck ({{ process_metrics.json.data.result | length }} series)"
      when: process_metrics.status == 200

    - name: "🐼 Check Prometheus can query puck container metrics"
      ansible.builtin.uri:
        # cAdvisor series scoped to puck instances.
        url: 'http://localhost:9090/api/v1/query?query=container_last_seen{instance=~"puck.*"}'
        return_content: true
      register: container_metrics
      failed_when: false

    # NOTE(review): same soft-pass pattern as the process-metrics assert.
    - name: "🐼 Verify container metrics are available"
      ansible.builtin.assert:
        that:
          - container_metrics.status == 200
          - container_metrics.json.status == "success"
        fail_msg: "No container metrics found from puck - check cAdvisor config"
        success_msg: "✅ Container metrics are being received from puck"
      when: container_metrics.status == 200

    # No failed_when here: an unreachable rules endpoint should fail the play.
    - name: "🐼 Check alert rules are loaded"
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/rules"
        return_content: true
      register: alert_rules

    - name: "🐼 Verify alert rules are present"
      ansible.builtin.assert:
        that:
          - alert_rules.status == 200
          - alert_rules.json.data.groups | length > 0
        fail_msg: "No alert rules loaded in Prometheus"
        success_msg: "✅ Alert rules are loaded ({{ alert_rules.json.data.groups | length }} groups)"

    - name: "🐼 Check for puck process alerts"
      ansible.builtin.set_fact:
        has_puck_alerts: "{{ alert_rules.json.data.groups | selectattr('name', 'equalto', 'puck_process_alerts') | list | length > 0 }}"

    - name: "🐼 Verify puck process alert group exists"
      ansible.builtin.assert:
        that:
          - has_puck_alerts
        fail_msg: "puck_process_alerts group not found in Prometheus rules"
        success_msg: "✅ puck_process_alerts group is loaded"

    - name: "🐼 Check Alertmanager is reachable"
      ansible.builtin.uri:
        url: "http://localhost:9093/-/healthy"
        return_content: true
      register: alertmanager_health
      failed_when: false

    # NOTE(review): `status is defined` is effectively always true after a
    # uri task with failed_when: false (status is -1 on connection failure),
    # so this guard never skips — confirm whether `== 200` was intended.
    - name: "🐼 Verify Alertmanager is healthy"
      ansible.builtin.assert:
        that:
          - alertmanager_health.status == 200
        fail_msg: "Alertmanager is not healthy"
        success_msg: "✅ Alertmanager is healthy"
      when: alertmanager_health.status is defined

# Play 3: purely informational summary banner with follow-up steps.
- name: Summary
  hosts: localhost
  gather_facts: false
  tasks:
    - name: "🐼 Validation Complete"
      ansible.builtin.debug:
        msg: |
          ╔═══════════════════════════════════════════════════════════════╗
          ║     🐼 RED PANDA MONITORING VALIDATION COMPLETE 🐼            ║
          ╠═══════════════════════════════════════════════════════════════╣
          ║                                                               ║
          ║  Next Steps:                                                  ║
          ║  1. Import dashboards to Grafana:                             ║
          ║     - ansible/grafana/dashboards/puck_processes.json          ║
          ║     - ansible/grafana/dashboards/puck_containers.json         ║
          ║                                                               ║
          ║  2. Verify alerts in Prometheus UI:                           ║
          ║     http://prospero.incus:9090/alerts                         ║
          ║                                                               ║
          ║  3. Test alert routing:                                       ║
          ║     http://prospero.incus:9093/#/alerts                       ║
          ║                                                               ║
          ╚═══════════════════════════════════════════════════════════════╝