Replaces the minimal project description with a comprehensive README including a component overview table, quick start instructions, common Ansible operations, and links to detailed documentation. Aligns with Red Panda Approval™ standards.
171 lines
6.9 KiB
YAML
171 lines
6.9 KiB
YAML
---
# Red Panda Approved Validation Playbook 🐼
# Validates process and container monitoring deployment
#
# Usage: ansible-playbook validate_puck_monitoring.yml

# Play 1 (puck.incus): confirms the Alloy agent is running, that the alloy
# account belongs to the docker group, and reports Alloy's readiness probe.
- name: Validate Puck Process & Container Monitoring
  hosts: puck.incus
  gather_facts: false
  tasks:
    # check_mode turns this into a read-only probe: the unit's current status
    # is registered for the assert below, but the service is never started.
    - name: "🐼 Check Alloy service is running"
      become: true
      ansible.builtin.systemd:
        name: alloy
        state: started
      check_mode: true
      register: alloy_status

    - name: "🐼 Verify Alloy is active"
      ansible.builtin.assert:
        that:
          - alloy_status.status.ActiveState == "active"
        fail_msg: "Alloy service is not running on puck"
        success_msg: "✅ Alloy service is active"

    # `id -nG` prints only the group names, separated by spaces, so the
    # assert below can test for an exact group name instead of a substring
    # of the full `id` output (which would also match e.g. "docker-rootless").
    - name: "🐼 Check Alloy can access Docker socket"
      become: true
      ansible.builtin.command:
        cmd: id -nG alloy
      register: alloy_groups
      changed_when: false

    - name: "🐼 Verify alloy is in docker group"
      ansible.builtin.assert:
        that:
          - "'docker' in alloy_groups.stdout.split()"
        fail_msg: "Alloy user is not in docker group - cAdvisor won't work"
        success_msg: "✅ Alloy user is in docker group"

    # NOTE(review): the service probe above runs in check mode, so `changed`
    # can only be true when the unit was found stopped - and in that case
    # "Verify Alloy is active" has already failed. This wait looks
    # unreachable; confirm whether it is still wanted.
    - name: "🐼 Wait for metrics to be available (30s)"
      ansible.builtin.pause:
        seconds: 30
      when: alloy_status.changed | default(false)

    # Best-effort probe: failed_when: false keeps a failing request from
    # aborting the play; the status is only reported, never asserted.
    - name: "🐼 Check Alloy health endpoint"
      ansible.builtin.uri:
        url: "http://localhost:12345/-/ready"
        return_content: true
      register: alloy_health
      failed_when: false

    - name: "🐼 Report Alloy health"
      ansible.builtin.debug:
        msg: "{{ 'Alloy health: ' + alloy_health.status | string }}"

# Play 2 (prospero.incus): confirms Prometheus is running, that it holds
# process and container series for puck, that the puck_process_alerts rule
# group is loaded, and that Alertmanager answers its health probe.
- name: Validate Prometheus on Prospero
  hosts: prospero.incus
  gather_facts: false
  tasks:
    # check_mode turns this into a read-only probe: the unit's current status
    # is registered for the assert below, but the service is never started.
    - name: "🐼 Check Prometheus service is running"
      become: true
      ansible.builtin.systemd:
        name: prometheus
        state: started
      check_mode: true
      register: prometheus_status

    - name: "🐼 Verify Prometheus is active"
      ansible.builtin.assert:
        that:
          - prometheus_status.status.ActiveState == "active"
        fail_msg: "Prometheus service is not running on prospero"
        success_msg: "✅ Prometheus service is active"

    # The PromQL selector contains characters that are not valid in a raw
    # URL ({ } " =), so it is percent-encoded before being sent.
    # failed_when: false defers the pass/fail decision to the assert below.
    - name: "🐼 Check Prometheus can query puck process metrics"
      vars:
        process_metrics_query: 'namedprocess_namegroup_num_procs{instance=~"puck.*"}'
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/query?query={{ process_metrics_query | urlencode }}"
        return_content: true
      register: process_metrics
      failed_when: false

    # No `when` guard here: a non-200 response must fail validation rather
    # than skip it. Conditions are checked in order, so the JSON body is only
    # inspected once the status check has passed. success_msg is templated
    # even when the assert fails, hence the default([]).
    - name: "🐼 Verify process metrics are available"
      ansible.builtin.assert:
        that:
          - process_metrics.status == 200
          - process_metrics.json.status == "success"
          - process_metrics.json.data.result | length > 0
        fail_msg: "No process metrics found from puck - check Alloy remote_write config"
        success_msg: "✅ Process metrics are being received from puck ({{ process_metrics.json.data.result | default([]) | length }} series)"

    # Same pattern as the process-metrics query above.
    - name: "🐼 Check Prometheus can query puck container metrics"
      vars:
        container_metrics_query: 'container_last_seen{instance=~"puck.*"}'
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/query?query={{ container_metrics_query | urlencode }}"
        return_content: true
      register: container_metrics
      failed_when: false

    # Mirrors the process-metrics assert: fails on a non-200 response and
    # also requires at least one series, matching what fail_msg reports.
    - name: "🐼 Verify container metrics are available"
      ansible.builtin.assert:
        that:
          - container_metrics.status == 200
          - container_metrics.json.status == "success"
          - container_metrics.json.data.result | length > 0
        fail_msg: "No container metrics found from puck - check cAdvisor config"
        success_msg: "✅ Container metrics are being received from puck"

    # No failed_when override: an unreachable rules API fails the play here.
    - name: "🐼 Check alert rules are loaded"
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/rules"
        return_content: true
      register: alert_rules

    - name: "🐼 Verify alert rules are present"
      ansible.builtin.assert:
        that:
          - alert_rules.status == 200
          - alert_rules.json.data.groups | length > 0
        fail_msg: "No alert rules loaded in Prometheus"
        success_msg: "✅ Alert rules are loaded ({{ alert_rules.json.data.groups | length }} groups)"

    # True when a rule group named exactly "puck_process_alerts" is loaded.
    - name: "🐼 Check for puck process alerts"
      ansible.builtin.set_fact:
        has_puck_alerts: "{{ alert_rules.json.data.groups | selectattr('name', 'equalto', 'puck_process_alerts') | list | length > 0 }}"

    - name: "🐼 Verify puck process alert group exists"
      ansible.builtin.assert:
        that:
          - has_puck_alerts
        fail_msg: "puck_process_alerts group not found in Prometheus rules"
        success_msg: "✅ puck_process_alerts group is loaded"

    # failed_when: false defers the pass/fail decision to the assert below.
    - name: "🐼 Check Alertmanager is reachable"
      ansible.builtin.uri:
        url: "http://localhost:9093/-/healthy"
        return_content: true
      register: alertmanager_health
      failed_when: false

    # Runs unconditionally so an unhealthy or unreachable Alertmanager can
    # never be skipped silently.
    - name: "🐼 Verify Alertmanager is healthy"
      ansible.builtin.assert:
        that:
          - alertmanager_health.status == 200
        fail_msg: "Alertmanager is not healthy"
        success_msg: "✅ Alertmanager is healthy"

# Play 3 (localhost): prints a closing banner listing the manual follow-up
# steps (Grafana dashboard import, Prometheus alerts page, Alertmanager UI).
- name: Summary
  hosts: localhost
  gather_facts: false
  tasks:
    # NOTE(review): the box rows below carry single spaces where the frame
    # suggests column padding once existed - confirm against the rendered
    # output before relying on the alignment.
    - name: '🐼 Validation Complete'
      ansible.builtin.debug:
        msg: |

          ╔═══════════════════════════════════════════════════════════════╗
          ║ 🐼 RED PANDA MONITORING VALIDATION COMPLETE 🐼 ║
          ╠═══════════════════════════════════════════════════════════════╣
          ║ ║
          ║ Next Steps: ║
          ║ 1. Import dashboards to Grafana: ║
          ║ - ansible/grafana/dashboards/puck_processes.json ║
          ║ - ansible/grafana/dashboards/puck_containers.json ║
          ║ ║
          ║ 2. Verify alerts in Prometheus UI: ║
          ║ http://prospero.incus:9090/alerts ║
          ║ ║
          ║ 3. Test alert routing: ║
          ║ http://prospero.incus:9093/#/alerts ║
          ║ ║
          ╚═══════════════════════════════════════════════════════════════╝