docs: rewrite README with structured overview and quick start guide
Replaces the minimal project description with a comprehensive README including a component overview table, quick start instructions, common Ansible operations, and links to detailed documentation. Aligns with Red Panda Approval™ standards.
This commit is contained in:
170
ansible/validate_puck_monitoring.yml
Normal file
170
ansible/validate_puck_monitoring.yml
Normal file
@@ -0,0 +1,170 @@
|
||||
---
|
||||
# Red Panda Approved Validation Playbook 🐼
|
||||
# Validates process and container monitoring deployment
|
||||
#
|
||||
# Usage: ansible-playbook validate_puck_monitoring.yml
|
||||
|
||||
# Play 1: checks on puck.incus that the Alloy agent is running, that its
# service account is in the docker group, and reports the readiness endpoint.
- name: Validate Puck Process & Container Monitoring
  hosts: puck.incus
  gather_facts: false
  tasks:
    # check_mode makes this a read-only probe: the module reports the unit's
    # status without actually starting the service.
    - name: "🐼 Check Alloy service is running"
      become: true
      ansible.builtin.systemd:
        name: alloy
        state: started
      check_mode: true
      register: alloy_status

    - name: "🐼 Verify Alloy is active"
      ansible.builtin.assert:
        that:
          - alloy_status.status.ActiveState == "active"
        fail_msg: "Alloy service is not running on puck"
        success_msg: "✅ Alloy service is active"

    # `id -nG` prints only the group names, space separated, so the assert
    # below can test exact membership instead of matching a substring of the
    # full `id` output (which would also accept e.g. "docker-users").
    - name: "🐼 Check Alloy can access Docker socket"
      become: true
      ansible.builtin.command:
        cmd: id -nG alloy
      register: alloy_groups
      changed_when: false

    - name: "🐼 Verify alloy is in docker group"
      ansible.builtin.assert:
        that:
          - "'docker' in alloy_groups.stdout.split()"
        fail_msg: "Alloy user is not in docker group - cAdvisor won't work"
        success_msg: "✅ Alloy user is in docker group"

    # Pauses only when the check-mode probe above reported a pending change.
    # NOTE(review): a pending change means Alloy was not running, in which
    # case the "Verify Alloy is active" assert has presumably already failed,
    # so this task looks unreachable in practice - confirm and consider
    # removing it.
    - name: "🐼 Wait for metrics to be available (30s)"
      ansible.builtin.pause:
        seconds: 30
      when: alloy_status.changed | default(false)

    # failed_when: false keeps an unreachable endpoint from aborting the play;
    # the HTTP status is only reported by the next task, not asserted.
    - name: "🐼 Check Alloy health endpoint"
      ansible.builtin.uri:
        url: "http://localhost:12345/-/ready"
        return_content: true
      register: alloy_health
      failed_when: false

    - name: "🐼 Report Alloy health"
      ansible.builtin.debug:
        msg: "Alloy health: {{ alloy_health.status }}"
|
||||
|
||||
# Play 2: checks on prospero.incus that Prometheus is running, that it holds
# process and container series for puck, that the expected alert rule group is
# loaded, and that Alertmanager answers its health endpoint.
- name: Validate Prometheus on Prospero
  hosts: prospero.incus
  gather_facts: false
  tasks:
    # check_mode makes this a read-only probe: the module reports the unit's
    # status without actually starting the service.
    - name: "🐼 Check Prometheus service is running"
      become: true
      ansible.builtin.systemd:
        name: prometheus
        state: started
      check_mode: true
      register: prometheus_status

    - name: "🐼 Verify Prometheus is active"
      ansible.builtin.assert:
        that:
          - prometheus_status.status.ActiveState == "active"
        fail_msg: "Prometheus service is not running on prospero"
        success_msg: "✅ Prometheus service is active"

    # The PromQL selector contains characters that are not URL-safe
    # ({, }, ", =, *), so it is percent-encoded before being put in the
    # query string.
    - name: "🐼 Check Prometheus can query puck process metrics"
      vars:
        promql: 'namedprocess_namegroup_num_procs{instance=~"puck.*"}'
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/query?query={{ promql | urlencode }}"
        return_content: true
      register: process_metrics
      failed_when: false

    # No `when` guard here: the query task never fails (failed_when: false),
    # so skipping this assert on a non-200 response would let a broken
    # Prometheus pass validation. The status check is listed first so the
    # JSON body is only inspected after a successful response.
    - name: "🐼 Verify process metrics are available"
      ansible.builtin.assert:
        that:
          - process_metrics.status == 200
          - process_metrics.json.status == "success"
          - process_metrics.json.data.result | length > 0
        fail_msg: "No process metrics found from puck - check Alloy remote_write config"
        success_msg: "✅ Process metrics are being received from puck ({{ process_metrics.json.data.result | length }} series)"

    - name: "🐼 Check Prometheus can query puck container metrics"
      vars:
        promql: 'container_last_seen{instance=~"puck.*"}'
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/query?query={{ promql | urlencode }}"
        return_content: true
      register: container_metrics
      failed_when: false

    # Same shape as the process-metrics assert: always runs, and requires at
    # least one returned series so an empty result is reported as a failure.
    - name: "🐼 Verify container metrics are available"
      ansible.builtin.assert:
        that:
          - container_metrics.status == 200
          - container_metrics.json.status == "success"
          - container_metrics.json.data.result | length > 0
        fail_msg: "No container metrics found from puck - check cAdvisor config"
        success_msg: "✅ Container metrics are being received from puck"

    # No failed_when override here: a failed request aborts the play.
    - name: "🐼 Check alert rules are loaded"
      ansible.builtin.uri:
        url: "http://localhost:9090/api/v1/rules"
        return_content: true
      register: alert_rules

    - name: "🐼 Verify alert rules are present"
      ansible.builtin.assert:
        that:
          - alert_rules.status == 200
          - alert_rules.json.data.groups | length > 0
        fail_msg: "No alert rules loaded in Prometheus"
        success_msg: "✅ Alert rules are loaded ({{ alert_rules.json.data.groups | length }} groups)"

    # True when a rule group named exactly "puck_process_alerts" is loaded.
    - name: "🐼 Check for puck process alerts"
      ansible.builtin.set_fact:
        has_puck_alerts: >-
          {{ alert_rules.json.data.groups
          | selectattr('name', 'equalto', 'puck_process_alerts')
          | list | length > 0 }}

    - name: "🐼 Verify puck process alert group exists"
      ansible.builtin.assert:
        that:
          - has_puck_alerts | bool
        fail_msg: "puck_process_alerts group not found in Prometheus rules"
        success_msg: "✅ puck_process_alerts group is loaded"

    - name: "🐼 Check Alertmanager is reachable"
      ansible.builtin.uri:
        url: "http://localhost:9093/-/healthy"
        return_content: true
      register: alertmanager_health
      failed_when: false

    # default(-1) makes a missing status count as unhealthy instead of
    # skipping the check.
    - name: "🐼 Verify Alertmanager is healthy"
      ansible.builtin.assert:
        that:
          - alertmanager_health.status | default(-1) == 200
        fail_msg: "Alertmanager is not healthy"
        success_msg: "✅ Alertmanager is healthy"
|
||||
|
||||
# Play 3: prints a closing banner with manual follow-up steps. It runs on the
# control node and performs no checks of its own.
- name: Summary
  gather_facts: false
  hosts: localhost
  tasks:
    # The literal block scalar keeps the banner's line breaks, including the
    # leading blank line.
    - name: "🐼 Validation Complete"
      ansible.builtin.debug:
        msg: |

          ╔═══════════════════════════════════════════════════════════════╗
          ║ 🐼 RED PANDA MONITORING VALIDATION COMPLETE 🐼 ║
          ╠═══════════════════════════════════════════════════════════════╣
          ║ ║
          ║ Next Steps: ║
          ║ 1. Import dashboards to Grafana: ║
          ║ - ansible/grafana/dashboards/puck_processes.json ║
          ║ - ansible/grafana/dashboards/puck_containers.json ║
          ║ ║
          ║ 2. Verify alerts in Prometheus UI: ║
          ║ http://prospero.incus:9090/alerts ║
          ║ ║
          ║ 3. Test alert routing: ║
          ║ http://prospero.incus:9093/#/alerts ║
          ║ ║
          ╚═══════════════════════════════════════════════════════════════╝
|
||||
Reference in New Issue
Block a user