feat: add Daedalus application configuration, database setup, and monitoring alerts
This commit is contained in:
@@ -244,6 +244,74 @@ groups:
|
||||
summary: "High log ingestion rate"
|
||||
description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging"
|
||||
|
||||
# ============================================================================
|
||||
# Daedalus Application Alerts
|
||||
# ============================================================================
|
||||
- name: daedalus_alerts
  # Alerting rules for the Daedalus application: availability, MCP
  # connectivity, HTTP error rate and latency, client exceptions, and S3
  # error rate. All metric names below are consumed as-is.
  rules:
    # Fires when daedalus_up has reported 0 for 1 minute.
    # NOTE(review): this only triggers while the series still exists with
    # value 0. If daedalus_up is exported by Daedalus itself, the series may
    # go absent instead when the target is unreachable -- confirm how the
    # metric is produced and consider `or absent(daedalus_up)` if so.
    - alert: DaedalusDown
      expr: daedalus_up == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Daedalus is down"
        description: "Daedalus has been unreachable for more than 1 minute."

    # Fires when the active MCP connection gauge has been 0 for 5 minutes.
    - alert: DaedalusMCPDisconnected
      expr: daedalus_mcp_connections_active == 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus has no active MCP connections"
        description: "Daedalus has reported zero active MCP connections for 5 minutes."

    # Fires when more than 5% of HTTP requests over 5 minutes are 5xx.
    # Both sides are aggregated with sum(): an un-aggregated division matches
    # series one-to-one on identical label sets (including `status`), so each
    # 5xx series would be divided by itself and the result would always be 1
    # instead of the real error ratio.
    - alert: DaedalusHighErrorRate
      expr: >-
        sum(rate(daedalus_http_requests_total{status=~"5.."}[5m]))
        / sum(rate(daedalus_http_requests_total[5m])) > 0.05
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus HTTP 5xx error rate above 5%"
        description: "Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests."

    # Fires when client exceptions exceed 10 per minute for 1 minute.
    # rate() yields a per-second value; it is multiplied by 60 so that both
    # the threshold and the "/min" figure in the description are per-minute.
    - alert: DaedalusClientExceptionSpike
      expr: rate(daedalus_client_exceptions_total[1m]) * 60 > 10
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus client exception spike"
        description: "Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf \"%.1f\" }}/min)."

    # Fires when the p95 HTTP request duration stays above 5s for 5 minutes.
    # The quantile is computed per label set of the bucket series (no
    # aggregation across labels is applied here).
    - alert: DaedalusSlowResponses
      expr: histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus p95 response time above 5s"
        description: "Daedalus p95 response latency is {{ $value | printf \"%.2f\" }}s."

    # Fires when the p95 MCP request duration stays above 30s for 5 minutes.
    - alert: DaedalusMCPLatency
      expr: histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus MCP p95 latency above 30s"
        description: "Daedalus MCP p95 latency is {{ $value | printf \"%.2f\" }}s."

    # Fires when S3 errors exceed 1% of S3 requests over 5 minutes.
    # NOTE(review): the division matches series on identical label sets;
    # confirm daedalus_s3_errors_total and daedalus_s3_requests_total expose
    # the same labels, otherwise aggregate both sides before dividing.
    - alert: DaedalusS3Errors
      expr: >-
        rate(daedalus_s3_errors_total[5m])
        / rate(daedalus_s3_requests_total[5m]) > 0.01
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus S3 error rate above 1%"
        description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
|
||||
|
||||
# Red Panda Seal of Approval 🐼
|
||||
# "If the metrics aren't red, go back to bed"
|
||||
{% endraw %}
|
||||
|
||||
Reference in New Issue
Block a user