feat: add Daedalus application configuration, database setup, and monitoring alerts
This commit is contained in:
@@ -111,6 +111,20 @@ loki.source.syslog "jupyterlab_logs" {
|
||||
forward_to = [loki.write.default.receiver]
|
||||
}
|
||||
|
||||
// Syslog receiver for Daedalus: listens on 127.0.0.1 (TCP) at the templated
// daedalus_syslog_port, attaches job/hostname/environment labels to each
// entry, and forwards everything to the loki.write "default" component.
loki.source.syslog "daedalus_logs" {
|
||||
listener {
|
||||
address = "127.0.0.1:{{daedalus_syslog_port}}"
|
||||
protocol = "tcp"
|
||||
syslog_format = "{{ syslog_format }}"
|
||||
labels = {
|
||||
job = "daedalus",
|
||||
hostname = "{{inventory_hostname}}",
|
||||
environment = "{{deployment_environment}}",
|
||||
}
|
||||
}
|
||||
forward_to = [loki.write.default.receiver]
|
||||
}
|
||||
|
||||
loki.write "default" {
|
||||
endpoint {
|
||||
url = "{{loki_url}}"
|
||||
|
||||
@@ -240,6 +240,41 @@
|
||||
"expireInHours": 168,
|
||||
"formCss": "<style>.login-panel{background-color:#ffffff;border-radius:10px;box-shadow:0 0 30px 20px rgba(255,164,21,0.12)}.ant-btn-primary{background-color:#4b96ff!important;border-color:#4b96ff!important}.ant-btn-primary:hover{background-color:#58c0ff!important;border-color:#58c0ff!important}a{color:#ffa415}a:hover{color:#ffc219}</style>",
|
||||
"footerHtml": "<div style=\"text-align:center;padding:10px;color:#666;\"><a href=\"https://helu.ca\" style=\"color:#4b96ff;text-decoration:none;\">Powered by Helu.ca</a></div>"
|
||||
},
|
||||
{
|
||||
"owner": "admin",
|
||||
"name": "daedalus",
|
||||
"displayName": "Daedalus",
|
||||
"logo": "https://helu.ca/media/images/helu-ca_logo.original.svg",
|
||||
"homepageUrl": "https://daedalus.ouranos.helu.ca",
|
||||
"organization": "heluca",
|
||||
"cert": "cert-heluca",
|
||||
"enablePassword": true,
|
||||
"enableSignUp": false,
|
||||
"clientId": "{{ vault_daedalus_oauth_client_id }}",
|
||||
"clientSecret": "{{ vault_daedalus_oauth_client_secret }}",
|
||||
"providers": [],
|
||||
"signinMethods": [
|
||||
{"name": "Password", "displayName": "Password", "rule": "All"}
|
||||
],
|
||||
"signupItems": [
|
||||
{"name": "ID", "visible": false, "required": true, "prompted": false, "rule": "Random"},
|
||||
{"name": "Email", "visible": true, "required": true, "prompted": false, "rule": "None"},
|
||||
{"name": "Display name", "visible": true, "required": true, "prompted": false, "rule": "None"},
|
||||
{"name": "Password", "visible": true, "required": true, "prompted": false, "rule": "None"},
|
||||
{"name": "Confirm password", "visible": true, "required": true, "prompted": false, "rule": "None"}
|
||||
],
|
||||
"grantTypes": [
|
||||
"authorization_code",
|
||||
"refresh_token"
|
||||
],
|
||||
"redirectUris": [
|
||||
"https://daedalus.ouranos.helu.ca/oauth/callback"
|
||||
],
|
||||
"tokenFormat": "JWT",
|
||||
"expireInHours": 168,
|
||||
"formCss": "<style>.login-panel{background-color:#ffffff;border-radius:10px;box-shadow:0 0 30px 20px rgba(255,164,21,0.12)}.ant-btn-primary{background-color:#4b96ff!important;border-color:#4b96ff!important}.ant-btn-primary:hover{background-color:#58c0ff!important;border-color:#58c0ff!important}a{color:#ffa415}a:hover{color:#ffc219}</style>",
|
||||
"footerHtml": "<div style=\"text-align:center;padding:10px;color:#666;\"><a href=\"https://helu.ca\" style=\"color:#4b96ff;text-decoration:none;\">Powered by Helu.ca</a></div>"
|
||||
}
|
||||
],
|
||||
"users": [
|
||||
|
||||
@@ -46,6 +46,9 @@ nike_db_password: "{{ vault_nike_db_password }}"
|
||||
periplus_db_name: periplus
|
||||
periplus_db_user: periplus
|
||||
periplus_db_password: "{{ vault_periplus_db_password }}"
|
||||
daedalus_db_name: daedalus
|
||||
daedalus_db_user: daedalus
|
||||
daedalus_db_password: "{{ vault_daedalus_db_password }}"
|
||||
|
||||
# PostgreSQL admin password
|
||||
postgres_password: "{{ vault_postgres_password }}"
|
||||
|
||||
@@ -20,6 +20,7 @@ kairos_syslog_port: 51451
|
||||
icarlos_syslog_port: 51461
|
||||
spelunker_syslog_port: 51481
|
||||
jupyterlab_syslog_port: 51491
|
||||
daedalus_syslog_port: 51501
|
||||
|
||||
# =============================================================================
|
||||
# JupyterLab Configuration
|
||||
|
||||
@@ -119,6 +119,11 @@ haproxy_backends:
|
||||
backend_host: "rosalind.incus"
|
||||
backend_port: 22082
|
||||
health_path: "/api/healthz"
|
||||
|
||||
# haproxy_backends entry for Daedalus: subdomain "daedalus", backend
# puck.incus:23081, health path /api/health.
# NOTE(review): timeout_server is set to 120s here, presumably to allow
# long-running requests — confirm the intended value with the app owner.
- subdomain: "daedalus"
|
||||
backend_host: "puck.incus"
|
||||
backend_port: 23081
|
||||
health_path: "/api/health"
|
||||
timeout_server: 120s
|
||||
|
||||
- subdomain: "lobechat"
|
||||
|
||||
@@ -202,6 +202,7 @@
|
||||
- { user: "{{ hass_db_user }}", password: "{{ hass_db_password }}" }
|
||||
- { user: "{{ nike_db_user }}", password: "{{ nike_db_password }}" }
|
||||
- { user: "{{ periplus_db_user }}", password: "{{ periplus_db_password }}" }
|
||||
- { user: "{{ daedalus_db_user }}", password: "{{ daedalus_db_password }}" }
|
||||
no_log: true
|
||||
|
||||
- name: Create application databases with owners
|
||||
@@ -224,6 +225,7 @@
|
||||
- { name: "{{ hass_db_name }}", owner: "{{ hass_db_user }}" }
|
||||
- { name: "{{ nike_db_name }}", owner: "{{ nike_db_user }}" }
|
||||
- { name: "{{ periplus_db_name }}", owner: "{{ periplus_db_user }}" }
|
||||
- { name: "{{ daedalus_db_name }}", owner: "{{ daedalus_db_user }}" }
|
||||
|
||||
- name: Enable postgis and pg_trgm extensions in periplus database
|
||||
community.postgresql.postgresql_ext:
|
||||
@@ -251,6 +253,7 @@
|
||||
- "{{ openwebui_db_name }}"
|
||||
- "{{ spelunker_db_name }}"
|
||||
- "{{ anythingllm_db_name }}"
|
||||
- "{{ daedalus_db_name }}"
|
||||
|
||||
handlers:
|
||||
- name: restart postgresql
|
||||
|
||||
@@ -244,6 +244,74 @@ groups:
|
||||
summary: "High log ingestion rate"
|
||||
description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging"
|
||||
|
||||
# ============================================================================
|
||||
# Daedalus Application Alerts
|
||||
# ============================================================================
|
||||
- name: daedalus_alerts
  rules:
    # Target reachability. Uses Prometheus' synthetic `up` series for the
    # `daedalus` scrape job: an application-exported gauge disappears when the
    # process is unreachable, so comparing it to 0 would never fire for the
    # outage this alert describes.
    - alert: DaedalusDown
      expr: up{job="daedalus"} == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Daedalus is down"
        description: "Daedalus has been unreachable for more than 1 minute."

    # Fires when the active-MCP-connections gauge has read zero for 5 minutes.
    - alert: DaedalusMCPDisconnected
      expr: daedalus_mcp_connections_active == 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus has no active MCP connections"
        description: "Daedalus has reported zero active MCP connections for 5 minutes."

    # Share of requests answered with a 5xx status over the last 5 minutes.
    # Both sides are aggregated with sum(): dividing the raw vectors matches
    # series one-to-one on identical labels, which keeps only the 5xx series
    # and yields a ratio of 1 whenever any 5xx traffic exists.
    - alert: DaedalusHighErrorRate
      expr: sum(rate(daedalus_http_requests_total{status=~"5.."}[5m])) / sum(rate(daedalus_http_requests_total[5m])) > 0.05
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus HTTP 5xx error rate above 5%"
        description: "Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests."

    # Client exceptions counted over the last minute. increase() returns the
    # growth across the 1m window, so the threshold and the "/min" unit in the
    # description agree (rate() would be per second).
    - alert: DaedalusClientExceptionSpike
      expr: increase(daedalus_client_exceptions_total[1m]) > 10
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus client exception spike"
        description: "Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf \"%.1f\" }}/min)."

    # p95 HTTP latency above 5 seconds, computed from the request-duration histogram.
    # NOTE(review): the quantile is evaluated per label combination of the
    # bucket series; confirm whether an aggregate (sum by (le)) is wanted.
    - alert: DaedalusSlowResponses
      expr: histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus p95 response time above 5s"
        description: "Daedalus p95 response latency is {{ $value | printf \"%.2f\" }}s."

    # p95 MCP request latency above 30 seconds, computed from the MCP duration histogram.
    - alert: DaedalusMCPLatency
      expr: histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus MCP p95 latency above 30s"
        description: "Daedalus MCP p95 latency is {{ $value | printf \"%.2f\" }}s."

    # S3 errors as a fraction of S3 requests over the last 5 minutes.
    # NOTE(review): this division only produces results if both counters carry
    # identical label sets — verify against the exporter, or aggregate with sum().
    - alert: DaedalusS3Errors
      expr: rate(daedalus_s3_errors_total[5m]) / rate(daedalus_s3_requests_total[5m]) > 0.01
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus S3 error rate above 1%"
        description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
|
||||
|
||||
# Red Panda Seal of Approval 🐼
|
||||
# "If the metrics aren't red, go back to bed"
|
||||
{% endraw %}
|
||||
|
||||
@@ -45,4 +45,10 @@ scrape_configs:
|
||||
accessKey: ['{{ casdoor_prometheus_access_key }}']
|
||||
accessSecret: ['{{ casdoor_prometheus_access_secret }}']
|
||||
|
||||
# Scrape job for Daedalus: pulls /metrics from puck.incus:22181 every 15s.
# NOTE(review): the HAProxy backend for daedalus points at port 23081 —
# confirm 22181 is the intended metrics port and not a typo.
- job_name: 'daedalus'
|
||||
static_configs:
|
||||
- targets: ['puck.incus:22181']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 15s
|
||||
|
||||
# Red Panda Approved Prometheus Configuration
|
||||
|
||||
@@ -244,6 +244,74 @@ groups:
|
||||
summary: "High log ingestion rate"
|
||||
description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging"
|
||||
|
||||
# ============================================================================
|
||||
# Daedalus Application Alerts
|
||||
# ============================================================================
|
||||
- name: daedalus_alerts
  rules:
    # Target reachability. Uses Prometheus' synthetic `up` series for the
    # `daedalus` scrape job: an application-exported gauge disappears when the
    # process is unreachable, so comparing it to 0 would never fire for the
    # outage this alert describes.
    - alert: DaedalusDown
      expr: up{job="daedalus"} == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Daedalus is down"
        description: "Daedalus has been unreachable for more than 1 minute."

    # Fires when the active-MCP-connections gauge has read zero for 5 minutes.
    - alert: DaedalusMCPDisconnected
      expr: daedalus_mcp_connections_active == 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus has no active MCP connections"
        description: "Daedalus has reported zero active MCP connections for 5 minutes."

    # Share of requests answered with a 5xx status over the last 5 minutes.
    # Both sides are aggregated with sum(): dividing the raw vectors matches
    # series one-to-one on identical labels, which keeps only the 5xx series
    # and yields a ratio of 1 whenever any 5xx traffic exists.
    - alert: DaedalusHighErrorRate
      expr: sum(rate(daedalus_http_requests_total{status=~"5.."}[5m])) / sum(rate(daedalus_http_requests_total[5m])) > 0.05
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus HTTP 5xx error rate above 5%"
        description: "Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests."

    # Client exceptions counted over the last minute. increase() returns the
    # growth across the 1m window, so the threshold and the "/min" unit in the
    # description agree (rate() would be per second).
    - alert: DaedalusClientExceptionSpike
      expr: increase(daedalus_client_exceptions_total[1m]) > 10
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus client exception spike"
        description: "Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf \"%.1f\" }}/min)."

    # p95 HTTP latency above 5 seconds, computed from the request-duration histogram.
    # NOTE(review): the quantile is evaluated per label combination of the
    # bucket series; confirm whether an aggregate (sum by (le)) is wanted.
    - alert: DaedalusSlowResponses
      expr: histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus p95 response time above 5s"
        description: "Daedalus p95 response latency is {{ $value | printf \"%.2f\" }}s."

    # p95 MCP request latency above 30 seconds, computed from the MCP duration histogram.
    - alert: DaedalusMCPLatency
      expr: histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus MCP p95 latency above 30s"
        description: "Daedalus MCP p95 latency is {{ $value | printf \"%.2f\" }}s."

    # S3 errors as a fraction of S3 requests over the last 5 minutes.
    # NOTE(review): this division only produces results if both counters carry
    # identical label sets — verify against the exporter, or aggregate with sum().
    - alert: DaedalusS3Errors
      expr: rate(daedalus_s3_errors_total[5m]) / rate(daedalus_s3_requests_total[5m]) > 0.01
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Daedalus S3 error rate above 1%"
        description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
|
||||
|
||||
# Red Panda Seal of Approval 🐼
|
||||
# "If the metrics aren't red, go back to bed"
|
||||
{% endraw %}
|
||||
|
||||
@@ -45,4 +45,10 @@ scrape_configs:
|
||||
accessKey: ['{{ casdoor_prometheus_access_key }}']
|
||||
accessSecret: ['{{ casdoor_prometheus_access_secret }}']
|
||||
|
||||
# Scrape job for Daedalus: pulls /metrics from puck.incus:22181 every 15s.
# NOTE(review): the HAProxy backend for daedalus points at port 23081 —
# confirm 22181 is the intended metrics port and not a typo.
- job_name: 'daedalus'
|
||||
static_configs:
|
||||
- targets: ['puck.incus:22181']
|
||||
metrics_path: '/metrics'
|
||||
scrape_interval: 15s
|
||||
|
||||
# Red Panda Approved Prometheus Configuration
|
||||
|
||||
Reference in New Issue
Block a user