diff --git a/ansible/alloy/puck/config.alloy.j2 b/ansible/alloy/puck/config.alloy.j2 index 2960dcd..3c9aa1a 100644 --- a/ansible/alloy/puck/config.alloy.j2 +++ b/ansible/alloy/puck/config.alloy.j2 @@ -111,6 +111,20 @@ loki.source.syslog "jupyterlab_logs" { forward_to = [loki.write.default.receiver] } +loki.source.syslog "daedalus_logs" { + listener { + address = "127.0.0.1:{{daedalus_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "daedalus", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + loki.write "default" { endpoint { url = "{{loki_url}}" diff --git a/ansible/casdoor/init_data.json.j2 b/ansible/casdoor/init_data.json.j2 index e774666..2682f23 100644 --- a/ansible/casdoor/init_data.json.j2 +++ b/ansible/casdoor/init_data.json.j2 @@ -240,6 +240,41 @@ "expireInHours": 168, "formCss": "", "footerHtml": "
" + }, + { + "owner": "admin", + "name": "daedalus", + "displayName": "Daedalus", + "logo": "https://helu.ca/media/images/helu-ca_logo.original.svg", + "homepageUrl": "https://daedalus.ouranos.helu.ca", + "organization": "heluca", + "cert": "cert-heluca", + "enablePassword": true, + "enableSignUp": false, + "clientId": "{{ vault_daedalus_oauth_client_id }}", + "clientSecret": "{{ vault_daedalus_oauth_client_secret }}", + "providers": [], + "signinMethods": [ + {"name": "Password", "displayName": "Password", "rule": "All"} + ], + "signupItems": [ + {"name": "ID", "visible": false, "required": true, "prompted": false, "rule": "Random"}, + {"name": "Email", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Display name", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Password", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Confirm password", "visible": true, "required": true, "prompted": false, "rule": "None"} + ], + "grantTypes": [ + "authorization_code", + "refresh_token" + ], + "redirectUris": [ + "https://daedalus.ouranos.helu.ca/oauth/callback" + ], + "tokenFormat": "JWT", + "expireInHours": 168, + "formCss": "", + "footerHtml": "" } ], "users": [ diff --git a/ansible/inventory/host_vars/portia.incus.yml b/ansible/inventory/host_vars/portia.incus.yml index 107b1c2..ea03bcb 100644 --- a/ansible/inventory/host_vars/portia.incus.yml +++ b/ansible/inventory/host_vars/portia.incus.yml @@ -46,6 +46,9 @@ nike_db_password: "{{ vault_nike_db_password }}" periplus_db_name: periplus periplus_db_user: periplus periplus_db_password: "{{ vault_periplus_db_password }}" +daedalus_db_name: daedalus +daedalus_db_user: daedalus +daedalus_db_password: "{{ vault_daedalus_db_password }}" # PostgreSQL admin password postgres_password: "{{ vault_postgres_password }}" diff --git a/ansible/inventory/host_vars/puck.incus.yml b/ansible/inventory/host_vars/puck.incus.yml index 
bc0ddef..5190708 100644 --- a/ansible/inventory/host_vars/puck.incus.yml +++ b/ansible/inventory/host_vars/puck.incus.yml @@ -20,6 +20,7 @@ kairos_syslog_port: 51451 icarlos_syslog_port: 51461 spelunker_syslog_port: 51481 jupyterlab_syslog_port: 51491 +daedalus_syslog_port: 51501 # ============================================================================= # JupyterLab Configuration diff --git a/ansible/inventory/host_vars/titania.incus.yml b/ansible/inventory/host_vars/titania.incus.yml index fa7dc9c..707ef52 100644 --- a/ansible/inventory/host_vars/titania.incus.yml +++ b/ansible/inventory/host_vars/titania.incus.yml @@ -119,6 +119,11 @@ haproxy_backends: backend_host: "rosalind.incus" backend_port: 22082 health_path: "/api/healthz" timeout_server: 120s + + - subdomain: "daedalus" + backend_host: "puck.incus" + backend_port: 22181 + health_path: "/api/health" - subdomain: "lobechat" diff --git a/ansible/postgresql/deploy.yml b/ansible/postgresql/deploy.yml index 939e899..1e9bfcb 100644 --- a/ansible/postgresql/deploy.yml +++ b/ansible/postgresql/deploy.yml @@ -202,6 +202,7 @@ - { user: "{{ hass_db_user }}", password: "{{ hass_db_password }}" } - { user: "{{ nike_db_user }}", password: "{{ nike_db_password }}" } - { user: "{{ periplus_db_user }}", password: "{{ periplus_db_password }}" } + - { user: "{{ daedalus_db_user }}", password: "{{ daedalus_db_password }}" } no_log: true - name: Create application databases with owners @@ -224,6 +225,7 @@ - { name: "{{ hass_db_name }}", owner: "{{ hass_db_user }}" } - { name: "{{ nike_db_name }}", owner: "{{ nike_db_user }}" } - { name: "{{ periplus_db_name }}", owner: "{{ periplus_db_user }}" } + - { name: "{{ daedalus_db_name }}", owner: "{{ daedalus_db_user }}" } - name: Enable postgis and pg_trgm extensions in periplus database community.postgresql.postgresql_ext: @@ -251,6 +253,7 @@ - "{{ openwebui_db_name }}" - "{{ spelunker_db_name }}" - "{{ anythingllm_db_name }}" + - "{{ daedalus_db_name }}" handlers: - name: 
restart postgresql diff --git a/ansible/pplg/alert_rules.yml.j2 b/ansible/pplg/alert_rules.yml.j2 index 5efc81a..521ef9d 100644 --- a/ansible/pplg/alert_rules.yml.j2 +++ b/ansible/pplg/alert_rules.yml.j2 @@ -244,6 +244,74 @@ groups: summary: "High log ingestion rate" description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging" + # ============================================================================ + # Daedalus Application Alerts + # ============================================================================ + - name: daedalus_alerts + rules: + - alert: DaedalusDown + expr: up{job="daedalus"} == 0 or daedalus_up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Daedalus is down" + description: "Daedalus has been unreachable for more than 1 minute." + + - alert: DaedalusMCPDisconnected + expr: daedalus_mcp_connections_active == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus has no active MCP connections" + description: "Daedalus has reported zero active MCP connections for 5 minutes." + + - alert: DaedalusHighErrorRate + expr: sum(rate(daedalus_http_requests_total{status=~"5.."}[5m])) / sum(rate(daedalus_http_requests_total[5m])) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus HTTP 5xx error rate above 5%" + description: "Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests." + + - alert: DaedalusClientExceptionSpike + expr: rate(daedalus_client_exceptions_total[1m]) * 60 > 10 + for: 1m + labels: + severity: warning + annotations: + summary: "Daedalus client exception spike" + description: "Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf \"%.1f\" }}/min)." 
+ + - alert: DaedalusSlowResponses + expr: histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus p95 response time above 5s" + description: "Daedalus p95 response latency is {{ $value | printf \"%.2f\" }}s." + + - alert: DaedalusMCPLatency + expr: histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus MCP p95 latency above 30s" + description: "Daedalus MCP p95 latency is {{ $value | printf \"%.2f\" }}s." + + - alert: DaedalusS3Errors + expr: rate(daedalus_s3_errors_total[5m]) / rate(daedalus_s3_requests_total[5m]) > 0.01 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus S3 error rate above 1%" + description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes." + # Red Panda Seal of Approval 🐼 # "If the metrics aren't red, go back to bed" {% endraw %} diff --git a/ansible/pplg/prometheus.yml.j2 b/ansible/pplg/prometheus.yml.j2 index 1a369c8..9c6a50f 100644 --- a/ansible/pplg/prometheus.yml.j2 +++ b/ansible/pplg/prometheus.yml.j2 @@ -45,4 +45,10 @@ scrape_configs: accessKey: ['{{ casdoor_prometheus_access_key }}'] accessSecret: ['{{ casdoor_prometheus_access_secret }}'] + - job_name: 'daedalus' + static_configs: + - targets: ['puck.incus:22181'] + metrics_path: '/metrics' + scrape_interval: 15s + # Red Panda Approved Prometheus Configuration diff --git a/ansible/prometheus/alert_rules.yml.j2 b/ansible/prometheus/alert_rules.yml.j2 index 5efc81a..521ef9d 100644 --- a/ansible/prometheus/alert_rules.yml.j2 +++ b/ansible/prometheus/alert_rules.yml.j2 @@ -244,6 +244,74 @@ groups: summary: "High log ingestion rate" description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging" + # ============================================================================ + # 
Daedalus Application Alerts + # ============================================================================ + - name: daedalus_alerts + rules: + - alert: DaedalusDown + expr: up{job="daedalus"} == 0 or daedalus_up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Daedalus is down" + description: "Daedalus has been unreachable for more than 1 minute." + + - alert: DaedalusMCPDisconnected + expr: daedalus_mcp_connections_active == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus has no active MCP connections" + description: "Daedalus has reported zero active MCP connections for 5 minutes." + + - alert: DaedalusHighErrorRate + expr: sum(rate(daedalus_http_requests_total{status=~"5.."}[5m])) / sum(rate(daedalus_http_requests_total[5m])) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus HTTP 5xx error rate above 5%" + description: "Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests." + + - alert: DaedalusClientExceptionSpike + expr: rate(daedalus_client_exceptions_total[1m]) * 60 > 10 + for: 1m + labels: + severity: warning + annotations: + summary: "Daedalus client exception spike" + description: "Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf \"%.1f\" }}/min)." + + - alert: DaedalusSlowResponses + expr: histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus p95 response time above 5s" + description: "Daedalus p95 response latency is {{ $value | printf \"%.2f\" }}s." + + - alert: DaedalusMCPLatency + expr: histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus MCP p95 latency above 30s" + description: "Daedalus MCP p95 latency is {{ $value | printf \"%.2f\" }}s." 
+ + - alert: DaedalusS3Errors + expr: rate(daedalus_s3_errors_total[5m]) / rate(daedalus_s3_requests_total[5m]) > 0.01 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus S3 error rate above 1%" + description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes." + # Red Panda Seal of Approval 🐼 # "If the metrics aren't red, go back to bed" {% endraw %} diff --git a/ansible/prometheus/prometheus.yml.j2 b/ansible/prometheus/prometheus.yml.j2 index 1a369c8..9c6a50f 100644 --- a/ansible/prometheus/prometheus.yml.j2 +++ b/ansible/prometheus/prometheus.yml.j2 @@ -45,4 +45,10 @@ scrape_configs: accessKey: ['{{ casdoor_prometheus_access_key }}'] accessSecret: ['{{ casdoor_prometheus_access_secret }}'] + - job_name: 'daedalus' + static_configs: + - targets: ['puck.incus:22181'] + metrics_path: '/metrics' + scrape_interval: 15s + # Red Panda Approved Prometheus Configuration diff --git a/docs/daedalus.md b/docs/daedalus.md new file mode 100644 index 0000000..5eea9d7 --- /dev/null +++ b/docs/daedalus.md @@ -0,0 +1,244 @@ +# Daedalus — Deployment Requirements + +All infrastructure runs within the Agathos Incus sandbox. Hosts are resolved via DNS using the `.incus` suffix. + +--- + +## 1. HAProxy — Titania + +**Host:** `titania.incus` +**Domain:** `daedalus.ouranos.helu.ca` + +HAProxy on Titania terminates TLS and routes traffic to Daedalus on puck. Casdoor SSO enforces authentication before requests reach the backend. 
+ +```haproxy +frontend https + acl host_daedalus hdr(host) -i daedalus.ouranos.helu.ca + use_backend daedalus if host_daedalus + +backend daedalus + option httpchk GET /api/health + server puck puck.incus:22181 check +``` + +**Requirements:** +- ACL entry in the HAProxy `frontend https` block +- Backend definition with health check on `/api/health` +- Casdoor application configured for `daedalus.ouranos.helu.ca` (same pattern as other Agathos services) +- TLS certificate covering `daedalus.ouranos.helu.ca` (wildcard or SAN) + +--- + +## 2. PostgreSQL — Portia + +**Host:** `portia.incus` +**Port:** 5432 +**Database:** `daedalus` + +Stores conversation history, workspace configuration, user preferences, and file metadata (S3 keys). + +**Provisioning:** +```sql +CREATE USER daedalus WITH PASSWORD '