diff --git a/ansible/alloy/puck/config.alloy.j2 b/ansible/alloy/puck/config.alloy.j2 index 2960dcd..3c9aa1a 100644 --- a/ansible/alloy/puck/config.alloy.j2 +++ b/ansible/alloy/puck/config.alloy.j2 @@ -111,6 +111,20 @@ loki.source.syslog "jupyterlab_logs" { forward_to = [loki.write.default.receiver] } +loki.source.syslog "daedalus_logs" { + listener { + address = "127.0.0.1:{{daedalus_syslog_port}}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" + labels = { + job = "daedalus", + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } + } + forward_to = [loki.write.default.receiver] +} + loki.write "default" { endpoint { url = "{{loki_url}}" diff --git a/ansible/casdoor/init_data.json.j2 b/ansible/casdoor/init_data.json.j2 index e774666..2682f23 100644 --- a/ansible/casdoor/init_data.json.j2 +++ b/ansible/casdoor/init_data.json.j2 @@ -240,6 +240,41 @@ "expireInHours": 168, "formCss": "", "footerHtml": "
Powered by Helu.ca
" + }, + { + "owner": "admin", + "name": "daedalus", + "displayName": "Daedalus", + "logo": "https://helu.ca/media/images/helu-ca_logo.original.svg", + "homepageUrl": "https://daedalus.ouranos.helu.ca", + "organization": "heluca", + "cert": "cert-heluca", + "enablePassword": true, + "enableSignUp": false, + "clientId": "{{ vault_daedalus_oauth_client_id }}", + "clientSecret": "{{ vault_daedalus_oauth_client_secret }}", + "providers": [], + "signinMethods": [ + {"name": "Password", "displayName": "Password", "rule": "All"} + ], + "signupItems": [ + {"name": "ID", "visible": false, "required": true, "prompted": false, "rule": "Random"}, + {"name": "Email", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Display name", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Password", "visible": true, "required": true, "prompted": false, "rule": "None"}, + {"name": "Confirm password", "visible": true, "required": true, "prompted": false, "rule": "None"} + ], + "grantTypes": [ + "authorization_code", + "refresh_token" + ], + "redirectUris": [ + "https://daedalus.ouranos.helu.ca/oauth/callback" + ], + "tokenFormat": "JWT", + "expireInHours": 168, + "formCss": "", + "footerHtml": "
Powered by Helu.ca
" } ], "users": [ diff --git a/ansible/inventory/host_vars/portia.incus.yml b/ansible/inventory/host_vars/portia.incus.yml index 107b1c2..ea03bcb 100644 --- a/ansible/inventory/host_vars/portia.incus.yml +++ b/ansible/inventory/host_vars/portia.incus.yml @@ -46,6 +46,9 @@ nike_db_password: "{{ vault_nike_db_password }}" periplus_db_name: periplus periplus_db_user: periplus periplus_db_password: "{{ vault_periplus_db_password }}" +daedalus_db_name: daedalus +daedalus_db_user: daedalus +daedalus_db_password: "{{ vault_daedalus_db_password }}" # PostgreSQL admin password postgres_password: "{{ vault_postgres_password }}" diff --git a/ansible/inventory/host_vars/puck.incus.yml b/ansible/inventory/host_vars/puck.incus.yml index bc0ddef..5190708 100644 --- a/ansible/inventory/host_vars/puck.incus.yml +++ b/ansible/inventory/host_vars/puck.incus.yml @@ -20,6 +20,7 @@ kairos_syslog_port: 51451 icarlos_syslog_port: 51461 spelunker_syslog_port: 51481 jupyterlab_syslog_port: 51491 +daedalus_syslog_port: 51501 # ============================================================================= # JupyterLab Configuration diff --git a/ansible/inventory/host_vars/titania.incus.yml b/ansible/inventory/host_vars/titania.incus.yml index fa7dc9c..707ef52 100644 --- a/ansible/inventory/host_vars/titania.incus.yml +++ b/ansible/inventory/host_vars/titania.incus.yml @@ -119,6 +119,12 @@ haproxy_backends: backend_host: "rosalind.incus" backend_port: 22082 health_path: "/api/healthz" timeout_server: 120s + + - subdomain: "daedalus" + backend_host: "puck.incus" + backend_port: 22181 + health_path: "/api/health" + timeout_server: 120s - subdomain: "lobechat" diff --git a/ansible/postgresql/deploy.yml b/ansible/postgresql/deploy.yml index 939e899..1e9bfcb 100644 --- a/ansible/postgresql/deploy.yml +++ b/ansible/postgresql/deploy.yml @@ -202,6 +202,7 @@ - { user: "{{ hass_db_user }}", password: "{{ hass_db_password }}" } - { user: "{{ nike_db_user }}", password: "{{ nike_db_password }}" } - { user: "{{ periplus_db_user 
}}", password: "{{ periplus_db_password }}" } + - { user: "{{ daedalus_db_user }}", password: "{{ daedalus_db_password }}" } no_log: true - name: Create application databases with owners @@ -224,6 +225,7 @@ - { name: "{{ hass_db_name }}", owner: "{{ hass_db_user }}" } - { name: "{{ nike_db_name }}", owner: "{{ nike_db_user }}" } - { name: "{{ periplus_db_name }}", owner: "{{ periplus_db_user }}" } + - { name: "{{ daedalus_db_name }}", owner: "{{ daedalus_db_user }}" } - name: Enable postgis and pg_trgm extensions in periplus database community.postgresql.postgresql_ext: @@ -251,6 +253,7 @@ - "{{ openwebui_db_name }}" - "{{ spelunker_db_name }}" - "{{ anythingllm_db_name }}" + - "{{ daedalus_db_name }}" handlers: - name: restart postgresql diff --git a/ansible/pplg/alert_rules.yml.j2 b/ansible/pplg/alert_rules.yml.j2 index 5efc81a..521ef9d 100644 --- a/ansible/pplg/alert_rules.yml.j2 +++ b/ansible/pplg/alert_rules.yml.j2 @@ -244,6 +244,74 @@ groups: summary: "High log ingestion rate" description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging" + # ============================================================================ + # Daedalus Application Alerts + # ============================================================================ + - name: daedalus_alerts + rules: + - alert: DaedalusDown + expr: up{job="daedalus"} == 0 or daedalus_up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Daedalus is down" + description: "Daedalus has been unreachable for more than 1 minute." + + - alert: DaedalusMCPDisconnected + expr: daedalus_mcp_connections_active == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus has no active MCP connections" + description: "Daedalus has reported zero active MCP connections for 5 minutes." 
+ + - alert: DaedalusHighErrorRate + expr: sum(rate(daedalus_http_requests_total{status=~"5.."}[5m])) / sum(rate(daedalus_http_requests_total[5m])) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus HTTP 5xx error rate above 5%" + description: "Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests." + + - alert: DaedalusClientExceptionSpike + expr: sum(rate(daedalus_client_exceptions_total[1m])) * 60 > 10 + for: 1m + labels: + severity: warning + annotations: + summary: "Daedalus client exception spike" + description: "Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf \"%.1f\" }}/min)." + + - alert: DaedalusSlowResponses + expr: histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus p95 response time above 5s" + description: "Daedalus p95 response latency is {{ $value | printf \"%.2f\" }}s." + + - alert: DaedalusMCPLatency + expr: histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus MCP p95 latency above 30s" + description: "Daedalus MCP p95 latency is {{ $value | printf \"%.2f\" }}s." + + - alert: DaedalusS3Errors + expr: sum(rate(daedalus_s3_errors_total[5m])) / sum(rate(daedalus_s3_requests_total[5m])) > 0.01 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus S3 error rate above 1%" + description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes." 
+ # Red Panda Seal of Approval 🐼 # "If the metrics aren't red, go back to bed" {% endraw %} diff --git a/ansible/pplg/prometheus.yml.j2 b/ansible/pplg/prometheus.yml.j2 index 1a369c8..9c6a50f 100644 --- a/ansible/pplg/prometheus.yml.j2 +++ b/ansible/pplg/prometheus.yml.j2 @@ -45,4 +45,10 @@ scrape_configs: accessKey: ['{{ casdoor_prometheus_access_key }}'] accessSecret: ['{{ casdoor_prometheus_access_secret }}'] + - job_name: 'daedalus' + static_configs: + - targets: ['puck.incus:22181'] + metrics_path: '/metrics' + scrape_interval: 15s + # Red Panda Approved Prometheus Configuration diff --git a/ansible/prometheus/alert_rules.yml.j2 b/ansible/prometheus/alert_rules.yml.j2 index 5efc81a..521ef9d 100644 --- a/ansible/prometheus/alert_rules.yml.j2 +++ b/ansible/prometheus/alert_rules.yml.j2 @@ -244,6 +244,74 @@ groups: summary: "High log ingestion rate" description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging" + # ============================================================================ + # Daedalus Application Alerts + # ============================================================================ + - name: daedalus_alerts + rules: + - alert: DaedalusDown + expr: up{job="daedalus"} == 0 or daedalus_up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Daedalus is down" + description: "Daedalus has been unreachable for more than 1 minute." + + - alert: DaedalusMCPDisconnected + expr: daedalus_mcp_connections_active == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus has no active MCP connections" + description: "Daedalus has reported zero active MCP connections for 5 minutes." 
+ + - alert: DaedalusHighErrorRate + expr: sum(rate(daedalus_http_requests_total{status=~"5.."}[5m])) / sum(rate(daedalus_http_requests_total[5m])) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus HTTP 5xx error rate above 5%" + description: "Daedalus is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests." + + - alert: DaedalusClientExceptionSpike + expr: sum(rate(daedalus_client_exceptions_total[1m])) * 60 > 10 + for: 1m + labels: + severity: warning + annotations: + summary: "Daedalus client exception spike" + description: "Daedalus is recording more than 10 client exceptions per minute (current: {{ $value | printf \"%.1f\" }}/min)." + + - alert: DaedalusSlowResponses + expr: histogram_quantile(0.95, rate(daedalus_http_request_duration_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus p95 response time above 5s" + description: "Daedalus p95 response latency is {{ $value | printf \"%.2f\" }}s." + + - alert: DaedalusMCPLatency + expr: histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m])) > 30 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus MCP p95 latency above 30s" + description: "Daedalus MCP p95 latency is {{ $value | printf \"%.2f\" }}s." + + - alert: DaedalusS3Errors + expr: sum(rate(daedalus_s3_errors_total[5m])) / sum(rate(daedalus_s3_requests_total[5m])) > 0.01 + for: 5m + labels: + severity: warning + annotations: + summary: "Daedalus S3 error rate above 1%" + description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes." 
+ # Red Panda Seal of Approval 🐼 # "If the metrics aren't red, go back to bed" {% endraw %} diff --git a/ansible/prometheus/prometheus.yml.j2 b/ansible/prometheus/prometheus.yml.j2 index 1a369c8..9c6a50f 100644 --- a/ansible/prometheus/prometheus.yml.j2 +++ b/ansible/prometheus/prometheus.yml.j2 @@ -45,4 +45,10 @@ scrape_configs: accessKey: ['{{ casdoor_prometheus_access_key }}'] accessSecret: ['{{ casdoor_prometheus_access_secret }}'] + - job_name: 'daedalus' + static_configs: + - targets: ['puck.incus:22181'] + metrics_path: '/metrics' + scrape_interval: 15s + # Red Panda Approved Prometheus Configuration diff --git a/docs/daedalus.md b/docs/daedalus.md new file mode 100644 index 0000000..5eea9d7 --- /dev/null +++ b/docs/daedalus.md @@ -0,0 +1,244 @@ +# Daedalus — Deployment Requirements + +All infrastructure runs within the Agathos Incus sandbox. Hosts are resolved via DNS using the `.incus` suffix. + +--- + +## 1. HAProxy — Titania + +**Host:** `titania.incus` +**Domain:** `daedalus.ouranos.helu.ca` + +HAProxy on Titania terminates TLS and routes traffic to Daedalus on puck. Casdoor SSO enforces authentication before requests reach the backend. + +```haproxy +frontend https + acl host_daedalus hdr(host) -i daedalus.ouranos.helu.ca + use_backend daedalus if host_daedalus + +backend daedalus + option httpchk GET /api/health + server puck puck.incus:22181 check +``` + +**Requirements:** +- ACL entry in the HAProxy `frontend https` block +- Backend definition with health check on `/api/health` +- Casdoor application configured for `daedalus.ouranos.helu.ca` (same pattern as other Agathos services) +- TLS certificate covering `daedalus.ouranos.helu.ca` (wildcard or SAN) + +--- + +## 2. PostgreSQL — Portia + +**Host:** `portia.incus` +**Port:** 5432 +**Database:** `daedalus` + +Stores conversation history, workspace configuration, user preferences, and file metadata (S3 keys). 
+ +**Provisioning:** +```sql +CREATE USER daedalus WITH PASSWORD '<password>'; +CREATE DATABASE daedalus OWNER daedalus; +``` + +**Connection string (backend `.env`):** +``` +DAEDALUS_DATABASE_URL=postgresql+asyncpg://daedalus:<password>@portia.incus:5432/daedalus +``` + +**Schema management:** Alembic migrations run from the backend container/service on puck. No direct DDL. + +**Tables (created by Alembic):** +- Workspaces +- Conversations / messages +- File metadata (S3 keys, filenames, content types, sizes) +- User preferences + +--- + +## 3. Prometheus Scraping — Prospero + +**Scraper:** `prospero.incus` +**Target:** `puck.incus:22181/metrics` + +Daedalus exposes a `/metrics` endpoint in Prometheus text format. Prospero's Prometheus must be configured to scrape it. + +**Prometheus scrape config:** +```yaml +- job_name: daedalus + scrape_interval: 15s + metrics_path: /metrics + static_configs: + - targets: + - puck.incus:22181 +``` + +**Key metric families:** +| Prefix | Category | +|--------|----------| +| `daedalus_up`, `daedalus_build_info` | Application health | +| `daedalus_http_requests_total`, `daedalus_http_request_duration_seconds` | HTTP traffic | +| `daedalus_mcp_*` | MCP connection and request metrics | +| `daedalus_agent_*` | Agent interaction metrics | +| `daedalus_file_*`, `daedalus_s3_*` | File and S3 operations | +| `daedalus_client_*` | Browser telemetry (exceptions, Web Vitals) | +| `daedalus_feature_usage_total` | Feature usage counters | + +**Network requirement:** The `/metrics` endpoint is restricted to internal networks (`10.10.0.0/24`, `172.16.0.0/12`, `127.0.0.0/8`) in the Nginx config. 
+ +**Prometheus alerting rules** (on Prospero): +| Alert | Condition | Severity | +|-------|-----------|----------| +| `DaedalusDown` | `daedalus_up == 0` for 1m | critical | +| `DaedalusMCPDisconnected` | `daedalus_mcp_connections_active == 0` for 5m | warning | +| `DaedalusHighErrorRate` | HTTP 5xx > 5% for 5m | warning | +| `DaedalusClientExceptionSpike` | Client exceptions > 10/min for 1m | warning | +| `DaedalusSlowResponses` | p95 > 5s for 5m | warning | +| `DaedalusMCPLatency` | MCP p95 > 30s for 5m | warning | +| `DaedalusS3Errors` | S3 error rate > 1% for 5m | warning | + +--- + +## 4. S3 Object Storage — MinIO + +**Provider:** MinIO on Incus (provisioned by Terraform) +**Bucket:** `daedalus` + +Stores workspace file uploads. Metadata lives in PostgreSQL; actual bytes live in S3. + +**Key layout:** +``` +workspaces/{workspace_id}/files/{file_id}/{filename} +``` + +**Backend environment variables:** +``` +DAEDALUS_S3_ENDPOINT=http://<minio-host>:9000 +DAEDALUS_S3_ACCESS_KEY=<access-key> +DAEDALUS_S3_SECRET_KEY=<secret-key> +DAEDALUS_S3_BUCKET=daedalus +``` + +**Requirements:** +- Terraform resource for the bucket (same pattern as Casdoor and LobeChat S3 buckets) +- Access key / secret key stored in Ansible Vault +- Credentials never exposed to the frontend — all file access flows through FastAPI + +--- + +## 5. 
Application Runtime — Puck + +**Host:** `puck.incus` +**Nginx port:** 22181 (proxied by HAProxy on Titania) +**Uvicorn port:** 8000 (internal only, behind Nginx) + +### Production Deployment + +**Systemd service** (`/etc/systemd/system/daedalus-api.service`): +```ini +[Unit] +Description=Daedalus API (FastAPI) +After=network.target + +[Service] +Type=simple +User=daedalus +WorkingDirectory=/srv/daedalus/backend +ExecStart=/srv/daedalus/venv/bin/uvicorn daedalus.main:app --host 127.0.0.1 --port 8000 --workers 2 +Restart=always +RestartSec=5 +Environment=DAEDALUS_ENV=production +StandardOutput=journal +StandardError=journal +SyslogIdentifier=daedalus + +[Install] +WantedBy=multi-user.target +``` + +**Nginx** serves the SvelteKit static build from `/srv/daedalus/frontend/build` and proxies `/api/*` and `/metrics` to Uvicorn on `127.0.0.1:8000`. + +**Directory layout (production):** +``` +/srv/daedalus/ +├── backend/ # Python source +├── frontend/build/ # Static SPA build +├── venv/ # Python virtualenv +└── .env # Environment configuration +``` + +### Docker Compose (Development) + +For local development on puck, two containers run behind Docker bridge networking: + +| Service | Image | Port | +|---------|-------|------| +| `api` | Built from `./backend/Dockerfile` | 8000 (internal) | +| `nginx` | `nginx:alpine` | 22181 → 80 | + +```bash +docker compose up --build +``` + +--- + +## 6. Logging — Alloy → Loki + +**No log files.** Structured JSON goes to stdout. + +| Environment | Log path | +|-------------|----------| +| Production (systemd) | stdout → journal → syslog → Alloy → Loki (prospero) | +| Development (Docker) | stdout → Docker log driver → Alloy → Loki (prospero) | + +Alloy on puck is already configured to ship container and journal logs to Loki on Prospero; Daedalus additionally gets a dedicated syslog listener (`127.0.0.1:51501`, set by `daedalus_syslog_port`) whose streams are labelled `job="daedalus"`. The `SyslogIdentifier=daedalus` tag identifies the service's journal entries; filter in Grafana with `{job="daedalus"}`. + +--- + +## 7. 
LLM Proxy — Sycorax + +**Host:** `sycorax.incus` + +FastAgent MCP servers route LLM API calls through Arke on Sycorax for multi-provider model routing (OpenAI, Anthropic, etc.). Daedalus does not call Sycorax directly — it communicates with FastAgent servers over MCP Streamable HTTP, and those agents use Sycorax. + +--- + +## 8. DNS Summary + +| FQDN | Resolves to | Purpose | +|------|-------------|---------| +| `daedalus.ouranos.helu.ca` | Titania (HAProxy) | Public entry point | +| `puck.incus` | Puck | Application host (Nginx + Uvicorn) | +| `portia.incus` | Portia | PostgreSQL | +| `prospero.incus` | Prospero | Prometheus, Loki, Grafana | +| `titania.incus` | Titania | HAProxy + Casdoor SSO | +| `sycorax.incus` | Sycorax | LLM proxy (Arke) | + +--- + +## 9. Deployment Checklist + +### Infrastructure (Terraform / Ansible) +- [ ] Create `daedalus` database and user on Portia +- [ ] Create `daedalus` S3 bucket in MinIO (Terraform) +- [ ] Store DB password and S3 credentials in Ansible Vault +- [ ] Add Prometheus scrape target on Prospero +- [ ] Add AlertManager rules on Prospero +- [ ] Add Grafana dashboard on Prospero +- [ ] Configure HAProxy backend + ACL on Titania +- [ ] Configure Casdoor application for `daedalus.ouranos.helu.ca` + +### Application (Puck) +- [ ] Create `/srv/daedalus` directory structure +- [ ] Create `daedalus` system user +- [ ] Set up Python virtualenv and install backend dependencies +- [ ] Build SvelteKit frontend (`npm run build`) +- [ ] Deploy `.env` from Ansible Vault +- [ ] Install Nginx site config +- [ ] Install and enable systemd service +- [ ] Run Alembic migrations (`alembic upgrade head`) +- [ ] Verify `/api/health` returns `{"status": "ok"}` +- [ ] Verify `/metrics` is reachable from Prospero +- [ ] Verify `daedalus.ouranos.helu.ca` loads the SPA through HAProxy diff --git a/terraform/storage.tf b/terraform/storage.tf index 61a0db4..4d005ad 100644 --- a/terraform/storage.tf +++ b/terraform/storage.tf @@ -63,6 +63,23 @@ 
resource "incus_storage_bucket_key" "spelunker_key" { role = "admin" } +# S3 bucket for Daedalus file storage +resource "incus_storage_bucket" "daedalus" { + name = "daedalus" + pool = var.storage_pool + project = var.project_name + description = "Daedalus file storage bucket" +} + +# Access key for Daedalus S3 bucket +resource "incus_storage_bucket_key" "daedalus_key" { + name = "daedalus-access" + pool = incus_storage_bucket.daedalus.pool + storage_bucket = incus_storage_bucket.daedalus.name + project = var.project_name + role = "admin" +} + # Outputs for S3 credentials (to be stored in Ansible vault) output "lobechat_s3_credentials" { description = "Lobechat S3 bucket credentials - store in vault as vault_lobechat_s3_*" @@ -96,3 +113,14 @@ output "spelunker_s3_credentials" { } sensitive = true } + +output "daedalus_s3_credentials" { + description = "Daedalus S3 bucket credentials - store in vault as vault_daedalus_s3_*" + value = { + bucket = incus_storage_bucket.daedalus.name + access_key = incus_storage_bucket_key.daedalus_key.access_key + secret_key = incus_storage_bucket_key.daedalus_key.secret_key + endpoint = "https://${incus_storage_bucket.daedalus.location}" + } + sensitive = true +}