Compare commits

...

42 Commits

Author SHA1 Message Date
343b0e13d6 fix(certbot): harden renewal hook and fix permission errors
The renewal deploy-hook ran as the certbot user but lacked permissions to
write the combined PEM to /etc/haproxy/certs and to reload HAProxy,
causing silent failures that left a stale certificate in production until
expiry.

- Add certbot user to the haproxy group so it can write the combined PEM
- Grant certbot NOPASSWD sudo for `systemctl reload haproxy` only
- Make the Prometheus textfile directory group-owned by certbot (0775)
  so cert-metrics.sh can atomically update ssl_cert.prom
- Refactor renewal-hook.sh to always refresh cert metrics on exit via a
  trap, ensuring expiry alerts still fire even when the hook itself fails
- Replace `set -e` with explicit error handling and structured logging
2026-06-17 09:58:46 -04:00
2f5a15eef5 chore(haproxy,terraform): harden haproxy stats and pin incus provider
- Add maxconn limit and HTTP timeouts to mitigate slowloris attacks
- Restrict stats endpoint to internal LAN and localhost only
- Hide HAProxy version on stats page
- Pin Incus Terraform provider to ~> 1.0 for stability
2026-06-09 22:52:23 -04:00
35061e3b6d Caliban: Update Rommie port 2026-06-07 08:14:55 -04:00
95682eca61 Caliban: configure Kernos mcp api key 2026-06-07 08:14:39 -04:00
711bbc093b Caliban: Update llama cpp ports 2026-06-07 08:14:18 -04:00
9bfa9a3617 feat(terraform): expand caliban port forwards and document port ranges
- Add proxy devices on caliban for SSH (25512), Postgres (25515),
  and three web ports (25516-25518) alongside existing RDP forward
- Remove HTTP/HTTPS proxy devices from prospero (now handled via
  HAProxy on titania)
- Document Incus port forwarding ranges (25510-25599) per host in
  ouranos.md and fix a typo
2026-06-07 06:40:42 -04:00
f2fb01ddd2 Titania: Add Hecate 2026-06-05 12:03:25 -04:00
c8ad7a0129 feat(terraform): add S3 storage bucket and credentials for Peitho 2026-06-01 13:47:18 -04:00
12b1db36f8 feat(haproxy): block internal observability endpoints from public traffic 2026-06-01 07:30:07 -04:00
77a82b4784 docs: update FreeCAD MCP README to document dual-service architecture 2026-05-31 10:13:43 -04:00
3893b91a55 feat(ansible): add CASE Field Systems MCP endpoint configuration
Configure FastAgent MCP server to connect to the CASE Field Systems
service over HTTP. Enables integration with LAN, SD Card, and
Provisioning workflows without authentication.

Uses dynamic Ansible variables for host and port to support
environment-specific deployments.
2026-05-30 10:19:24 -04:00
76a0e043e9 chore(ansible): add CASE agent configuration to kottos inventory
Introduce the CASE engineering agent by defining kottos_case_port
(24152) and updating the agents list comment. This extends the
systemd-managed pallas process configuration to include the CASE
runtime alongside existing Harper, Scotty, Research, and Tech
Research agents.
2026-05-30 09:44:07 -04:00
acf3419450 refactor(ansible): rename freecad_mcp env vars and rework deployment
- Drop `FREECAD_MCP_` prefix from env vars (use `FREECAD_*`)
- Update freecad_mcp port from 22032 to 22061
- Document that FreeCAD bridge is required for tool calls
- Replace kottos deployment with pallas deployment
2026-05-30 09:37:56 -04:00
bc431a3a2a refactor(alloy): remove athena syslog listener in favor of docker logs 2026-05-30 09:37:15 -04:00
30b5cab808 feat(rommie): add JPEG quality and size cap for get_screenshot
- Add ROMMIE_SCREENSHOT_JPEG_QUALITY and ROMMIE_SCREENSHOT_MAX_KB env vars
  to control parent-agent screenshot output encoding and size limit
- Configure defaults (quality 80, 512KB cap) in caliban.incus host vars
- Trigger rommie service restart when .env file changes
2026-05-28 13:30:17 -04:00
3bdb11dc72 chore(ansible): update model endpoints and enable Rommie deployment
- Bump Qwen model from 3.5 to 3.6 and update inference endpoints
  (nyx:22079→22072, pan:22078→22076) for caliban and puck hosts
- Add Rommie MCP server deployment to site.yml
- Update Rommie docs to reflect new port (20361), model versions,
  and health check accepting 200/406 status codes
2026-05-28 12:17:23 -04:00
a01feee663 chore(ansible): update vault credentials 2026-05-26 21:45:17 -04:00
f4a25316de SearXNG: set docker pull policy always 2026-05-26 06:47:48 -04:00
3c2f8c57ca feat(observability): add SearXNG, Argos, and Pallas monitoring
- Add SearXNG syslog ingestion and blackbox health probes on miranda
  and rosalind for per-host attributable failure detection
- Scrape Argos MCP application metrics from miranda
- Add Pallas dashboard panels for downstream availability and turn
  error ratios
2026-05-24 23:52:53 -04:00
43fae203d1 feat(ansible): standardize Neo4j ports and add monitoring
- Unify Neo4j HTTP/Bolt/syslog ports across ariel and umbriel hosts
- Add neo4j_metrics_port (22094) for APOC exporter sidecar
- Add umbriel to Prometheus node_exporter targets
- Add Neo4j scrape config and alerts for tx rollback rate and
  stalled store growth
- Replace kernos_harper MCP with andromeda (caliban.helu.ca)
- Remove angelia MCP from kottos fastagent config
- Switch neo4j group membership from keeper_user to ponos
2026-05-22 22:19:13 -04:00
698ceacb74 chore: update ansible vault secrets and credentials
Updated encrypted vault.yml file with new credentials and
secrets for production infrastructure
2026-05-17 07:32:51 -04:00
52d444f731 feat(ansible): add hold_slayer database variables and deployment
- Add hold_slayer_db_* variables to portia host_vars
- Update postgresql deploy.yml to create user, database,
  and enable extensions for hold_slayer
2026-05-16 19:10:49 -04:00
b2fc398782 Move llama-cpp to generic fastagent slot 2026-05-12 15:07:00 -04:00
8c95173705 feat(alloy): add journal relabeling and kottos integration on puck
Introduce structured journal relabel rules on puck to tag Pallas-managed
units with {service, project, component} labels matching the Mnemosyne
and Daedalus schema. Add kottos release variable and vault secrets
example entries for the new Pallas FastAgent runtime.

Remove the defunct mnemosyne syslog listener now that Mnemosyne ships
JSON logs via the docker-socket pipeline.
2026-05-11 13:54:14 -04:00
e92ab80bbf feat(ansible): add Jellyfin service and improve deployment
- Add Jellyfin backend to HAProxy configuration on titania.incus
- Simplify deployment by using community.docker.docker_compose_v2 module
- Consolidate handlers and remove redundant Docker commands
- Update Jellyfin systemd service from oneshot to simple type
- Remove PUID/PGID environment variables from docker-compose template
2026-05-04 15:49:18 -04:00
f818b7917d feat(infra): add Jellyfin media server configuration and logging support
Add Jellyfin service to ansible inventory with hardware
transcoding and Casdoor SSO configuration. Configure
Alloy syslog listener to capture Jellyfin logs to Loki.
Update documentation with new service mapping and S3
bucket credential retrieval instructions.
2026-05-04 15:33:25 -04:00
b9ce14ff77 Docs: Update Ouranos to include new Umbriel instance 2026-05-03 19:35:55 -04:00
4ae6379613 chore(ansible): centralize third-party Docker image versions
Add centralized image version variables in group_vars/all/vars.yml for
vulnerability tracking and controlled upgrades of third-party Docker
images (casdoor, flower, grafana-mcp, gitea-mcp, neo4j, memcached,
nginx, oauth2-proxy, rabbitmq, searxng).

Update vault.yml accordingly.
2026-05-03 18:57:58 -04:00
2be323f27e Casdoor: Change to curl for healthcheck 2026-05-02 07:01:54 -04:00
14f026d0bb Docs: Pallas agents 2026-04-29 07:21:01 -04:00
0789edc31a Docs: Pallas Agents 2026-04-29 07:21:00 -04:00
2794822871 docs: add Django-specific Red Panda Standards addendum
Add `Red_Panda_Standards_Django_V1-01.md` which extends the main Red
Panda Standards with Django-specific conventions covering:

- Environment setup and pyproject.toml build backend (setuptools)
- Dependency pinning strategy (floor pin with ceiling)
- Project directory structure
- Settings, environment variable, and database configuration patterns
- Code organization, model, view, URL, and serializer conventions
- Authentication, permissions, and API design guidelines
- Testing standards and Docker/deployment practices
2026-04-20 09:37:01 -04:00
1509b81ce0 Docs: deleted outdated file 2026-04-18 13:38:57 -04:00
5251288975 Docs: Red Panda Standards Update regarding logging 2026-04-18 06:58:59 -04:00
072291929f Docs: Red Panda Standards Update 2026-04-18 06:36:43 -04:00
60074612f3 PGAdmin setup steps corrections 2026-04-13 16:37:18 +00:00
6301facc1a Vault additions 2026-04-13 15:47:47 +00:00
f3f599a33a Vault formatting 2026-04-13 15:31:49 +00:00
d60b9a972f feat(ansible): add mnemosyne db and update ouranos documentation
- Configure mnemosyne database credentials in ansible inventory
- Update postgresql playbook to provision user and database
- Add setup instructions and DB list to documentation
2026-04-13 14:31:21 +00:00
2f5a445945 Ouranos Vault Mnemosyne DB password 2026-04-13 12:49:28 +00:00
9a9f7986fc HA Proxy config for Periplus 2026-04-11 23:30:15 +00:00
c31c86f3b2 Port updates for MCP servers 2026-04-11 18:48:21 +00:00
78 changed files with 9829 additions and 977 deletions

View File

@@ -33,6 +33,7 @@
dest: "{{agent_s_repo_dir}}"
version: "{{agent_s_rel}}"
accept_hostkey: true
force: yes
register: agent_s_clone
- name: Fetch all remote branches and tags (Agent-S)

View File

@@ -93,6 +93,20 @@ loki.source.syslog "gitea_mcp_logs" {
forward_to = [loki.write.default.receiver]
}
loki.source.syslog "searxng_logs" {
listener {
address = "127.0.0.1:{{searxng_syslog_port}}"
protocol = "tcp"
syslog_format = "{{ syslog_format }}"
labels = {
job = "searxng",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
forward_to = [loki.write.default.receiver]
}
prometheus.exporter.unix "default" {
include_exporter_metrics = true
disable_collectors = ["mdadm"]
@@ -104,6 +118,45 @@ prometheus.scrape "default" {
job_name = "mcp_docker_host"
}
// Argos MCP application metrics (/metrics is exposed by argos itself; see
// argos/argos_searxng/metrics.py).
prometheus.scrape "argos" {
targets = [{
__address__ = "127.0.0.1:{{argos_port}}",
job = "argos",
instance = "{{inventory_hostname}}",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}]
forward_to = [prometheus.remote_write.default.receiver]
scrape_interval = "30s"
metrics_path = "/metrics"
}
// Independent verification that this host's SearXNG instance answers /healthz
// (Argos's own per-instance gauge can lie — argos itself could be sick).
prometheus.exporter.blackbox "searxng" {
config = "{ modules: { http_2xx: { prober: http, timeout: 5s, http: { valid_status_codes: [200] } } } }"
target {
name = "{{inventory_hostname}}"
address = "http://127.0.0.1:{{searxng_port}}/healthz"
module = "http_2xx"
labels = {
service = "searxng",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
}
prometheus.scrape "searxng_blackbox" {
targets = prometheus.exporter.blackbox.searxng.targets
forward_to = [prometheus.remote_write.default.receiver]
scrape_interval = "30s"
job_name = "searxng_blackbox"
}
prometheus.remote_write "default" {
endpoint {
url = "{{prometheus_remote_write_url}}"

View File

@@ -18,10 +18,60 @@ loki.source.file "system_logs" {
forward_to = [loki.write.default.receiver]
}
// Journal relabel rules — tag Pallas-managed units (kottos now, mentor /
// iolaus later) with the same {service, project, component} schema used
// by Mnemosyne and Daedalus. Rules are applied top-to-bottom and every
// rule whose regex matches takes effect (relabeling does not stop at the
// first match), so a later rule overwrites an earlier one that writes the
// same target_label; keep the generic "systemd" fallback last. If a new
// Pallas host/project ever lands here, copy one of the blocks below and
// adjust SyslogIdentifier + project.
loki.relabel "journal_puck" {
forward_to = []
// Expose the systemd unit as an auxiliary label for debugging.
rule {
source_labels = ["__journal__systemd_unit"]
target_label = "unit"
}
// Kottos — Pallas FastAgent runtime for the engineering agent project.
// SyslogIdentifier=kottos is set in ouranos/ansible/kottos/kottos.service.j2.
rule {
source_labels = ["__journal_syslog_identifier"]
regex = "kottos"
target_label = "service"
replacement = "pallas"
}
rule {
source_labels = ["__journal_syslog_identifier"]
regex = "kottos"
target_label = "project"
replacement = "kottos"
}
// Alloy itself — useful to separate from the "systemd" bucket when the
// shipping pipeline misbehaves.
rule {
source_labels = ["__journal__systemd_unit"]
regex = "alloy\\.service"
target_label = "service"
replacement = "alloy"
}
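// Default fallback — any entry that carries a systemd unit gets
// job="systemd", for backwards compatibility with existing dashboards
// that filter on ``job="systemd"``. NOTE(review): this rule writes only
// `job`; no rule in this block sets service="systemd" — confirm whether
// a `service` fallback is still intended.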
rule {
source_labels = ["__journal__systemd_unit"]
regex = ".+"
target_label = "job"
replacement = "systemd"
}
}
loki.source.journal "systemd_logs" {
forward_to = [loki.write.default.receiver]
relabel_rules = loki.relabel.journal_puck.rules
labels = {
job = "systemd",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
@@ -41,19 +91,11 @@ loki.source.syslog "angelia_logs" {
forward_to = [loki.write.default.receiver]
}
loki.source.syslog "athena_logs" {
listener {
address = "127.0.0.1:{{athena_syslog_port}}"
protocol = "tcp"
syslog_format = "{{ syslog_format }}"
labels = {
job = "athena",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
forward_to = [loki.write.default.receiver]
}
// Athena used to ship via syslog on {{athena_syslog_port}}; it logs to
// container stdout and is now picked up by the docker-socket block below
// (service="athena", component=app/mcp/nginx). The host_var is retained as a
// reserved port number but no listener binds to it — remove the var from the
// inventory when the rollout is verified.
loki.source.syslog "kairos_logs" {
listener {
@@ -69,19 +111,11 @@ loki.source.syslog "kairos_logs" {
forward_to = [loki.write.default.receiver]
}
loki.source.syslog "menosyne_logs" {
listener {
address = "127.0.0.1:{{mnemosyne_syslog_port}}"
protocol = "tcp"
syslog_format = "{{ syslog_format }}"
labels = {
job = "menosyne",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
forward_to = [loki.write.default.receiver]
}
// Mnemosyne used to ship via syslog on {{mnemosyne_syslog_port}}; it now
// logs line-delimited JSON to container stdout and is picked up by the
// docker-socket block below. The host_var is retained as a reserved port
// number but no listener binds to it — remove the var from the inventory
// when the rollout is verified.
loki.source.syslog "spelunker_logs" {
listener {
@@ -111,19 +145,66 @@ loki.source.syslog "jupyterlab_logs" {
forward_to = [loki.write.default.receiver]
}
loki.source.syslog "daedalus_logs" {
listener {
address = "127.0.0.1:{{daedalus_syslog_port}}"
protocol = "tcp"
syslog_format = "{{ syslog_format }}"
// Daedalus also used to ship via syslog on {{daedalus_syslog_port}}; it
// already emits structlog JSON to stdout, so the docker-socket block
// below now handles it. Host_var kept for the same transitional reason
// as mnemosyne above.
// ----------------------------------------------------------------------------
// Docker socket — any compose project on this host lands in Loki with
// `service` = compose project (e.g. "mnemosyne", "daedalus", "kairos") and
// `component` = compose service (e.g. "app", "mcp", "worker", "nginx").
// This replaces per-service syslog listeners — one block covers every
// compose project, current and future.
//
// Requires: the Alloy process to have read access to /var/run/docker.sock
// (Ansible role should add the alloy user to the `docker` group). No Docker
// daemon changes required — we scrape the json-file driver, which is Docker's
// default and is pinned in each compose project's x-logging anchor.
// ----------------------------------------------------------------------------
discovery.docker "containers" {
host = "unix:///var/run/docker.sock"
refresh_interval = "30s"
}
discovery.relabel "containers" {
targets = discovery.docker.containers.targets
// Compose project → service label
rule {
source_labels = ["__meta_docker_container_label_com_docker_compose_project"]
target_label = "service"
}
// Compose service → component label
rule {
source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
target_label = "component"
}
// Container name (for one-off / non-compose containers)
rule {
source_labels = ["__meta_docker_container_name"]
regex = "/(.*)"
target_label = "container"
}
// Fall back to the container name as `service` when compose labels are
// absent (e.g. a `docker run ...` container outside any compose project)
rule {
source_labels = ["service", "container"]
separator = "@"
regex = "@(.+)"
target_label = "service"
}
}
loki.source.docker "containers" {
host = "unix:///var/run/docker.sock"
targets = discovery.relabel.containers.output
forward_to = [loki.write.default.receiver]
labels = {
job = "daedalus",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
forward_to = [loki.write.default.receiver]
}
loki.write "default" {
endpoint {

View File

@@ -75,6 +75,21 @@ loki.source.syslog "lobechat_logs" {
forward_to = [loki.write.default.receiver]
}
// Jellyfin Docker syslog
loki.source.syslog "jellyfin_logs" {
listener {
address = "127.0.0.1:{{ jellyfin_syslog_port }}"
protocol = "tcp"
syslog_format = "{{ syslog_format }}"
labels = {
job = "jellyfin",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
forward_to = [loki.write.default.receiver]
}
loki.source.syslog "searxng_logs" {
listener {
address = "127.0.0.1:{{searxng_syslog_port}}"
@@ -175,6 +190,31 @@ prometheus.scrape "gitea" {
bearer_token = "{{gitea_metrics_token}}"
}
// Independent verification that this host's SearXNG instance answers /healthz.
// Argos (on miranda) load-balances across this instance and miranda's own;
// each host's Alloy probes its local SearXNG so failures are attributable.
prometheus.exporter.blackbox "searxng" {
config = "{ modules: { http_2xx: { prober: http, timeout: 5s, http: { valid_status_codes: [200] } } } }"
target {
name = "{{inventory_hostname}}"
address = "http://127.0.0.1:{{searxng_port}}/healthz"
module = "http_2xx"
labels = {
service = "searxng",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
}
prometheus.scrape "searxng_blackbox" {
targets = prometheus.exporter.blackbox.searxng.targets
forward_to = [prometheus.remote_write.default.receiver]
scrape_interval = "30s"
job_name = "searxng_blackbox"
}
// Prometheus remote write endpoint
prometheus.remote_write "default" {
endpoint {

View File

@@ -0,0 +1,57 @@
logging {
level = "{{alloy_log_level}}"
}
loki.source.file "system_logs" {
targets = [
{__path__ = "/var/log/syslog", job = "syslog"},
{__path__ = "/var/log/auth.log", job = "auth"},
]
forward_to = [loki.write.default.receiver]
}
loki.source.journal "systemd_logs" {
forward_to = [loki.write.default.receiver]
labels = {
job = "systemd",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
loki.source.syslog "neo4j_logs" {
listener {
address = "127.0.0.1:{{neo4j_syslog_port}}"
protocol = "tcp"
syslog_format = "{{ syslog_format }}"
labels = {
job = "neo4j",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
forward_to = [loki.write.default.receiver]
}
prometheus.exporter.unix "default" {
include_exporter_metrics = true
disable_collectors = ["mdadm"]
}
prometheus.scrape "default" {
targets = prometheus.exporter.unix.default.targets
forward_to = [prometheus.remote_write.default.receiver]
job_name = "containers"
}
prometheus.remote_write "default" {
endpoint {
url = "{{prometheus_remote_write_url}}"
}
}
loki.write "default" {
endpoint {
url = "{{loki_url}}"
}
}

View File

@@ -27,7 +27,10 @@ services:
tag: "casdoor"
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "-q", "--spider", "http://localhost:{{ casdoor_port }}/api/health"]
# curl is installed in the casbin/casdoor image (see upstream Dockerfile);
# wget is not guaranteed to be present, and BusyBox wget --spider behaves
# inconsistently. Use `curl -f` per ouranos.md standards.
test: ["CMD", "curl", "-f", "http://localhost:{{ casdoor_port }}/api/health"]
interval: 30s
timeout: 10s
retries: 3

View File

@@ -86,6 +86,19 @@
groups: "{{ certbot_group }}"
append: true
# The renewal deploy-hook runs as the certbot user and writes the combined
# PEM into the group-writable /etc/haproxy/certs (mode 0770, owned by the
# haproxy group). certbot must be a member of that group, otherwise the
# hook fails with "Permission denied" and HAProxy serves a stale cert until
# it expires.
- name: Add certbot user to the haproxy group
become: true
ansible.builtin.user:
name: "{{ certbot_user }}"
groups: "{{ haproxy_group }}"
append: true
when: "'haproxy' in services | default([])"
# -------------------------------------------------------------------------
# Directory Structure
# -------------------------------------------------------------------------
@@ -178,14 +191,32 @@
group: "{{ certbot_group }}"
mode: '0750'
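# The Prometheus textfile directory (created by the "Create Prometheus
# textfile directory" task further below) is group-owned by certbot and
# group-writable so cert-metrics.sh (run as the certbot user from the
# renewal hook) can atomically write ssl_cert.prom.
# node-exporter only needs to read these files, which 0775 still allows.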
# The renewal hook reloads HAProxy after installing a new cert, but runs as
# the unprivileged certbot user. Grant exactly `systemctl reload haproxy`
# via sudo — nothing more. visudo validation prevents a malformed drop-in
# from locking out sudo.
- name: Allow certbot to reload HAProxy via sudo
become: true
ansible.builtin.copy:
dest: /etc/sudoers.d/certbot-haproxy-reload
content: "{{ certbot_user }} ALL=(root) NOPASSWD: /usr/bin/systemctl reload haproxy\n"
owner: root
group: root
mode: '0440'
validate: visudo -cf %s
when: "'haproxy' in services | default([])"
- name: Create Prometheus textfile directory
become: true
ansible.builtin.file:
path: "{{ prometheus_node_exporter_text_directory }}"
state: directory
owner: root
group: root
mode: '0755'
group: "{{ certbot_group }}"
mode: '0775'
- name: Template certificate metrics script
become: true

View File

@@ -8,7 +8,7 @@
# 3. Reloads HAProxy via systemd
# 4. Updates certificate metrics for Prometheus
set -euo pipefail
set -uo pipefail
# RENEWED_LINEAGE is set by certbot --deploy-hook or passed explicitly by deploy.yml
CERT_DIR="${RENEWED_LINEAGE:?RENEWED_LINEAGE must be set}"
@@ -16,37 +16,70 @@ CERT_NAME=$(basename "${CERT_DIR}")
HAPROXY_CERT="{{ haproxy_cert_path }}"
HAPROXY_DIR="{{ haproxy_directory }}"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Starting renewal hook for ${CERT_NAME}"
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
fail() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $*" >&2; }
# Always refresh Prometheus cert metrics on exit, even if installation below
# fails. The metrics drive the SSLCertificateExpired/ExpiringSoon alerts, so
# they must reflect reality precisely when the hook is broken — otherwise a
# failed renewal rots silently (which is exactly how the cert expired before).
# A non-zero exit is reported by certbot as a WARNING, surfacing the failure.
hook_status=0
finish() {
{{ certbot_directory }}/hooks/cert-metrics.sh || fail "cert-metrics.sh failed"
if [[ ${hook_status} -ne 0 ]]; then
fail "Renewal hook FAILED for ${CERT_NAME} — HAProxy is serving a STALE certificate"
fi
exit "${hook_status}"
}
trap finish EXIT
log "Starting renewal hook for ${CERT_NAME}"
# Check if certificate files exist
if [[ ! -f "${CERT_DIR}/fullchain.pem" ]] || [[ ! -f "${CERT_DIR}/privkey.pem" ]]; then
echo "ERROR: Certificate files not found in ${CERT_DIR}"
fail "Certificate files not found in ${CERT_DIR}"
hook_status=1
exit 1
fi
# Combine certificate and private key for HAProxy
# HAProxy requires both in a single PEM file
cat "${CERT_DIR}/fullchain.pem" "${CERT_DIR}/privkey.pem" > "${HAPROXY_CERT}.tmp"
# Combine certificate and private key for HAProxy (single PEM), writing to a
# temp file in the same directory and moving atomically so HAProxy never reads
# a partial file. A permission failure here is the documented failure mode.
if ! cat "${CERT_DIR}/fullchain.pem" "${CERT_DIR}/privkey.pem" > "${HAPROXY_CERT}.tmp"; then
fail "Could not write ${HAPROXY_CERT}.tmp — check ownership/permissions of $(dirname "${HAPROXY_CERT}")"
rm -f "${HAPROXY_CERT}.tmp"
hook_status=1
exit 1
fi
# Atomic move to avoid HAProxy reading partial file
mv "${HAPROXY_CERT}.tmp" "${HAPROXY_CERT}"
if ! mv "${HAPROXY_CERT}.tmp" "${HAPROXY_CERT}"; then
fail "Could not move combined PEM into place at ${HAPROXY_CERT}"
rm -f "${HAPROXY_CERT}.tmp"
hook_status=1
exit 1
fi
# Set permissions
chown {{ certbot_user }}:{{ haproxy_group }} "${HAPROXY_CERT}"
chmod 640 "${HAPROXY_CERT}"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Certificate combined and written to ${HAPROXY_CERT}"
log "Certificate combined and written to ${HAPROXY_CERT}"
# Reload HAProxy if running
# Reload HAProxy if running. The hook runs as the unprivileged certbot user,
# so the reload goes through sudo (a scoped sudoers rule grants exactly this
# command). sudo -n fails fast rather than blocking on a password prompt.
if systemctl is-active --quiet haproxy; then
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Reloading HAProxy..."
systemctl reload haproxy
echo "[$(date '+%Y-%m-%d %H:%M:%S')] HAProxy reloaded"
log "Reloading HAProxy..."
if sudo -n systemctl reload haproxy; then
log "HAProxy reloaded"
else
echo "[$(date '+%Y-%m-%d %H:%M:%S')] HAProxy not running, skipping reload"
fail "HAProxy reload failed"
hook_status=1
exit 1
fi
else
log "HAProxy not running, skipping reload"
fi
# Update certificate metrics
{{ certbot_directory }}/hooks/cert-metrics.sh
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Renewal hook completed successfully"
log "Renewal hook completed successfully"

469
ansible/comfyui/README.md Normal file
View File

@@ -0,0 +1,469 @@
<div align="center">
# ComfyUI
**The most powerful and modular AI engine for content creation.**
[![Website][website-shield]][website-url]
[![Dynamic JSON Badge][discord-shield]][discord-url]
[![Twitter][twitter-shield]][twitter-url]
[![Matrix][matrix-shield]][matrix-url]
<br>
[![][github-release-shield]][github-release-link]
[![][github-release-date-shield]][github-release-link]
[![][github-downloads-shield]][github-downloads-link]
[![][github-downloads-latest-shield]][github-downloads-link]
[matrix-shield]: https://img.shields.io/badge/Matrix-000000?style=flat&logo=matrix&logoColor=white
[matrix-url]: https://app.element.io/#/room/%23comfyui_space%3Amatrix.org
[website-shield]: https://img.shields.io/badge/ComfyOrg-4285F4?style=flat
[website-url]: https://www.comfy.org/
<!-- Workaround to display total user from https://github.com/badges/shields/issues/4500#issuecomment-2060079995 -->
[discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
[discord-url]: https://discord.com/invite/comfyorg
[twitter-shield]: https://img.shields.io/twitter/follow/ComfyUI
[twitter-url]: https://x.com/ComfyUI
[github-release-shield]: https://img.shields.io/github/v/release/comfyanonymous/ComfyUI?style=flat&sort=semver
[github-release-link]: https://github.com/comfyanonymous/ComfyUI/releases
[github-release-date-shield]: https://img.shields.io/github/release-date/comfyanonymous/ComfyUI?style=flat
[github-downloads-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/total?style=flat
[github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
[github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases
<img width="1590" height="795" alt="ComfyUI Screenshot" src="https://github.com/user-attachments/assets/36e065e0-bfae-4456-8c7f-8369d5ea48a2" />
<br>
</div>
ComfyUI is the AI creation engine for visual professionals who demand control over every model, every parameter, and every output. Its powerful and modular node graph interface empowers creatives to generate images, videos, 3D models, audio, and more...
- ComfyUI natively supports the latest open-source state of the art models.
- API nodes provide access to the best closed source models such as Nano Banana, Seedance, Hunyuan3D, etc.
- It is available on Windows, Linux, and macOS, locally with our [desktop application](https://www.comfy.org/download), our [portable install](#installing) or on our [cloud](https://www.comfy.org/cloud).
- The most sophisticated workflows can be exposed through a simple UI thanks to App Mode.
- It integrates seamlessly into production pipelines with our API endpoints.
## Get Started
### Local
#### [Desktop Application](https://www.comfy.org/download)
- The easiest way to get started.
- Available on Windows & macOS.
#### [Windows Portable Package](#installing)
- Get the latest commits and completely portable.
- Available on Windows.
#### [Manual Install](#manual-install-windows-linux)
Supports all operating systems and GPU types (NVIDIA, AMD, Intel, Apple Silicon, Ascend).
### Cloud
#### [Comfy Cloud](https://www.comfy.org/cloud)
- Our official paid cloud version for those who can't afford local hardware.
## Examples
See what ComfyUI can do with the [newer template workflows](https://comfy.org/workflows) or old [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
## Features
- Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
- NOTE: There are many more models supported than the list below, if you want to see what is supported see our templates list inside ComfyUI.
- Image Models
- SD1.x, SD2.x ([unCLIP](https://comfyanonymous.github.io/ComfyUI_examples/unclip/))
- [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
- [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/)
- [SD3 and SD3.5](https://comfyanonymous.github.io/ComfyUI_examples/sd3/)
- Pixart Alpha and Sigma
- [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
- [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
- [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
- [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
- [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
- [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
- [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/)
- [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/)
- [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/)
- Ernie Image
- Image Editing Models
- [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
- [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
- [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11)
- [Qwen Image Edit](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/#edit-model)
- Video Models
- [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
- [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
- [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
- [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
- [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
- [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
- [Hunyuan Video 1.5](https://docs.comfy.org/tutorials/video/hunyuan/hunyuan-video-1-5)
- Audio Models
- [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
- [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
- 3D Models
- [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2)
- Asynchronous Queue system
- Many optimizations: Only re-executes the parts of the workflow that change between executions.
- Smart memory management: can automatically run large models on GPUs with as low as 1GB vram with smart offloading.
- Works even if you don't have a GPU with: ```--cpu``` (slow)
- Can load ckpt and safetensors: All in one checkpoints or standalone diffusion models, VAEs and CLIP models.
- Safe loading of ckpt, pt, pth, etc. files.
- Embeddings/Textual inversion
- [Loras (regular, locon and loha)](https://comfyanonymous.github.io/ComfyUI_examples/lora/)
- [Hypernetworks](https://comfyanonymous.github.io/ComfyUI_examples/hypernetworks/)
- Loading full workflows (with seeds) from generated PNG, WebP and FLAC files.
- Saving/Loading workflows as Json files.
- Nodes interface can be used to create complex workflows like one for [Hires fix](https://comfyanonymous.github.io/ComfyUI_examples/2_pass_txt2img/) or much more advanced ones.
- [Area Composition](https://comfyanonymous.github.io/ComfyUI_examples/area_composition/)
- [Inpainting](https://comfyanonymous.github.io/ComfyUI_examples/inpaint/) with both regular and inpainting models.
- [ControlNet and T2I-Adapter](https://comfyanonymous.github.io/ComfyUI_examples/controlnet/)
- [Upscale Models (ESRGAN, ESRGAN variants, SwinIR, Swin2SR, etc...)](https://comfyanonymous.github.io/ComfyUI_examples/upscale_models/)
- [GLIGEN](https://comfyanonymous.github.io/ComfyUI_examples/gligen/)
- [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/)
- [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
- Latent previews with [TAESD](#how-to-show-high-quality-previews)
- Works fully offline: core will never download anything unless you want to.
- Optional API nodes to use paid models from external providers through the online [Comfy API](https://docs.comfy.org/tutorials/api-nodes/overview) disable with: `--disable-api-nodes`
- [Config file](extra_model_paths.yaml.example) to set the search paths for models.
Workflow examples can be found on the [Examples page](https://comfyanonymous.github.io/ComfyUI_examples/)
## Release Process
ComfyUI follows a weekly release cycle targeting Monday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:
1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
- Releases a new major stable version (e.g., v0.7.0) roughly every 2 weeks.
- Starting from v0.4.0 patch versions will be used for fixes backported onto the current stable release.
- Minor versions will be used for releases off the master branch.
- Patch versions may still be used for releases on the master branch in cases where a backport would not make sense.
- Commits outside of the stable release tags may be very unstable and break many custom nodes.
- Serves as the foundation for the desktop release
2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)**
- Builds a new release using the latest stable core version
3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
- Every 2+ weeks frontend updates are merged into the core repository
- Features are frozen for the upcoming core release
- Development continues for the next release cycle
## Shortcuts
| Keybind | Explanation |
|------------------------------------|--------------------------------------------------------------------------------------------------------------------|
| `Ctrl` + `Enter` | Queue up current graph for generation |
| `Ctrl` + `Shift` + `Enter` | Queue up current graph as first for generation |
| `Ctrl` + `Alt` + `Enter` | Cancel current generation |
| `Ctrl` + `Z`/`Ctrl` + `Y` | Undo/Redo |
| `Ctrl` + `S` | Save workflow |
| `Ctrl` + `O` | Load workflow |
| `Ctrl` + `A` | Select all nodes |
| `Alt` + `C` | Collapse/uncollapse selected nodes |
| `Ctrl` + `M` | Mute/unmute selected nodes |
| `Ctrl` + `B` | Bypass selected nodes (acts like the node was removed from the graph and the wires reconnected through) |
| `Delete`/`Backspace` | Delete selected nodes |
| `Ctrl` + `Backspace` | Delete the current graph |
| `Space` | Move the canvas around when held and moving the cursor |
| `Ctrl`/`Shift` + `Click` | Add clicked node to selection |
| `Ctrl` + `C`/`Ctrl` + `V` | Copy and paste selected nodes (without maintaining connections to outputs of unselected nodes) |
| `Ctrl` + `C`/`Ctrl` + `Shift` + `V` | Copy and paste selected nodes (maintaining connections from outputs of unselected nodes to inputs of pasted nodes) |
| `Shift` + `Drag` | Move multiple selected nodes at the same time |
| `Ctrl` + `D` | Load default graph |
| `Alt` + `+` | Canvas Zoom in |
| `Alt` + `-` | Canvas Zoom out |
| `Ctrl` + `Shift` + LMB + Vertical drag | Canvas Zoom in/out |
| `P` | Pin/Unpin selected nodes |
| `Ctrl` + `G` | Group selected nodes |
| `Q` | Toggle visibility of the queue |
| `H` | Toggle visibility of history |
| `R` | Refresh graph |
| `F` | Show/Hide menu |
| `.` | Fit view to selection (Whole graph when nothing is selected) |
| Double-Click LMB | Open node quick search palette |
| `Shift` + Drag | Move multiple wires at once |
| `Ctrl` + `Alt` + LMB | Disconnect all wires from clicked slot |
`Ctrl` can also be replaced with `Cmd` for macOS users.
# Installing
## Windows Portable
There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).
### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)
Simply download, extract with [7-Zip](https://7-zip.org) or with the windows explorer on recent windows versions and run. For smaller models you normally only need to put the checkpoints (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints but many of the larger models have multiple files. Make sure to follow the instructions to know which subfolder to put them in ComfyUI\models\
If you have trouble extracting it, right click the file -> properties -> unblock
The portable above currently comes with python 3.13 and pytorch cuda 13.0. Update your Nvidia drivers if it doesn't start.
#### All Official Portable Downloads:
[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
[Portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
[Portable for Nvidia GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) (supports 20 series and above).
[Portable for Nvidia GPUs with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
#### How do I share models between another UI and ComfyUI?
See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.
## [comfy-cli](https://docs.comfy.org/comfy-cli/getting-started)
You can install and start ComfyUI using comfy-cli:
```bash
pip install comfy-cli
comfy install
```
## Manual Install (Windows, Linux)
Python 3.14 works but some custom nodes may have issues. The free threaded variant works but some dependencies will enable the GIL so it's not fully supported.
Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12
torch 2.4 and above is supported but some features and optimizations might only work on newer versions. We generally recommend using the latest major version of pytorch with the latest cuda version unless it is less than 2 weeks old.
### Instructions:
Git clone this repo.
Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints
Put your VAE in: models/vae
### AMD GPUs (Linux)
AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.2```
This is the command to install the nightly with ROCm 7.2 which might have some performance improvements:
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.2```
### AMD GPUs (Experimental: Windows and Linux), RDNA 3, 3.5 and 4 only.
These have less hardware support than the builds above but they work on windows. You also need to install the pytorch version specific to your hardware.
RDNA 3 (RX 7000 series):
```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/```
RDNA 3.5 (Strix halo/Ryzen AI Max+ 365):
```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx1151/```
RDNA 4 (RX 9000 series):
```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx120X-all/```
### Intel GPUs (Windows and Linux)
Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
1. To install PyTorch xpu, use the following command:
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu```
This is the command to install the Pytorch xpu nightly which might have some performance improvements:
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu```
### NVIDIA
Nvidia users should install stable pytorch using this command:
```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu130```
This is the command to install pytorch nightly instead which might have performance improvements.
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu132```
#### Troubleshooting
If you get the "Torch not compiled with CUDA enabled" error, uninstall torch with:
```pip uninstall torch```
And install it again with the command above.
### Dependencies
Install the dependencies by opening your terminal inside the ComfyUI folder and:
```pip install -r requirements.txt```
After this you should have everything installed and can proceed to running ComfyUI.
### Others:
#### Apple Mac silicon
You can install ComfyUI on Apple Mac silicon (M1 or M2) with any recent macOS version.
1. Install pytorch nightly. For instructions, read the [Accelerated PyTorch training on Mac](https://developer.apple.com/metal/pytorch/) Apple Developer guide (make sure to install the latest pytorch nightly).
1. Follow the [ComfyUI manual installation](#manual-install-windows-linux) instructions for Windows and Linux.
1. Install the ComfyUI [dependencies](#dependencies). If you have another Stable Diffusion UI [you might be able to reuse the dependencies](#i-already-have-another-ui-for-stable-diffusion-installed-do-i-really-have-to-install-all-of-these-dependencies).
1. Launch ComfyUI by running `python main.py`
> **Note**: Remember to add your models, VAE, LoRAs etc. to the corresponding Comfy folders, as discussed in [ComfyUI manual installation](#manual-install-windows-linux).
#### Ascend NPUs
For models compatible with Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method:
1. Begin by installing the recommended or newer kernel version for Linux as specified in the Installation page of torch-npu, if necessary.
2. Proceed with the installation of Ascend Basekit, which includes the driver, firmware, and CANN, following the instructions provided for your specific platform.
3. Next, install the necessary packages for torch-npu by adhering to the platform-specific instructions on the [Installation](https://ascend.github.io/docs/sources/pytorch/install.html#pytorch) page.
4. Finally, adhere to the [ComfyUI manual installation](#manual-install-windows-linux) guide for Linux. Once all components are installed, you can run ComfyUI as described earlier.
#### Cambricon MLUs
For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a step-by-step guide tailored to your platform and installation method:
1. Install the Cambricon CNToolkit by adhering to the platform-specific instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cntoolkit_3.7.2/cntoolkit_install_3.7.2/index.html) page.
2. Next, install PyTorch (torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html) page.
3. Launch ComfyUI by running `python main.py`
#### Iluvatar Corex
For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step guide tailored to your platform and installation method:
1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536) page.
2. Launch ComfyUI by running `python main.py`
## [ComfyUI-Manager](https://github.com/Comfy-Org/ComfyUI-Manager/tree/manager-v4)
**ComfyUI-Manager** is an extension that allows you to easily install, update, and manage custom nodes for ComfyUI.
### Setup
1. Install the manager dependencies:
```bash
pip install -r manager_requirements.txt
```
2. Enable the manager with the `--enable-manager` flag when running ComfyUI:
```bash
python main.py --enable-manager
```
### Command Line Options
| Flag | Description |
|------|-------------|
| `--enable-manager` | Enable ComfyUI-Manager |
| `--enable-manager-legacy-ui` | Use the legacy manager UI instead of the new UI (implies `--enable-manager`) |
| `--disable-manager-ui` | Disable the manager UI and endpoints while keeping background features like security checks and scheduled installation completion (requires `--enable-manager`) |
# Running
```python main.py```
### For AMD cards not officially supported by ROCm
Try running it with this command if you have issues:
For 6700, 6600 and maybe other RDNA2 or older: ```HSA_OVERRIDE_GFX_VERSION=10.3.0 python main.py```
For AMD 7600 and maybe other RDNA3 cards: ```HSA_OVERRIDE_GFX_VERSION=11.0.0 python main.py```
### AMD ROCm Tips
You can enable experimental memory efficient attention on recent pytorch in ComfyUI on some AMD GPUs using this command, it should already be enabled by default on RDNA3. If this improves speed for you on latest pytorch on your GPU please report it so that I can enable it by default.
```TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 python main.py --use-pytorch-cross-attention```
You can also try setting this env variable `PYTORCH_TUNABLEOP_ENABLED=1` which might speed things up at the cost of a very slow initial run.
# Notes
Only parts of the graph that have an output with all the correct inputs will be executed.
Only parts of the graph that change from each execution to the next will be executed, if you submit the same graph twice only the first will be executed. If you change the last part of the graph only the part you changed and the part that depends on it will be executed.
Dragging a generated png on the webpage or loading one will give you the full workflow including seeds that were used to create it.
You can use () to change emphasis of a word or phrase like: (good code:1.2) or (bad code:0.8). The default emphasis for () is 1.1. To use () characters in your actual prompt escape them like \\( or \\).
You can use {day|night}, for wildcard/dynamic prompts. With this syntax "{wild|card|test}" will be randomly replaced by either "wild", "card" or "test" by the frontend every time you queue the prompt. To use {} characters in your actual prompt escape them like: \\{ or \\}.
Dynamic prompts also support C-style comments, like `// comment` or `/* comment */`.
To use textual inversion concepts/embeddings in a text prompt, put them in the models/embeddings directory and use them in the CLIPTextEncode node like this (you can omit the .pt extension):
```embedding:embedding_filename.pt```
## How to show high-quality previews?
Use ```--preview-method auto``` to enable previews.
The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth, taesdxl_decoder.pth, taesd3_decoder.pth and taef1_decoder.pth](https://github.com/madebyollin/taesd/) and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI and launch it with `--preview-method taesd` to enable high-quality previews.
## How to use TLS/SSL?
Generate a self-signed certificate (not appropriate for shared/production use) and key by running the command: `openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -sha256 -days 3650 -nodes -subj "/C=XX/ST=StateName/L=CityName/O=CompanyName/OU=CompanySectionName/CN=CommonNameOrHostname"`
Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app will now be accessible with `https://...` instead of `http://...`.
> Note: Windows users can use [alexisrolland/docker-openssl](https://github.com/alexisrolland/docker-openssl) or one of the [3rd party binary distributions](https://wiki.openssl.org/index.php/Binaries) to run the command example above.
<br/><br/>If you use a container, note that the volume mount `-v` can be a relative path so `... -v ".\:/openssl-certs" ...` would create the key & cert files in the current directory of your command prompt or powershell terminal.
## Support and dev channel
[Discord](https://comfy.org/discord): Try the #help or #feedback channels.
[Matrix space: #comfyui_space:matrix.org](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) (it's like discord but open source).
See also: [https://www.comfy.org/](https://www.comfy.org/)
> _psst — we're hiring!_ Help build ComfyUI: [comfy.org/careers](https://www.comfy.org/careers)
## Frontend Development
As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). The compiled JS files (from TS/Vue) are published to [pypi](https://pypi.org/project/comfyui-frontend-package) and installed as a dependency in ComfyUI.
### Reporting Issues and Requesting Features
For any bugs, issues, or feature requests related to the frontend, please use the [ComfyUI Frontend repository](https://github.com/Comfy-Org/ComfyUI_frontend). This will help us manage and address frontend-specific concerns more efficiently.
### Using the Latest Frontend
The new frontend is now the default for ComfyUI. However, please note:
1. The frontend in the main ComfyUI repository is updated fortnightly.
2. Daily releases are available in the separate frontend repository.
To use the most up-to-date frontend version:
1. For the latest daily release, launch ComfyUI with this command line argument:
```
--front-end-version Comfy-Org/ComfyUI_frontend@latest
```
2. For a specific version, replace `latest` with the desired version number:
```
--front-end-version Comfy-Org/ComfyUI_frontend@1.2.2
```
This approach allows you to easily switch between the stable fortnightly release and the cutting-edge daily updates, or even specific versions for testing purposes.
# QA
### Which GPU should I buy for this?
[See this page for some recommendations](https://github.com/comfyanonymous/ComfyUI/wiki/Which-GPU-should-I-buy-for-ComfyUI)

View File

@@ -4,18 +4,17 @@
# =============================================================================
# MCP Transport Configuration
# =============================================================================
FREECAD_MCP_TRANSPORT=http
FREECAD_MCP_HTTP_PORT={{ freecad_mcp_port }}
FREECAD_TRANSPORT=http
FREECAD_HTTP_PORT={{ freecad_mcp_port }}
# =============================================================================
# FreeCAD Connection Mode
# =============================================================================
FREECAD_MCP_MODE={{ freecad_mcp_mode | default('xmlrpc') }}
FREECAD_MCP_XMLRPC_HOST={{ freecad_mcp_xmlrpc_host | default('localhost') }}
FREECAD_MCP_XMLRPC_PORT={{ freecad_mcp_xmlrpc_port | default('9875') }}
FREECAD_MCP_TIMEOUT_MS={{ freecad_mcp_timeout_ms | default('30000') }}
FREECAD_MODE={{ freecad_mcp_mode | default('xmlrpc') }}
FREECAD_XMLRPC_PORT={{ freecad_mcp_xmlrpc_port | default('9875') }}
FREECAD_TIMEOUT_MS={{ freecad_mcp_timeout_ms | default('30000') }}
# =============================================================================
# Logging
# =============================================================================
FREECAD_MCP_LOG_LEVEL={{ freecad_mcp_log_level | default('INFO') }}
FREECAD_LOG_LEVEL={{ freecad_mcp_log_level | default('INFO') }}

View File

@@ -1,51 +1,104 @@
# FreeCAD Robust MCP Server — Ansible Deployment
Deploys the [FreeCAD Robust MCP Server](https://pypi.org/project/freecad-robust-mcp/)
to Caliban as a systemd service with HTTP transport, ready for MCP Switchboard
consumption.
to Caliban as **two** systemd services:
- **`freecad-mcp.service`** — the MCP server (HTTP/streamable-http transport on
`:22061`), pip-installed into a venv under `/srv/freecad-mcp`, run as the
hardened `harper` service user.
- **`freecad-mcp-bridge.service`** — FreeCAD itself running in **GUI** mode on
the XRDP desktop (display `:10`), exposing the XML-RPC bridge on
`localhost:9875`. Run as `robert` (the `principal_user`, who owns the X
session), from source staged as a tarball.
The MCP server connects to the bridge over `localhost:9875`; the bridge in turn
drives FreeCAD. The two halves rendezvous only on that local port.
## Architecture
```
┌─────────────────────────────────────────────────┐
┌──────────────────────────────────────────────────────────
│ caliban.incus │
│ │
│ ┌──────────────────────┐ │
│ │ freecad-mcp.service │ │
│ │ (streamable-http) │◄─── :22032 ──────────┤◄── MCP Switchboard
│ │ venv + PyPI package │ │ (oberon.incus)
│ └─────────────────────┘ │
│ │ xmlrpc :9875 │
│ │ (streamable-http) │◄─── :22061 ────────────────────┤◄── MCP Client
│ │ venv + PyPI package │ (user: harper, hardened) │
│ └─────────────────────┘
│ xmlrpc localhost:9875
│ ▼ │
│ ┌──────────────────────
│ │ FreeCAD (future)
│ │ XML-RPC server │
└──────────────────────┘
└─────────────────────────────────────────────────┘
│ ┌──────────────────────────────┐
│ │ freecad-mcp-bridge.service
│ │ /usr/bin/freecad (GUI) DISPLAY=:10 (XRDP)
│ startup_bridge.py user: robert
│ │ XML-RPC :9875 / socket :9876│ │
│ └──────────────────────────────┘ │
└──────────────────────────────────────────────────────────┘
```
## Two services, two users (by design)
| Service | User | Transport / port | Hardened | Needs X |
| ---------------------------- | -------- | ----------------------- | -------- | ------- |
| `freecad-mcp.service` | `harper` | HTTP `:22061` | yes | no |
| `freecad-mcp-bridge.service` | `robert` | XML-RPC `:9875` (+ 9876) | no | yes (`:10`) |
The bridge runs as `robert` because it attaches to the standard XRDP display
`:10`, owned by `robert` with Xauthority `/home/robert/.Xauthority`. It cannot
be hardened like the server unit — it needs the user's X session and home.
## How the bridge starts (no `just`/`mise`/`uv` needed)
The bridge runs **inside FreeCAD's own Python interpreter** via
`/usr/bin/freecad <startup_bridge.py>`. The README "Option B"
(`just freecad::run-gui`) in the upstream repo is only a launcher wrapper that
locates FreeCAD and runs that same script — `just`, `mise`, and `uv` are not
required for the bridge.
The bridge scripts are **not** shipped in the pip wheel (it packages only
`src/freecad_mcp`). They live in the git repo under
`freecad/RobustMCPBridge/freecad_mcp_bridge/`, so the bridge is delivered
separately as a staged tarball (see Deployment below).
> **GUI vs headless:** We run GUI mode to keep the GUI-only tools (screenshots,
> object color, visibility, camera). `freecadcmd <blocking_bridge.py>` would run
> headless without those tools — not used here.
> **Python version:** FreeCAD 1.0.0 on Caliban uses the system Python (3.13),
> not a bundled 3.11. The upstream ABI-match warning applies only to *embedded*
> mode (importing `FreeCAD` into an external interpreter). We run scripts inside
> FreeCAD and the bridge is pure stdlib, so the version mismatch is a non-issue.
## Lazy connect: a green server healthcheck is not "FreeCAD reachable"
`freecad-mcp.service` starts and answers the MCP `initialize` handshake **without**
the bridge running — the XML-RPC connection to FreeCAD is only attempted on the
first CAD tool call. So the server playbook's `initialize` check proves
"transport up", **not** "FreeCAD reachable". The bridge playbook's validation
(below) is what proves the full chain.
## Prerequisites
- Caliban host in Ansible inventory (already exists in Ouranos)
- Python 3.11+ on Caliban (already present)
- Caliban host in the `freecad_mcp` inventory group (already configured).
- `python3` + `python3-venv` on Caliban (installed by the playbook).
- `freecad` package on Caliban (installed by the playbook).
- The XRDP display `:10` running, owned by `robert` (the standard Ouranos RDP
desktop — not configured here, it is always present).
## Deployment
### 1. Copy playbook files to Ouranos
Copy the contents of this directory into your Ouranos repo:
## Files in this role
```
ansible/freecad_mcp/
├── deploy.yml
├── .env.j2
── freecad-mcp.service.j2
├── deploy.yml # Two plays: MCP server + GUI bridge
├── stage.yml # Clones the fork + builds the bridge tarball
── .env.j2 # MCP server env (FREECAD_* vars)
├── freecad-mcp.service.j2 # MCP server unit (harper, hardened)
└── freecad-mcp-bridge.service.j2 # FreeCAD GUI bridge unit (robert, :10)
```
### 2. Add inventory group
## Inventory
Add to `ansible/inventory/hosts`:
`ansible/inventory/hosts` (already present):
```yaml
freecad_mcp:
@@ -53,70 +106,101 @@ freecad_mcp:
caliban.incus:
```
### 3. Add host variables
Add to `ansible/inventory/host_vars/caliban.incus.yml`:
Host vars in `ansible/inventory/host_vars/caliban.incus.yml`:
```yaml
# FreeCAD Robust MCP Server
freecad_mcp_user: harper
freecad_mcp_group: harper
freecad_mcp_directory: /srv/freecad-mcp
freecad_mcp_port: 22032
freecad_mcp_version: "0.5.0"
freecad_mcp_port: 22061
freecad_mcp_xmlrpc_port: 9875
freecad_mcp_socket_port: 9876
# FreeCAD MCP Bridge (GUI, runs as principal_user on the XRDP display)
freecad_mcp_bridge_directory: "/home/{{ principal_user }}/freecad-mcp-bridge"
freecad_mcp_bridge_display: ":10"
```
Update `services` list:
Group vars in `ansible/inventory/group_vars/all/vars.yml`:
```yaml
services:
- alloy
- caliban
- docker
- freecad_mcp
- kernos
freecad_mcp_version: 0.6.1 # PyPI version pin (server install)
freecad_mcp_git_ref: "main" # fork ref for BOTH the pip install and the staged bridge tarball
```
### 4. Run the playbook
## Deployment
The bridge source is delivered via the staging pattern: cloned on the Ansible
controller, packed with `git archive`, and unpacked on the host (no deploy keys
on Caliban). Stage first, then deploy:
```bash
cd ~/git/ouranos/ansible
source ~/env/ouranos/bin/activate
# 1. Build the bridge tarball on the controller (~/rel/freecad_mcp_bridge_<ref>.tar)
ansible-playbook freecad_mcp/stage.yml
# 2. Deploy the MCP server (idempotent) + the GUI bridge
ansible-playbook freecad_mcp/deploy.yml
```
`stage.yml` clones/pulls the fork into `~/gh/freecad-addon-robust-mcp-server` at
`freecad_mcp_git_ref` and `git archive`s it to
`~/rel/freecad_mcp_bridge_<ref>.tar`. `deploy.yml` unpacks that into
`~robert/freecad-mcp-bridge` and points the bridge unit at
`freecad/RobustMCPBridge/freecad_mcp_bridge/startup_bridge.py`.
## Upgrading
To upgrade to a new PyPI version, update `freecad_mcp_version` in host_vars
and re-run the playbook. The pip install task will detect the version change
and the handler will restart the service.
- **MCP server:** bump `freecad_mcp_version` (PyPI) and/or `freecad_mcp_git_ref`
in group vars, re-run `deploy.yml`. The pip task detects the change and the
handler restarts `freecad-mcp`.
- **Bridge:** re-run `stage.yml` (rebuilds the tarball from the latest fork
ref), then `deploy.yml`. The `unarchive` change notifies the
`restart freecad-mcp-bridge` handler.
## Validation
The playbook automatically validates the deployment by:
The playbooks validate automatically:
1. Waiting for the HTTP port to become available
2. Sending an MCP `initialize` JSON-RPC request to `/mcp`
3. Verifying a 200 response
- **Server play:** waits for `:22061`, sends an MCP `initialize` request to
`/mcp`, expects HTTP 200 (transport-level only — see lazy-connect note above).
- **Bridge play:** waits for `:9875`, then calls the bridge's XML-RPC `execute`
with `_result_ = bool(FreeCAD.GuiUp)` and asserts the result is `True` —
proving FreeCAD is up **in GUI mode**, end to end.
You can also manually test:
Manual checks:
```bash
curl -X POST http://caliban.incus:22032/mcp \
# Transport up (no FreeCAD needed):
curl -X POST http://caliban.incus:22061/mcp \
-H "Content-Type: application/json" \
-d '{"jsonrpc":"2.0","method":"initialize","id":1,"params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"curl","version":"1.0.0"}}}'
# Bridge listening + in GUI mode:
ss -ltnp | grep 9875
python3 -c 'import xmlrpc.client as x; print(x.ServerProxy("http://localhost:9875", allow_none=True).execute("_result_ = bool(FreeCAD.GuiUp)"))'
```
## Service Management
```bash
# On Caliban
# MCP server
sudo systemctl status freecad-mcp
sudo systemctl restart freecad-mcp
sudo journalctl -u freecad-mcp -f
# FreeCAD GUI bridge
sudo systemctl status freecad-mcp-bridge
sudo systemctl restart freecad-mcp-bridge
sudo journalctl -u freecad-mcp-bridge -f
```
## Security
The systemd service runs with hardened settings:
The **MCP server** unit (`freecad-mcp.service`, user `harper`) is hardened:
| Setting | Value | Rationale |
|---------|-------|-----------|
@@ -126,5 +210,15 @@ The systemd service runs with hardened settings:
| `PrivateTmp` | `true` | Isolated /tmp namespace |
| `ReadWritePaths` | `/srv/freecad-mcp` | Only app directory is writable |
This is significantly more hardened than the Kernos service (which needs
broad filesystem access for shell commands).
The **bridge** unit (`freecad-mcp-bridge.service`, user `robert`) is **not**
hardened: FreeCAD GUI needs the user's X session, `.Xauthority`, and FreeCAD
config in the home directory. It binds XML-RPC/socket on `localhost` only.
## Known limitation
The bridge depends on the XRDP `:10` session (owned by `robert`). `Restart=on-failure`
recovers crashes, but **not** loss of the X display — if that session restarts,
restart `freecad-mcp-bridge` afterward. Auto-tying the two is a possible
follow-up.

View File

@@ -216,3 +216,102 @@
ansible.builtin.systemd:
name: freecad-mcp
state: restarted
# =============================================================================
# FreeCAD MCP Bridge (GUI)
#
# Runs FreeCAD on the XRDP desktop as principal_user and exposes the XML-RPC
# bridge on localhost:9875, which is what the MCP server connects to.
#
# Flow: install packages -> unpack the staged tarball -> install the systemd
# unit -> start the service -> prove over XML-RPC that FreeCAD is in GUI mode.
# =============================================================================
- name: Deploy FreeCAD MCP Bridge (GUI)
  hosts: freecad_mcp

  tasks:
    # --- Packages ------------------------------------------------------------
    - name: Ensure FreeCAD is installed
      ansible.builtin.apt:
        name:
          - freecad
          - tar
        state: present
        update_cache: true
      become: true

    # --- Bridge source -------------------------------------------------------
    # Directory and extracted files are created as principal_user, the same
    # account the service unit runs under.
    - name: Create FreeCAD MCP bridge directory
      ansible.builtin.file:
        path: "{{ freecad_mcp_bridge_directory }}"
        state: directory
        mode: '0755'
      become: true
      become_user: "{{ principal_user }}"

    # The tarball name is keyed on freecad_mcp_git_ref; a change in the
    # extracted content triggers a bridge restart through the handler.
    - name: Transfer and extract FreeCAD MCP bridge release
      ansible.builtin.unarchive:
        src: "~/rel/freecad_mcp_bridge_{{ freecad_mcp_git_ref }}.tar"
        dest: "{{ freecad_mcp_bridge_directory }}"
      become: true
      become_user: "{{ principal_user }}"
      notify:
        - restart freecad-mcp-bridge

    # --- systemd unit --------------------------------------------------------
    - name: Template FreeCAD MCP bridge systemd service
      ansible.builtin.template:
        src: freecad-mcp-bridge.service.j2
        dest: /etc/systemd/system/freecad-mcp-bridge.service
        owner: root
        group: root
        mode: '644'
      become: true
      notify:
        - reload systemd
        - restart freecad-mcp-bridge

    - name: Enable and start freecad-mcp-bridge service
      ansible.builtin.systemd:
        name: freecad-mcp-bridge
        state: started
        enabled: true
        daemon_reload: true
      become: true

    # --- Validation ----------------------------------------------------------
    # Pending handlers run now, so any notified restart happens before the
    # checks below look at the service.
    - name: Flush handlers to restart bridge before validation
      ansible.builtin.meta: flush_handlers

    - name: Wait for FreeCAD XML-RPC bridge to listen
      ansible.builtin.wait_for:
        host: localhost
        port: "{{ freecad_mcp_xmlrpc_port | default(9875) }}"
        delay: 5
        timeout: 60

    # The inline script exits non-zero (sys.exit with a message) unless the
    # bridge answers with success and a result of True; the task is retried
    # until the script returns 0. It is read-only, hence changed_when: false.
    - name: Verify bridge is in GUI mode (FreeCAD.GuiUp via XML-RPC execute)
      ansible.builtin.command:
        argv:
          - python3
          - -c
          - |
            import sys, xmlrpc.client
            proxy = xmlrpc.client.ServerProxy(
                "http://localhost:{{ freecad_mcp_xmlrpc_port | default(9875) }}", allow_none=True)
            resp = proxy.execute("_result_ = bool(FreeCAD.GuiUp)")
            if not (resp.get("success") and resp.get("result") is True):
                sys.exit("Bridge reachable but not in GUI mode: %r" % resp)
            print("FreeCAD bridge GUI mode confirmed")
      register: bridge_gui_check
      changed_when: false
      until: bridge_gui_check.rc == 0
      retries: 5
      delay: 5

    - name: Display bridge info
      ansible.builtin.debug:
        msg: >-
          FreeCAD MCP Bridge running in GUI mode on {{ inventory_hostname }},
          XML-RPC localhost:{{ freecad_mcp_xmlrpc_port | default(9875) }}

  # Listed in this order so a daemon-reload precedes the restart when both
  # handlers have been notified.
  handlers:
    - name: reload systemd
      ansible.builtin.systemd:
        daemon_reload: true
      become: true

    - name: restart freecad-mcp-bridge
      ansible.builtin.systemd:
        name: freecad-mcp-bridge
        state: restarted
      become: true

View File

@@ -0,0 +1,21 @@
# systemd unit template for the FreeCAD GUI bridge.
# Starts the FreeCAD GUI binary with startup_bridge.py from the unpacked
# bridge directory, as the principal user, against a fixed X display.

[Unit]
Description=FreeCAD MCP XML-RPC Bridge (GUI)
After=network.target

[Service]
Type=simple
User={{ principal_user }}
WorkingDirectory={{ freecad_mcp_bridge_directory }}

# Bridge ports passed through the environment (9875 / 9876 when the
# inventory does not set them).
Environment=FREECAD_XMLRPC_PORT={{ freecad_mcp_xmlrpc_port | default('9875') }}
Environment=FREECAD_SOCKET_PORT={{ freecad_mcp_socket_port | default('9876') }}

# X11 display and authority file the GUI process uses.
Environment=DISPLAY={{ freecad_mcp_bridge_display }}
Environment=XAUTHORITY=/home/{{ principal_user }}/.Xauthority

# Everything the process prints goes to the journal under one identifier.
SyslogIdentifier=freecad-mcp-bridge
StandardOutput=journal
StandardError=journal

# Restart only after a failure, waiting 10 seconds between attempts.
RestartSec=10
Restart=on-failure

ExecStart=/usr/bin/freecad {{ freecad_mcp_bridge_directory }}/freecad/RobustMCPBridge/freecad_mcp_bridge/startup_bridge.py

[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,46 @@
---
# Staging play, run on the Ansible controller (hosts: localhost).
#
# Clones or updates the fork at freecad_mcp_git_ref and packs that ref with
# `git archive` into <rel_dir>/freecad_mcp_bridge_<ref>.tar.
#
# rel_dir, github_dir and freecad_mcp_git_ref are not defined in this play;
# they must come from inventory or group vars.
- name: Stage FreeCAD MCP bridge release tarball
  hosts: localhost
  gather_facts: false

  vars:
    freecad_mcp_archive: "{{ rel_dir }}/freecad_mcp_bridge_{{ freecad_mcp_git_ref }}.tar"
    freecad_mcp_repo_url: "git@github.com:heluca/freecad-addon-robust-mcp-server.git"
    freecad_mcp_repo_dir: "{{ github_dir }}/freecad-addon-robust-mcp-server"

  tasks:
    - name: Ensure release directory exists
      ansible.builtin.file:
        path: "{{ rel_dir }}"
        state: directory
        mode: '0755'

    - name: Ensure github directory exists
      ansible.builtin.file:
        path: "{{ github_dir }}"
        state: directory
        mode: '0755'

    # The git module clones when the checkout is missing and otherwise fetches
    # from origin and moves the working tree to `version`. That works for a
    # branch, a tag, or a commit.
    - name: Clone freecad-addon-robust-mcp-server repository if not present
      ansible.builtin.git:
        repo: "{{ freecad_mcp_repo_url }}"
        dest: "{{ freecad_mcp_repo_dir }}"
        version: "{{ freecad_mcp_git_ref }}"
        accept_hostkey: true
      register: freecad_mcp_clone

    # Refreshes remote-tracking refs only; the working tree is not touched,
    # so this is never reported as a change.
    - name: Fetch all remote branches and tags
      ansible.builtin.command:
        argv:
          - git
          - fetch
          - --all
        chdir: "{{ freecad_mcp_repo_dir }}"
      when: freecad_mcp_clone is not changed
      changed_when: false

    # No `git pull` here on purpose: the git task above has already brought
    # the checkout to the requested ref, and `git pull` aborts on the detached
    # HEAD that results when freecad_mcp_git_ref is a tag or a commit.

    # The tarball is rewritten on every run, so the task always reports
    # "changed". argv avoids shell-style quoting of the path and the ref.
    - name: Create FreeCAD MCP bridge archive for specified release
      ansible.builtin.command:
        argv:
          - git
          - archive
          - -o
          - "{{ freecad_mcp_archive }}"
          - "{{ freecad_mcp_git_ref }}"
        chdir: "{{ freecad_mcp_repo_dir }}"
      changed_when: true

View File

@@ -18,6 +18,7 @@
- git-lfs
- curl
- memcached
- acl
state: present
update_cache: true
@@ -187,8 +188,8 @@
--config {{ gitea_config_file }}
--name "{{ gitea_oauth_name }}"
--provider openidConnect
--key "{{ gitea_oauth2_client_id }}"
--secret "{{ gitea_oauth2_client_secret }}"
--key "{{ gitea_oauth_client_id }}"
--secret "{{ gitea_oauth_client_secret }}"
--auto-discover-url "https://id.ouranos.helu.ca/.well-known/openid-configuration"
--scopes "{{ gitea_oauth_scopes }}"
--skip-local-2fa

View File

@@ -74,10 +74,14 @@
state: directory
mode: '0755'
# Mode 0770: the certbot renewal deploy-hook (running as the certbot user,
# a member of the haproxy group) must be able to create the temporary PEM
# file here. With 0750 the hook fails with "Permission denied" and HAProxy
# keeps serving a stale cert until it expires.
- name: Ensure /etc/haproxy/certs directory exists
ansible.builtin.file:
path: /etc/haproxy/certs
owner: "{{ haproxy_user | default('haproxy') }}"
group: "{{ haproxy_group | default('haproxy') }}"
state: directory
mode: '0750'
mode: '0770'

View File

@@ -9,6 +9,7 @@ global
log /dev/log local0
log /dev/log local1 notice
stats timeout 30s
maxconn 4096
# Ubuntu systemd service handles user/group and daemonization
# Default SSL material locations
@@ -30,16 +31,24 @@ defaults
timeout connect 5s
timeout client 50s
timeout server 50s
# Slowloris protection: cap time to receive the full request/keep-alive idle
timeout http-request 10s
timeout http-keep-alive 10s
# Stats page with Prometheus metrics
listen stats
bind *:{{ haproxy_stats_port }}
mode http
# Restrict to the Ouranos LAN + localhost (Alloy scrapes via localhost).
# Belt-and-suspenders alongside host-level firewalling.
acl from_internal src 10.10.0.0/16 127.0.0.0/8
http-request deny unless from_internal
stats enable
stats uri /metrics
stats refresh 15s
stats show-legends
stats show-node
stats hide-version
# Prometheus metrics endpoint
http-request use-service prometheus-exporter if { path /metrics }
@@ -88,6 +97,19 @@ frontend https_frontend
# Deny if auth endpoint rate exceeded
http-request deny deny_status 429 if host_id is_auth_endpoint { sc_http_req_rate(1,st_casdoor_auth) gt 20 }
# -------------------------------------------------------------------------
# Internal observability + probe endpoints
# -------------------------------------------------------------------------
# These must never be served through the public proxy. Real scrapes/probes
# reach app hosts directly on the LAN; anything arriving here is external.
# Defense-in-depth — app nginx also enforces this via a real-IP allowlist.
# 404 (not 403) so the edge doesn't advertise the path exists. Exact paths
# + trailing-slash forms only; never path_beg /mcp, which would break the
# real MCP endpoint. App-host-agnostic by design.
acl is_internal_obs path /metrics /nginx_status /mcp/live /mcp/ready /mcp/health
acl is_internal_obs path_beg /nginx_status/ /mcp/live/ /mcp/ready/ /mcp/health/
http-request deny deny_status 404 if is_internal_obs !{ src 10.10.0.0/16 }
{% for backend in haproxy_backends %}
{% if backend.subdomain %}
# ACL for {{ backend.subdomain }}.{{ haproxy_domain }} (matches with or without port)

View File

@@ -34,27 +34,43 @@ spelunker_rel: main
mcp_switchboard_rel: main
kernos_rel: main
rommie_rel: main
kottos_rel: main
# PyPI release version (no 'v' prefix) - https://pypi.org/project/open-webui/
freecad_mcp_version: 0.6.1
openwebui_rel: 0.8.3
pulseaudio_module_xrdp_rel: devel
searxng_oauth2_proxy_version: 7.6.0
# Git ref (branch, tag, or commit) - https://github.com/heluca/freecad-addon-robust-mcp-server
# Used for both the pip-installed MCP server and the staged GUI bridge tarball.
freecad_mcp_git_ref: "main"
# Docker image versions (third-party)
# Centralized for vulnerability tracking and controlled upgrades
casdoor_image_version: "3.0.1"
flower_image_version: latest
grafana_mcp_image_version: latest
gitea_mcp_image_version: latest
neo4j_version: latest
neo4j_mcp_image_version: latest
memcached_image_version: "1.6-trixie"
nginx_image_version: "1.27-bookworm"
nginx_exporter_image_version: "1.4"
oauth2_proxy_image_version: "v7.6.0"
rabbitmq_image_version: "3-management-alpine"
searxng_image_version: "latest"
# MCP URLs
argos_mcp_url: http://miranda.incus:25534/mcp
argos_mcp_url: http://miranda.incus:20861/mcp
angelia_mcp_url: https://ouranos.helu.ca/mcp/
angelia_mcp_auth: "{{ vault_angelia_mcp_auth }}"
caliban_mcp_url: http://caliban.incus:22021/mcp
gitea_mcp_url: http://miranda.incus:25535/mcp
gitea_mcp_url: http://miranda.incus:22062/mcp
gitea_mcp_access_token: "{{ vault_gitea_mcp_access_token }}"
github_personal_access_token: "{{ vault_github_personal_access_token }}"
grafana_mcp_url: http://miranda.incus:25533/mcp
grafana_mcp_url: http://miranda.incus:22063/mcp
huggingface_mcp_token: "{{ vault_huggingface_mcp_token }}"
neo4j_mcp_url: http://circe.helu.ca:22034/mcp
nike_mcp_url: http://puck.incus:22031/mcp
korax_mcp_url: http://korax.helu.ca:22021/mcp
neo4j_mcp_url: http://miranda.incus:22064/mcp
nike_mcp_url: http://puck.incus:20661/mcp
rommie_mcp_url: https://rommie.ouranos.helu.ca/mcp
freecad_mcp_url: https://freecad-mcp.ouranos.helu.ca/mcp

View File

@@ -1,454 +1,493 @@
$ANSIBLE_VAULT;1.1;AES256
30626436343431303066393835396333626537363230663331633338646536613164626561616565
3762376636383731363162313763383431613430393239330a643235336539386637343236323830
63663966623766653135636562383634623330643639313538313836396261623637363130646233
3432383136336432640a343537356239616137316539303838386562623931326462653732363461
30623738653035356439636639613932373461363238333032353430343334393538383938333434
37353632653430646636303535326561313162316666346136353532643166653232383666326631
38316465326338643231336262363832363734303261313161626533666539646630303663306135
65393536623764323466636130363931373964396163356534326234316534633737366534633264
31333030623831353031363536346662343461373766613164616164366666393333383031616466
65633066633132616235353663313738333366316531613230313733316232613031393736323765
31333261656663326432636561306432346232653264326130386566636131396364326461376331
35383130346238303233303331616435616533613034393064613266643365333932663631393164
30663831613463303961366164373139643739323364346533383232303032646333346138313163
33356264643061656337376362623130636536333733663134353435306238343436323336613665
63373661326634626164613337336231666431323464653734623431633933646263333831323135
35633661363136666439613834613863356564383961356464323733366536623730376364616632
32616263626136366634643239303065623131666331393938646237353438353938616365326637
61383938613065346538613332306366376131393134643464646530366162656330663733636361
35633134346430386532363164346664643235393032326265383239613436386564633536616535
39663764653162323061623937633839633438366366376564306566616436303364363239353665
30363337336637383532343438313665326134666232623731303736633262613939326533396139
61303164303362643831333034616338626566323664343163636436633632663337373630393461
66333535386463316665623565663761366262323231363739353365653364356332373738376366
30663830343864626633613731646336393032313932333636636335666437386661613130636336
64353434626366356530396233363634323038666664656264373438356266643934333236353531
32663539646432343639356237396236616231333866386364383430373536613061316464633064
31613965653533643562643665353534616161613661393231373735343562323932333666666431
38323338313331363261396533656465613536376563323862306361386363623731616465343466
35633038316139633665333639303731386562663135373031313135316130646637363238353064
37336332653162333738626164386133366337613866333439323561663839666161346138613262
34346566356534646565663765363034343561653233633661653137626335343763363032616332
65396639653263663735396638306639636466383166316539646263336631386530643438323861
34356534626533393935393933333732396561366139353264303333613533376639646164353731
35663061656436643464393337666162666462666231613762356636386230396632343237323064
31353039663064383963613738633561343539666363303132356635333035643530333534333533
34326661616438666538663036636535633534366134386437313935323963643966323533633836
65383730613336623664643732356134333935383230313533656331623234353362316231636630
31373761353030363430613738383839346337366262653334346533363762303537313662353865
62343335666431326439376631643564626632333364663432363836303962336666393030326662
64353662636662353565643630656431633134643035313162666565653964396638323463343638
31346665363932313434373264633866333339643634616262336235656238663034613734333138
34323038633836613737326132326238633364623237383839383636303130353561333435356465
36333638373730353035336636333930363833323435303662646634396632303761386438333565
30393861343965656638366531333630663865303964373338373634663433353535326539363735
33326234323361313238636163656534396462653131323964343730613039326538373964643138
36653837366235636565336538386661653435653862376635646135306634306361646239313034
32666662373239303237393366336431623030653165633164326164343438613035313238313061
62313131323738616465343234613462376165373266313165383830656230386565356532303362
39663639313266323961636230646236646435613265623737316166376163646631346437663039
33306232303566663262306630653964613262323739316333363636353137326339623236393037
33313163633462623063383239333133643733626364316335323761363639663763653234643366
37323431356637653862616639643666663964323838316466333165326631616365353062393036
66663336336139343435333533666430626232353234636234326464633137323937303865323830
63643465666362343834313163303761353631316636646165393562323936366162613838646563
30303335643335383461653134353632663562313461636434366336323461626161623464366663
31623062303039666538663838656261613262373131313538653865386461633939646331396662
61343537366534663338356265313937343939336464666264643662383339353131646634636234
33376532343464623032323735373061313436643038313136666334633932373962353737633733
31303331356466313238396631646466393930396134663730653162336366666331303238663264
33323335363232396163386665633735366634613166643139376461353139616637633537633731
37326665356161353936393830343631663830623066376630383134353236313837623834346636
33366465333235333366333739666162623136663463666634633438613162363061383430353435
62306661663237323231313864373335336632333836396331663737326161366264373833326339
39303365343330646265363833393961643331373338613131653966393061376331346263333637
31376533643037626264386166336438323432336538373138306230336432366238346365373831
35636136363861623337626431393363323933316463613833653334326634663036326561376366
36393337356266323633386166353462396261656532303931326363383830323761313933633131
30353030316238666264656433336566313933393332353736643065356238666430386631653539
66656430373830333564333062636435656634663932383264653030336133303034303335643231
34373936366165313432653834663334396362343735383439653065616235373766326566346365
61306230663361343936373636353637326437376530353762643531343439326539386564336235
65306565626164656633663736643164336537353438613366646663626533366163313938343434
63396138336662323633346133663064353061336231616232623039363765383337383663636637
35636262646232633632356366636464316230633430303931343036616165613136343637373533
30323266336233636561303062653939313061616564383137363335663263313964306364656639
39613031656636653566313233666266373537316630376437393831346533396165646334623162
30653132363333613666316633363561333434633063343534666465323463376631663363333735
33613035363732663931383062663039663963663363396661373735643531373434313766666536
62656532656332636637326339383866663561633930616631613766386662636531653037376363
38356638346630353665313035626532356337346331333232323536333731363061386134356164
38616662396262663661646437653830343234656566626265303236373231343464366133303262
34633336376664643762366233643033323937336662303435323031626263323933616139346362
65623036386631396534623364303531343362343438303062386265656563306538396132623133
34336535373435353866353931336664346337666533663930326439656136393735613633336464
37613063366337323662346536393863616535383732353032623966386138643461346566613966
35333361663163396333653535636365663735316461623966363732383538386263663139303861
61353631653238383862313836343865386433666531396564623133316634643661343264373462
32303036393731653131633962313935373166636464386635623732623966346566616631613862
62656537326432643830653536376538316435333331623839353063393363373366656561373635
31356239633534323364333030343531666563613934333030633633653433643336323636663565
61346236333462643161383137356661623334396661383963396232376464626535353963313734
30333261646365303037303966356663323136623533616134363463633464343236343339656261
38636533326237383064323761373132646463396632343434653335636237613435616132323333
66373963616633356437306263653066353232323035313537633162346664333964363262366666
66613538323238306131396463366338306631633634393963656439373336333833646165656537
66396363366531303263303031386561373130353262323834666163363938653733353233626132
63623166663430393634363230313931313734646137393732316564346336313036393030346530
31386566353431396435353361623764383339613131376430313863623234626164386436653966
33613030643036373830353732306136616663393265353930663362363764396534633236386638
30376165363934313761646534663033346133373063336536346139653834313731646539623063
65363461386436376563313831393532313661613264383664663134373737323436643432653766
61613132666536393536346563653064633933643361323737313039656537363530636565373234
64623664633365643931363636306166343930613239643165306337646366613033353339376237
39386636346130366532306136366466343765303664663637643961613362643532356661326666
63663966653436643664316561373963343839356237366430663632663038386239643533646630
38343664373266336261303233646535653364383632386131613837383733303431313631646237
32613365613234646534343362353633343061326133323135383138613033303233396634363031
30363533366261373961313563396536333130373132663138306134343362363331313861303831
65353531396435313133336464663637646438613766353962343563396164376535616334366261
66616635373565363164636436376261343035383530303465306465663831323037663638623034
64396438643931643934643266623165393464373965396431376264313466643465363437316431
65363137653137333863643031313465653764393865333539386135663162616264313864303336
39373361313237613837623139656531313937303366643163336434316235313765353030353536
61353233353834343831393237633964636662623639356331303433323331656331306466303563
64633535343531306234353866363737376264346631336366353238373434613437613930303663
61623965653565303265316233636434393565633230316634626639346436343633393034383830
35303531303239353230306635363333396433356131396338656262393466653434343664383433
65656437366633313463326331663539663334616436663236643331313332313830346533613762
34646135393231626162396330643163323035626536636536613636613465373766656162636331
66363065336232373239326166366531393366363334376564653834343863303766393165373633
35353039326636346361626161653836303961623862383639356438663563326231303961396630
31353739666264633437616639333336306363623638386135343235653166336366373539623436
66353530363464383937353634336633636338393633303139313632363031346634363563633261
34323863393637373539666138386161643536393432353663353038626262643364313535666136
33356236363061636366303361383162343735636633643134356137303730316366623133376132
66646663376637643730306264663630623530376439326531636338613134613161303662313163
37376331616631646537393532393065636631646565383633373637653235306631326239363463
66383238656237313632353336343333333535386133623437623361356166626635333463623162
66646162623131306630623430633035323339323061616431343565343563663563386166656661
35326365653162626335313266653165666265353239393061633439626365623864306136363430
31373133643564613066356636333565393632323434393165633461376666663834373034336364
64633137656466666535373333386230393533326634363461643633353938653236343763646363
34643431646338343063643966386264373537376538633830626434363663613138633861383035
66636232346132383435393338323262353866376633353431396664613039386464316363353466
61323135383064646234353961363336326339666630326331396533333235313864316465386166
62383735633737303834396332323166653364386332666438623563343361373231396263326533
37326464616362353565356436333638633334343061306666373066333365366237316662633363
33363136366436303161303330323062666364306530303737363364366336323364646535343761
39643338336331386334313433626535366264623265346334373235313061653139313266396336
63363362363232356235326632663732396337343835376266623063643966373930326665306630
63646163326531653565303339326638383831316333636663653165633865373963623363313033
33653534616232363632353531393162373039393732373436343031623836346236323838323539
38663130323565326338396133313835303931616639363733343036616164616434663061323035
39316564303163316135613465336234376230336239393936646563303238353638376137653864
32353135633261346665643337366139303633323163323166356538363937643666623663343836
65653631646363356161336463366461643864613634333538366532393531323734366530616230
37353035393862653562383532626634316265646566396563343532616662326262613466326563
39623433383834306235373432653739346565316563316530383035326330356534326464306363
30316364393031333130316433326533633764643931363565333864303035336161336538326161
33613364343766316330656130316532333731303631646236616637386638393961643339353439
38306362633263393562623134653430393131653933363265343562636531613638636135323963
31613763353966393033353333316437373439316435373332623133336532376334653633346561
37643138376337623563396135383264373832333131653764353664616163656339316364396437
34316134623738323830623036383032643030613933343031393734376564613935356335323064
34343934373434613436663261616234616230323832323264383363613037343030636137643261
31643231313031376663613230663864633530393931613633376534663134613534353731363033
64316564366438313834303839633362646564396661316162323566323234336638356139636530
36303063653664366665316161643434633036313339313162613537656533646335393835323963
33306232353438643434356637363832393632393434656464333239373464613239363165663836
30366635633961353862353662646564653932636232653562663962336266316433633736306632
61383232333631643135653036316136336136396532666339363765373636653030336532306236
30366662636461396130356535393232633832616366353433663565623730343432626137663236
33636262633334356236346339306263383436386261326632356331323731323932336663663961
61323832656237616263333963636636656264373666313632303737383035343139623431303065
65623436626363646336653632323261613834633936656439316631383762383161303032386362
64346532303332633239633832633062396233636333666430323665313563663764306562313832
33333931373830636462613532623032346336363864313664306335616661326431613865316136
64353966373133323161663361316462623133323332363637636461356162306537616432323962
30666461653032643636353963616531306362323465343362623464306664353665356536663833
31316463613564613333663263633239663263326338373033626365653339373639313963386538
62353733343438383461616534383232326238396639336432356635376261343636613366653235
37373836336137653366656431646434363435653462383264333432376661653831326236313365
36366433616636643462656435376339383739323136623962643666353336656439643937313066
66383235636236393238383939326465313462633633653535626664643862376361346436333734
36396463316665383763626132636666386138316438323833313861326565366266353930336132
62313234346233386334663364343034356461653861333237353339653930303735353236353062
61393232366566666463613031653861373962666432326465366464363536333136396433383336
33636436353237656261633635613161636133643761616433613639636231633733646335373231
37666437363465346262663132363063633465653635386537303435386465626234613633343736
31303466643933373766313836313633356262393030303961316534333537656662353431333436
61353032656264373634626630383232396230613165653035323261623034366463633033316162
64613337303734336365363931396234623463333336373862656531653066643064616333363131
30636165643064393362663331386531353936363263623366313362633030383064643663316634
36666563613232653933383966373837633433343763366637646634363862396233326130353539
66343332326363336461353431303737353464316533613831626439643961393138343637323562
66353535333230663930306663613034656336663662393530323730666339346635366533363339
64656433616664336634323038333837366432323830336466363335663639623436313263613933
65663932363934333430333362373539373066643237383433346337323761656133343235333261
62316562666237653165666563396132366263363766346432353631653862643435643465366538
62616463663539623337333866623035343430653964393565616364313266376662356334333063
36333434303433323338303037633066393339306466396662663134373164613037353530316339
37356163616461303136663837333836666534313863366232386261353661653738366239633036
31326336626262666332623136326137393231326231636234623532656534653864323132613866
34346161666238653136343364323231393861643230313765323238336238623534666438306638
33363462646633356135343731333939666162613166663264333631343433643237643435666166
37636564633463356638303932653030313065643336643732373739646534616333363563323334
37326439663737356238393036656535303830616332373239333036316234343637303966303166
33666464353564353035656535666165616363663032663134633832313035393765306662633633
62616361666236383435376661633033333834353061396233343061386633653562623363663763
65623165666434383339643063313131353730366462663331346239333830663236373565666432
64383037653061616631336138623239623139623639353635346134356234316336353631333034
64366238333035353739373038626466313331616264623031333936323333633537656134396364
33363536396661363437323837306639653733636633663037373565313533303563303930656130
34386530626262663931373631333234626363323364386135343532316561316539626361323831
33393263623038383230666263343233613063303534363930636234346461343338303965323962
66613833333835336331646134303538343039333133333861666261323639636136396138303234
61393738623463616231303037393962313737343239303939366163626237313039366232343230
33396337626438373561653337393838653036643431336333323266616434383736316435633439
33633236393262653431343132363364353664353330633733313561666233383066643630643036
36323739616337316133383635376439316133303764663464616333353037333334343037383063
35343265343132356236613062323761316139613839646261666639313339643233383134306534
38313036613834336536333930356633663063613735376664376139643131383533306332626563
65613763663338393062663533636138313432386166356163383934663930373530383530306133
31313937623434386339373830663139303238326565306432623864626433623734633132616634
38666638303938616135376661363866343533376238316138376233326133626563613435633339
64626666326339386663393562376663616434353036346230623733393136333064323930633733
63303466303036353338356362623935316261316137363765343632383534396635613961346633
37656263383464376338646334343835353930356463393731373861396333646463613838643835
63643363383231373030313435323637353865313964643537613036316338656633313338616362
36333339653533643032316537396430336633643763346264303832633934363739653634663164
64383463343535383364333965363963623537376264656439356438373964333635656132623039
30353631373164306333363331336234323831626466656539646365303330616565663862663239
38316534656538653735313232653036313763316136363031636561366431623733346634663963
38313661303130353638303232346461376261666337326434626436613331646531336236616535
66383234333965353233306633613361386639303430313632336132343939316562393135386432
34356464333465616461303564343363396237313862343237333563633461393561626162326337
39303636653361336361383535343931653863393462656130616463343638376335616538316632
64646364343530633738386566653362623361336637393435396430383934326232613033313539
38626139333263336333646338353333396335386366646662633634326133366664613535326465
65613961303264333364363838363632633030386461346563353263636437643734346262623032
34316131303732616161396537316435313165616333633762646434623139366661366264396438
62646162646363316232316436633665393836643264366231663962333538313839316131303638
63663835303865636561326363396365383632636636376337333137623065353761613532646264
64303439333565656535633834333830666434616233653338396262306162653435633964363866
36326265376434303461333232306233383635393039656530633538313234643137303261633937
36363466376335353136376130323531373963303866376638303236666234613332613836663837
33613330303361323661373837346231383162383665633139643733656134623738383364373135
39313631346534323135623932346638343262316230316564336266303963663932353462386632
64393437303636313338373763343735613264316630393931646664633736383738333164366339
65323364623631393038636234306537303436613862383935663830316239343830383339306437
34393062633461636233356435646564376165313833366663306466656431336339626162313865
38623534393635336134663466356238323635356431623630636564666564303232623764633635
65366330636438316231376435326236333033356561326564393961643964316361373035313236
35343934363464326439646435633861663236376466316263643130306235636561393332666566
37383932313435393435643035656533383761336331373466323938373561326131346335343835
33373833636136343064383564613437306464353636313930326335306165636431623131343761
36303138616633313062613834363235343937666133323134633039333634323531383833376462
66363039383439383630353066323861353962633339623338623139616539343033663863313662
31666161373663626263303364333631613161383265363764613433346561353939613235623338
61643730396331326338666266353031626165383132383364396464306663316532363538653535
32633735666231366565303962626662646439346261313032363936653433343132613166343938
37393136366136633833313332626464323962393839393161313364633563666165656236376466
66666466316662323734663139613266656132376434386665323666333338343433366338663633
34663631623837303136636333666132616264356665643132613737613132623131346263346438
38313666366538343261663837363538333439306538373731353537616639363833346465613466
63633839323535636634363865356662353636383539643833333462663466633665363963623564
62636138323031363436356439373433633132326161636266643536303364653561313734623566
38396266303535636164326439383063336132303365376434623137323936333764643333333639
38656163326566373466306366306537326132633035633066643836366535373762333764663937
33343839393261343534373631336136613362333063613166656236353237373930623339653739
63646437643462646538353831306633623565633562323163383234633139363463366433336661
66653932353462353235303431343431636166613861393665613235623130623235323761643330
38333462633038386361326532383635313066643936353438383732316532626236653366653838
65616261383866323163356163393535306332633438646136643166346134626535353964316162
38646534623138636135323262373630333432373134346539343565393266336239326337333964
63386265663365383764306234366537373030363735383665363662653235313765613466323663
35636137363164643632353663373335653561373835363361386261303364356361636264333935
63393863646331346137623766643436666262646465333136363431336261626535353733353138
61303134613230313361393832393866376661333366316663306131636465623132623261646466
63313230646664316134323635376564653265336135303731366232613062653137303262323031
34353463646265313237333531653437393161373434343461383764306362616231373662393062
38626261393364313166373938316661666361363937393630303339653539373062353539666636
36323961326135386264646436363336333334376132623839633532663366636136333561363436
66363636363434343864386330326134633861633935343934316537663531343062643231376232
63643830346365623132373730333564353637386632323131366462343936336433613764373031
36646363643131656563313838353731613833306531343331346464653437303538346263326530
66653962666638363137336164613231386434373539303261326233346264656361363137313461
35393861633562623331376631646361663032653633666333363433333835653364333737326364
37653733633936326166333739383738653933376561663738363364626338616562316230643634
66643166366337383830333032303661636566326361396239643330643538656461366230653237
36653538393033303838326230623436373835386538633665646139393332316239316535373634
33666232363632396430336662343639306339656466303066633637313264643566373162343231
32626166316631613031666139666139323436636466383565396561343131616166313462316137
35346239303262373766653032396138633931306532386131326535366266656534393361396238
63396335633837626535333563646566363665303464643465396237636131346335613565666332
33646333383466396136313935383037663138333563313738313436613165336432646563646334
63343530303239356633636266653734376230353862376335373362633938333238663364636135
66633134386233653034633637303331363962346163643936643435643861396233653037393363
62306561623436326166356562613631336262643263386531303666336339666466623632383261
61636665336232306635663337373336353462303934313635633463653231353430363434326536
35393835313236343064633739376232343833313936653735306335323064666663663763303065
66633338393434663061333032383938646236376435623737373863636435353266376239363239
61653064613737616634643330393438303661346637336265373736303463613266373463313830
38663361613335383535323436316664326132333264623464653031663363343530393935383633
65656565643363303330376433616532613466623965353862323038663262303530616434623763
35363235373565343461393634613066666636393866313232386666643935323764343066643439
63376434396232363935653031666663356361303031366262646131363231306231643966313334
64386230653136663139313436396435666332396431326638383337326664396438316434633939
31366666626266303334303763313361653438353265363661383833323061333766353865343366
63626166353466336535313433346339363739373736373737333763333831343965356533303565
31636533663866653136663762393033653335316432666137656665346564333930393935643532
33343631653031656138306262636463613338366332343037353361386530323136376531656462
65393365356264383030623637323438633837323134643935303338303631353834333961363233
34643265653334636463643539363937326230316633653232653064623439383662346538633463
37366635373464376537313236643163623863396638636139633933333436343836623937363139
32636436626531316263623131353332353766323835313132626132643834356566366336623862
39653964343631633235356436393036343737323831646435376135396366376635346132343736
62653330653339646263633331393831623037333334353061623539366636373832366565343330
66613232373637646561383461313261313435653531356436633030323632663230353537313565
64346632333532613636646164346162353230363363623238346463333166373165653239326666
33376530303839666161323131633362336235653237336339653630643839386661373230656333
66363039326437663036363335653432613139653661653961376531363963363135336532343630
62306239373061356236353462383932383230616466333566646162363561626536393437336339
39616130343333323431356163636639363166663363643138373062383730643736303530353066
37396166326333666664653130346131353735616436633237373766323134366633383761323332
36363461643931383338323062656239373435666363306663313766633863353563656464396466
36333933363463653962346434663436383763373533333732626465363762663365396166386366
30646665616234343432336436343330363332646435623339643665373364303565343939323838
61623030376437333135643361336364313063373964356139313538323330626130306237393862
61643064333531613865326634626161376139396462353561343637313137383938643430613234
38316265393061366166653135336632316265623136313462353439333436306130386366303765
34353036363766333763386662636364383933373930363937336632623164646466306432636137
61666534373230383332373737386561633332376135323765303139323131653230656162616535
33393636383236363565656436336130656137323065383663383262383933353432636338353561
34356363316630313861383766653362346432613866386362393634333931303437636262643663
39616633333966306162343835343732646361383935633532326334373963346362303862306561
64396162343934303633633636313861376334383736326232333030333238323033646636653038
38336537393134326239346463373361303530633230346533623139333338323138633039646363
31633439353331633836633463313733383539633830386330363630396166313164326164636438
63396632356639313432613133623131306234623531333163333430653062356565323564653565
34653131653135393334393431626138653264643062376163343132343436613766626164613363
33336561303836383361363531393166636539313466323463643566386230653931316232616461
61613134326465396436333834346239333238633434346335613938323735623563373662633366
34336264316265623634646362383337323433373762623332366234643035646666366136303837
30616133386334666533366434323064343863636333353965613839333330656536396331343366
65346462313935336564353639316435393535393236616461383638363735383235326332303833
63623432316536313431633964383461373063303834373333626636613561306433623530373835
61303533396332386434663766396163363263316537393836326630633062373837613238633234
64623266313661613564373663373735636463346462316636653630356665326465633031656439
64656234663363623236323264366536613533323261616461353232613738356539323536623737
61653631663135636631373130313433386263643437623165646664623930333333333435396137
33353032376234343362336436373831636261643439313962623233643865316431373539383633
32356366323135613330616634393439366337353462373537616566336139623239343765326333
65343239376462656635343166333435373763333264643433303836373163303930363763333263
34363833366231353636323464333964643139333830376434303330326561646237663530633566
31636536316162353438633336626237303933326331316439663931323661633362333233386237
65656131323061323833626437356535323035316565326638373362303235373161376361616137
38326331326533376564323232313133313830623166626632366463653334346163363362653764
64623838383530623835613065656365356464623166393939393466656663613962393861303866
31376134343439353534633633636436383664326538656337336130383864376463623063666233
65366165613734336233653066616232396664353136306437373831663731646433636233653636
61393133306432613430363136613235323235353161326230636533393566616435353436643633
34643133393063616337343030663338616632336665346165643065313863353430373730313837
39343865616230633135663963623862366237346439313635323930373834626430323037613230
64313433656631346664396338636630386438393465393431323734663838626137363563346435
63303862613430616233306662336237626433633966613234646463653635363635653961366361
30363537343331353932346337633466316637616361383639303163343733633565613832663933
37383662383330343438663763633438313537346537326466343533386532323632386164303638
34326263353136326366373031613666396339323436643432633736316634346636376563376461
30313331666136646566636130396633613462353538343139313932646333663430326664343033
31323830343630383238616338343131323365666463366536323935633733313232333861643539
36393436386534666563393032656232616162633135306666636535666630643039316531383236
66393737373863623930613561363761623862303539343331383362663536313162646638366461
66626261306164623361646439336565383765383538633538316363636136633636613535623062
66636361343033396163326439623964306261333463346463376662303537616264653361373339
32323462316265346334656333323337373434353633656563313939303439323965633435366136
63626337303133666131383732303734353037633861356364303734393235656364303931373064
35316533383863626239333365613062346663353336323431323730643939353563336537613431
32623938386532353163313639306364373330343236303563316364653663633066306162333765
31363566356137633665626232636364396531616565616163376265643637363761613439366161
36316239303833656239363766316134363064326333613963396661363066653530653230653863
66323963353366626533333037636263376638373162353232623362646136333530653065376537
30343861613538343534633235333961353461663636323436663930346633633330383435616436
36303838623264306561346561376331633038636238393539333732633936303136356532303166
35356337663634303538633966343837326536343063336535626338626339636431383165316466
61613063633839303736303235633963326537663731326266653030386136383432373839363534
65383439356137376431653234643933636265643166656661613837346134666434313961313338
37333564613364333337333866653863633736393966646164306461616661383732303834663633
31386436363434643537326562646234353263613536303335316638383339356334306238646235
65303062633061356638393937316639363065316636373637343965656234343038633133326434
63656261353535383133303537663036333535656334363230313930663063663663386430313131
35613532326431353132386135306233326233383665386238636137636466343866613436373564
66366432643865366138656434333435323932653435346136373735636361633230613064656364
33633533383963373263323033326565343638653730386635396236386131633465353262653863
34623662386138663133353966663235666436323264366333383362373662303336626130376230
66336234383430326537373131393133373766336437333237333461363063346136343635363236
37666463386362653666333032333839636562613437343938366164653538666164376162353030
36366564313338336439386661626237393333376465383437323864393535353039323661363336
31383337633731323335303633363066373936326437623530303364613630373736333539393966
36386663373630663837303930633262666662386435396632313235313733333163363737316162
34646332363230616239643562306335383631623462666637633138383834363939643736326664
39346135343734663964373533383161613861386565303731646534313364653236343838353165
31356563643135653663316337653866643362623862653562313864316661613838613637323361
34613832326364323437336131373230626662326230376364346334356264373662623563636133
36313737363931623861643331393932376339646631343966363364303237616136633139323333
33313562623934316334646630363735316266623663633531373436636661633465656333623863
65613463646366316235366231373364373366313066363339343838616631373561653665643939
65353466623562373365343832353432366539356337613064303031383733623164343936333931
35313632313431633265313539653962343162343336393166353637333531646562323732653165
33636634353431393332366533343237643532333130643635646237383061326535393039386336
31393237613230626133666131646631386436346339633238616163393866333666646435613564
32383439346338633139303032353265376338636135663137363666393336326263646439643937
31346363306462333962663232336632626635323461386465393866346637633839653065396337
38346639343031386438373734656630376364623132343738666365366532616464303964333264
30633039323366613439373034626234623738313565333133343031356139303734313631386339
37353336626166393762346137323361393261626231653565623833646330313031393166336137
61663162363236313131373765303139343962323162383764326233333461316636313035326538
65393432333935316636666163636138373566386639373161353634333764663365396133646432
35313331616535346461393939656130323666656465646539633933656264363236656237383738
36353963356266663932396363633263666162363934383034656166353732313934386233313836
66636164343631643032646461363033383434646166343136613136346366346565373037633931
35636366323634353566323263393033343162346363633339663336376161626231386166623732
38626537323632656332366362323332306464666337313236663230343662386437393135363237
36313262613539333165633238643664333565373766623939646564306635346539386133643132
66393434323964326263646164353936666564646666663330646332633032343363353231383338
64653463633231366230616338303936393164653263366430333263313662393139353337643664
31373632623234343835323666393735336231306262323338623163343665393166626639336262
32356334383439623963343961383131396238343066656330353863663736363662653863646237
61393538373739353563353035643864373735643264373764353534643034623334373462346665
34306638643264326434616561363461646661656638396232343036366438393264623138396263
30653934373964663131613538646635643163386462663937396632346634333732326239656365
39393833343166386263363130303338653362343566316535666336623965653131383733656337
31313032646265613136343938313666376462303163353932316538373830333731313531313261
37636362366432343239306165643530313939653435633135373562393762393365303135366564
61333066336631353239373864653765303565643364663638323966306362323366336265316261
65366666376461373437613632643738373036303138656537666163653538383436346561636463
34663830623064306336343434623264353161613262633637623365643836626531363939633334
34386239636533386262353733343065333534363761623134643465333638643232646539633230
61313331626263643864336465303437303931366136616331643635653739376165646532383835
34313939643362383262393739316631333165383332356137353063303232653064613938613632
62353763346639393431646336363533633566373834633332323535346666666465373262623235
39303565653066613266646366653438363138623934303162613462633064636530316532313735
34656530373634343131393635643366643234326531373462303038383533663032623139383830
63346261346161636133386431353737386365366362663536386264363263363263353530646233
35616161303431336136393434393135356230666635303636313561626236303739386230663835
38616531303539656232363062306232383061316466393032386666316162366137643866613036
36633834313839616539313231623834663539356664363434663031376230383131623732623633
35373432396232656664666337336533613330313633396134636135343037316565343738663033
31626537376463663764313434653832643535333739626534373634613065646430366438353761
62303436363536653938383966626639666563306565353662346437656137343632393634343061
34626661633830343238633439353231333237613039303236326661343032383931386237316662
66373262613965343363646634666134323362326335326237626261393636313037383566323835
33383866653033643564623237353264376164356336663831343431343361383963346666363535
37363930386431636666366339666365373763663038623430303364336634623536303939666666
37393964663739393865303737653238386635636632633564653565306136623166653330636631
32326464346333373561653866333739313436666436353739616261313861343461333038373931
30666364373766363963376464626636633064336333353133323733643231663335613066373563
66663165643235656130363930326232623736323033353931616433663337306237633061313361
31653239663934383732343032383466313863313834646530376335336135346164613430633361
64373938306538373230343565656430613463366534383862646132303139616462306635366464
37363666656566376366646362626661333333353863643032643932383531383339653237323162
38313361373365313138653435653134323533663637393265303434366262323232386261383932
6663
61303461373234626338303164373438363631653037303239393666636437633832303066626461
3233396130396437656562373763646165393231363464660a326364396463343861373236393733
62363134376266383866383933643966633332636562623536636536653563393263383066626337
6635643065643761360a343730636366623364633861653734343132363866323338343031613030
37306532306437656463326538623066343435623163643133383638396432623065376439366232
30313065626530356562336239373562313630613561653435323333623035653366323734663637
34626630353062323131643837353839323735393031643337313333396162623062653566646363
30666137613934626630323838353066616432343238653935646332376531396134333931306464
35353331663964373735623661643238623033353131356630376363353131623930366562313361
61636633393266373230636435613736333732323462353031646439316639396432393232613236
65613963623461373437326263626161323266373166363230653165613637656630663065303132
33366362373639343230373836633231656233343539393332336264643430346636366537643836
63343933353261363430333233623930326663313465393034356530393237636264626537303430
30323965636161653931643235636161396239643766613561636131343237343337366137326238
63393336306230353766386232396264393336636639666661303962626362636266303262663036
38393530636438313236633566313361393136346630376133396137316664636336326633383437
31653131663834663036313366643237376364316236313066316338663038343530616236396566
37643563366638393164396666616434313236376364383439343464366537386138363064666431
35623831653536313261373462376364306233346632626233376365323536313762663464373037
33613038303430313538313735353232353131653862383362613234323166323936613166323266
35393064623530316661353431613733643061393435383637653732656561613138653337353737
61623035646138313162336332613139316134613935353262653635336634383962633066653938
32346565633834646465393135393935353766616530366139303635623863633932666134366664
37326331383638376636313931393233636132336536306331396461616262663335333264363030
31626463346262616561616266313235346461623737353465636334623861393066373162396163
30303036653964313739373963636566383364646465386164636534363938633437636463663839
39396264623439613862346339636661643538343832326162353032313638633262626331623231
36346333353635333332376564353862313539333762663664666538633963383864623234396262
39343630313363656530656436663561623533343862373438356632323936303333666365653664
33313834343630326338306339643666383533366534616638646665663930626636653031343362
64613134393032306230353636353434356266343464653661386366333466393834313031616232
33663835393934623163303164626463393237363139303064636636663363356138383939393065
35363532616565633338383835306137666665376362346235323366653265333637633034663761
34383337363161353037356334313838386663303334393736306234353137633933353634333334
63323937666231333163373231663436366132383536303433623364656131323662373932313234
39633163666462313334326433346463323639363433656564623436366134653437386330313663
66626538326431393737663565656666393039366162623562623438366134646465346135366135
37373264333034653032346135383236366537353466346464393439613866613232323537643762
63353562366261346162323435323136666661643366326162306636386330623032656463623639
35383263343865393437376438383964396363613831666238623737376132633438346337363733
62656266373235356334383264633732633139646333623363633534393435643237663661666533
33646236343135323561396137363762303036353962326537653834363965373135303338353232
39613437326330333639396366656238623835306638393930666665366666666662366465633139
34653566376339313037313034326238363436303064313134633833343565333733316564343937
65393132313832633465393738653433303064353632613861373836343630653738343730623738
65373466306264613832326336366635323136346661386331353837623431373634643730623966
34396564366662306232616136636136323834326165636439623463633165363366326666306466
36616466313136346561616361383239653831323931356264366630646138663236333761306162
37323530643138306138656665306363383139643935623439646130633938343165643865343230
62623238336635656462663832333665643537333139346134313632646365633535643630653162
33616536653037396632396537326463316164306634653961343333353164306230323839336162
62336439633366616638376331386131373535333364326365373535623331633432336264646430
62613765343837393735326433626430666634333336366538313265613935303332646366633166
32633733626666373336636366656639636165343835386661636665313532666362363666383734
63363463303034386134376366656139376564323262373066656365386138613630393861373933
37623936643966313264323337613362633363613435303464366436343365623363336234653562
33643535393637313332653534633939303431356666326337666539376130613032316236633162
33353830626439623161353832366432643265323734373835323663643831626234653930623737
64663566356337326439353461376136373131366330653133666463653737353761653938653465
32633764663239323161323639643861623530626633313832373762353630333532313534653832
37313230623330643338353462633163346138323766356638373132316666323530396566323532
66383336313238626335363762313839333861346130383137363266626632653839363231636563
34303132343733663131383730393062396264626662623262353335663732633438373431346636
64353531636338323662653430343861653931396364616236653838646237643934306433373962
62303663383662643666363236383330643266336235316634346131343030646234633531653735
39646561373662343939363538393639313839643061393538346234363735653562626232656336
31633637633035306337303136383464373034313332656563343061316231333463626134396130
63313162303064613433663765333737626463353334643836313938333065303835326235616262
64633630346236306631336338623938346266646362383263366264653432393735616335376561
38383061663335356264383438643937336633613965376161663330366562643130643462323464
31343235313139313365316262643830323063343763633330356430666663346233643836386363
65646236663036326331356333373835636435383730623038346333343035613930363431653461
62323038313962643231336135316133373431353334623266656236636364353565363766656534
62373632313232336338633630626235323339373231326431626130356530363235333334363734
39623331396438653262343464383336383835383433656130666465306430653230616430313936
32616234653161323637356330616536636361623539363964633163636366666136323831313534
62656133383962316133646463393035353461643631666537386634303432343937643338636633
37333135363063613563663839373162323765303230653636303764393535346632393965663163
62343233346235666564306531323565323734633662663966353635646164383734303830323564
31353666356434356334646237373465336531306264393264646232323161656162326337303064
30666539623365633965386330386534383131663764663565336636343434383666666633323730
62653631393166656339313334666533373031383439393465356536333865626161623162616565
66323631313963323539343465336436313264363062353133306566313464336236643737306638
34653230363937363431653930336430613938373133373463613866613963333831353037643038
38323937363534623661353835653261303831376330323239303762613434376436323738613037
31353539613961613238393335616431643036653561383135323732393736626435653535323937
38373861303865323165363966666364393162346262663465353934616432303239656565623831
38626337323061303861616132323738363564613366366437356633373638373737393662636536
35646637386162323030366161666562653932336665393332373739663533323965363563336337
31663437363430343335656166393562326530616362333562373231366531373238373439633332
62316437336264343164643866333737626130333336353031353763376364313866353135656665
37326435343238326366376262653638346235666163633661666132323232303536376361303639
63656164653537333362346237383533336539663462363866323435306332636661663534363537
64623932373233326134383234613961316435626237363732383363383838353337353338353962
38626632646430373339383263643964303766356336623238356465643632306334343962643864
38333362663261616362366362663833623663376134373263643866333261623063393563356436
32653439636139353139636238333437306665313930346230366562306637356539383137373363
30623839386336366261393066373037323062323330613562633939643931353230376665623264
31353133363335323166646437616439323164383066386265366235653963326362656437343363
66663964333261396563336135356530306266313132633164643936343565666231343636376637
36303765366130363836373036633565313462663633663338656563623337306562393433373965
66306666393132393963313861393565623831343539376265653134373138613162386664356131
64306663383564353762313161323034366339326662303633353235393064383439343433306261
64373938636634643166313735346662323035633363323532646161383536616639393361393061
37336137623030616366393038663439396261646164306130663462386131666632306363356465
31353138306533613239653937646561363161633435303138343034646434316364343935393430
32633665393432643662373339626266633334316332353562363864633965323262393864633565
62393337376530356566633931353936303863393730396332396464353566633761336135363739
38653435363232666263643639326339613764393336633639353962613539636165376462616637
35653563303166626432323362643364373966346163386635653634333638663030336163633732
33616136393562386565326365333732363165366233363037646636373464336437303338616262
64333536383739633038666135626630643863663933666565323937343064376165303535303537
31623664373761353231336331343432333064656431393563623438633238303532303836656661
31336366393932333062306136346430653138626463656362646265333438376262366234383330
61303366313461653337323761613336643363306531333163313361616466363765636364366434
39323063363231343233333461333966643166643330303332353138636631363836323135646230
38346630616562613163396262343064386436353961633635353033383232326239616436356566
36666537363566333666333831323738366130386162323339633639343430313832336265376262
32303263343330616531376630643964653965386363393336313436633235616332373537316437
39336536656433646464633732643365613336313666616264373364393666663632373863366630
65363964313366656330316361383036386564363933346362316532656630333537393266343539
30633561363332653766333330363437643030636532373532623635326636353434653166393866
39343238613731353362323732343063633332656635333565373264653561663166353836643538
62353336646439313465363261383931393038366561643665643239633938336436326637336436
64646234366336323133646363303230326264633039316335366137656464326634323266633438
66396666366634623466383864653265623137613763343266646337343438633262336235376533
37376239306632356131353139336430303336653530303836316433336133666539333462343630
31353834363533343632383864323961626630313033613864323037373430373632383239613862
61313462663765393739323362383035326436626434333530396165653535373961323865326432
34353238656333633133616263373861303138643163343264646665613039626636323233303261
34626138333564633563666262343164343531626431363031626465343965313236353137663036
37356262333939346534303333316434306162666336343531356562383662366130336438393838
35626263346363396662373162396665376164353034656463393462346662326133613966636534
37313431623362396634336535336464383238633266643337663939313132333262313130373761
38623036326332663635396638326233303236366136643334356535353136303161333531356432
39366139356236346565363464313436643165386230646230343130303531633732663433353364
30363233646637326637663730653134383532336261376633633133316361353035626132363032
33636432633433303439663435366636323166343363343736323230363339373132333433666330
38656338316264383065623638643436353734646337633832326130383265326136346534613263
30656234343639323165323331633834613333333032333134363763633464333461643031623261
61393637336632653061373038333566313839633332613631313566633239636135326263633539
39636266666662626435393636343162333365653137376561396364373932393631363365623834
63306661373331656432663666356639326666363730306662316336366139353135336264343539
34393564373432626437363664316238333738626536653831333765623839313133626365646635
62373664613439643165656566363638326234363834323830373566343138386662626431303036
33663963373032356162326237343764396538666638326238386235626566343530336166366362
65363538363266623166343537643434306637623737373266653637326532616333363864653766
36333738653662623863333735663330663135613061386338663063303563323638363731313161
37326536633131393534393563346537323733373163353566373934316136613339623938346165
34383331633834636563383364633833313834626466326565396161643730666131376131636465
63363035386535653336373030666636333535393837323237633435366565653138373662303037
36363863336163626231353861343831333437643133306531633638346635363438336165383133
35626664303030303864366238656665306535623861633330653838313533303332353265643837
66616635343131356338303838323765326431326439393333336361303031373266656137623136
33633062366462633464303634663532313332333039323064323664366561323263663062336330
61323666373631336564346164313831366438363433383464323331353337336635643239636435
65666333356434393132343666653533623535656562353663343363386562616234626233396635
32613865646363373236613936643335653031316431303237633536613264663939636532353733
36333530313363386363313366343239663439643666376431623766666266386434313931323338
65386438376663313138633534633839666362393165663830393431303764316336343962616434
36346236356131623661626166316237643737353561626164643338656564313638633161396565
61633335633032376131336532626532383130313336653232356666616235623230313337306339
32383632376232643839383735336439393865393238333439313665623162383134383839393431
61643936373434333532366466663934303964643039626163613966326463323832313736663431
33353130643138393038373966663433306533326432306138653733346336643236663831346130
35623763323166393231653434383434373662613762306231643835393836323933323336383661
66636237653432326362313239393333613733343266336365653631326666383334643833663666
36383031643764316430653532316332633931663132646234376139646565646230613833386363
64396139663830643864386137643139313564666564636135333534653735316461623366356633
38386461653565623237346631323066323535646661393865613162333537393864303061313038
30613936373737306231623630613362333832663336333561633836393665343139306465343135
37343433393135393366663837343663653439366565663335653262343135626461323136373535
38366262336138393338616236393263356131333030613039373366373961373338663938386431
35306263613462613435646631343637663266333331396262313566613962316235386335353736
33333233373030393032386237316430623330353866613038373934393337343762393537383931
61643965333234363233333938313432396332663662316464366230313865633139613637646336
32366332643235393565396639373534646635613036653265373664393165336437393139303365
31643064656265333333626133383336663437366535653736616435663461303735653366356634
61663637353337396530343438633164633331663866363837326434353466626638386131356237
62313063303937396661626465613732316236626336353961376338363663653365656361346261
33613338613834346534366434623331643364646161646633366434303831356663303831333439
37636461613763613933613939633335643430323837656533306662303032346130353934353631
63343062323563363664313631396564633830626563336535383039326562666539383830393935
31663136313337343830663933343265366633356235626564656532663936353335383733653765
35323463646536326664623939363035313466376666386135646666366261306537646564646330
32663262333064383236393335663630386665346164346432396138396231653637326431383030
36316437313139366639323664376630666465396562393634356664353431356330656161306338
33303137353662313762353634343862393731353936646136336233366232336532346537666364
31336337313536386531653534393639306164316537656639626336633436323634396539306633
35663239636630313630396333343737663637663934313666636466323765616331306561366663
37373465643330346365303838383238383436316638646466363030376139656266656263623664
35636364356639336464636266353830336333623235636664616164383636646236646264306238
66643930303165626532626462313461373865666562663764396132363964343661376339653563
39303334303964666330343635663238386536316431316332663666623437646661316138353066
34376564366431313064306266623166623830333137343163613261326236373361613735653533
66643465376162303436323562613364313663663436313363313561306231366366333064653339
37613134343730386461613562306631393863666339303638653537663263333636363862623166
30653762393465323938623739303536393037626263353736346439393261616236323832376633
66626533366631616336363433626332633934643532303565326539663330323238376232303565
64383134623934386361323539643038653037633330653964353230623430313737333537353032
66343839623838373035373238656232636536343435333439653964383737373439656363653535
62316131303035313032333333613962386535666339393038393739636133366535633730343830
62326336383538623538643461356238343136343665343038373663623630653731353932313166
34336265656433656465633433363138356663313234376633376665326366363232633737353536
38653338643064306237386566316534343061653530323931323932633635303838623135336262
38646236306661343338663039666438653039306332396333356664333031383563323065333062
31393165333963326135383935326566383539666161393234303764383838316639366362383339
36386637346661343633356164313466653364663663336231636465323636646130623932326561
63323862343534643334653365653639353466656536373933363033383862393165613630646436
37666439313031633961636665333962303730643332323063326439356238343535623064303061
30393764306238363362356131366337396139303661616464363665353265646539663437363734
63396536326263326336383533656230656462313938623833613130306238343061303061636661
61323566346130623735323662636239356538366632636130663838383938613861343035333138
61376438643432323536363966353364643736663163356366663038626362396266353535313030
35623861376433316331666334313336633139306636333430336536363063613839306638613363
61633261343165386236643265333865623038306263613237656231323831633832646536363464
62323338343039303461663233626130393133643335636631383536633061376632653234353430
66303132653162323332636233326533396165303739376130313531623161316263643738653332
36303436386362386361346330636535626136373236333234653462656262363031383466313330
64346338393433636135616437623037343964383664663862386137316638353862323732646232
31333661626238633632623637626665353430393362653061383462666639326430386664633233
66663933643834613637613234633332636663356330313632356635356265316532346134373431
61356530656262323534626561636465386562646638663337313236336136386234623530656138
63366530353464393139636638343563383330306232376666386133316662373062616535656262
65663439333337373038663035633933323362666363663830666332613261643239613237666438
32376239393032333463306533663534616363636432326234383833653734376632316566383231
33633430616234333736373132653365653530373666316565376535303434393939656133363938
30653465623832373439376439366336613266386330333938613161633932376561616263623064
30376166626333666261626239623363663537636331393531386332653861326339376538393430
66366365333135336538376535346430313630366662656639363133393062623234623536353164
30333539623635626464333332313763333039346638376634343637313365393035333462356333
30356466303562316165356431613336326530346338366334373666333736373438663632613061
63303966323838636630323462343965626266666565626430626531373361366436333837353030
32326531356661666436363431303238613537633530383535376439653166643864303961313037
63663436646365663666363330656432353363356166626133353738653366346165643935326235
37613365653466646137336663313162333964323033663264653132636461363862633630653732
35333839663263643431373739663433393962626637616135336164313163613164333136333862
62326231626138383434363332306635626665656339633332333863306134396163373439343032
64623666343333363631663937646237316363373561633162346438636161633963303731386439
38353131373966663937313632326231623238333438303932346663306633303032353333396363
39393362373933383933636333376162303435386238346237356239633433656566353765356137
35633863646432326638653333313331343266646437643265333162303266323537366531336165
65393035623634323630343436653062316366616562633938356466333165616636613139656333
34313166383339393665313762316164323933393637326131623764326261376536363232316133
64373566326165633865316230666566653934366438376339636338623864643361666465613739
30643666643362646435666463376664323934343537373164616631356234313964316138633164
63646463363233653766666230656266343839386238373637336563616131326631313034623534
36383239316664336133323538396230346538643930383933343131656466373636346432646266
63303133396461306663643066326135343332643066616166396562333131623332636538316330
30646663356335313361653861396165653937343733316438376337306230383639613363343636
37353138323261663031366562316234306364643539356235396366303039326433303065353862
65626330366331663234313739636163646137666465633938393163366664623564373038633937
61346261646338616638663766396337303161383035316433306134633230353533373865393833
36353063613563393734306436646132646331353538363439623930326231306364613364346335
31313834393364373833623763613530646636346364313835643338373636653566326166333065
31663936343439316333646634643161323435333261383335303330613635306531636534336630
39323831643639303838356139316338313536306665396438366434636563333036343339386663
63396436363536396132303961366135636639326638643934663965376436633763663536636332
30303264333565303632333039326461333934616638393630376639646330613830323134323635
61366539343266366332646234373131383532633266636530663736356338316465323034306438
36663033633030303465326162653931333463373163383335343866366262363561373832306136
32333339626131613130646464656261343339306433643434653532653935366139333335323561
35383337393232333738626334343436376561663032653638303336333234333361343164323630
63386263376435313663353737666331336137363566323639616235363439323439653137393930
31643335633766636265336262663866353566623861326634316536663133313634386364353465
33646134313031336331356139356362336133396162623661643765663438336139306438303763
61626333356635303863336633383262633631616666336334346337343963393762626235353963
31316564393064656631356663666635656265653437343762373138376264373263353930343635
34336461303032356234636662633765363436303161323239393533356139653938653463316332
35383030323234356137643136363963353631663636383939633333333261323735653535393730
35616264396235326633336263313437613230626238623661316339323632323563363463653238
63303836613136356637353238343730616661656464663536366661653031366137313266353237
37633131636533326230623465653439303230643935313332666236653465333531366134643938
35313133633332313430386466303338303462306536343366303637666334316339353737373539
32373135393862396334386337653737353738323135353432353437633836663865386433396235
30653435323032303836666164623263633134346461343165346165313435626434656237363364
65346332643566323633623138623562363866663734373864383561356536323461383635363061
35326564646133666266326434313338646463643739346663396462353162663662333861663163
30363939343034383232636335353231333930656364653861623365626237636462316332356532
32313762306466333661306331356364636438353939643432626136623761326636636534623866
32366462366231316662303265316235323230626261646138636338346137663839626163353636
35343938333165313534623866633831363731383036616631336132323637373566306465306136
34653533313134633631303362303834656232623537383464643266663362653964616164323066
38613833313139383637366637623962626163653536393862666639346363623237373164656539
38323064613230643134656236623163343232326239333266313664643632396637633638653062
31333532666334396638393838633865633132356366626533643566623762623130313034613137
62656563613963626633613235393831393039313231353965626236623539313063626664666437
65326462326538646435313438643539333934323734343666386631643636383662623065383930
37663837316463366631353639303938666235363933343666316330613166313063336330313064
62333161636130306531636135306139376264396432623439326261383639326462316666343139
32343638303437303433623638666630656466663737616333666362353730646433306163663233
39393765633535343065663530346438376465386665643534326436383138303536363539643266
31316561363532626633653863323336393534623736613665336331396231613835643335333635
66363565643961356165373032653764366163363537636561393266653764623431363937393164
63326465623062396164343033666366346137353139363336316532656639313666633438343036
34323831336665656334343637343034356136306331356133306662396339653939303365326230
38363434663830323135306464333831333563333932393533383332653263313265653535353266
33616362356164616437353538316661333161636664626138316132383331656132343830393032
35303564396633393732393236346665343630346338663533313034366433653966386332666461
64333631376262393161383434373032323136633362386664326265326364316238363032333462
38346333323461643264363366643862303363663162363765613563323035633834386431373635
36303466326661306637306363623230623936343065376130393862393639363937386238373931
30653239326665613132333830353863626161316663633834626237396239626138656433303661
34643264343461346661313739646665363335313863663633343730323633623039623135343262
33353164393464363838323837633664363438333162666438616432346531373732613838343831
66376539623662343730326533373138323633636537636231346437393338343436666564373766
32336439653834393631363266366235316336613431383530376231373237643932376332306264
39663134656439306266373361323165353836396439383935623935336237623734623738306130
35666435663936616164626137376566613235363239396237353034303261666263393233623632
30363565353732363833633161613662623463366661396530366530346666393733396538643137
33396139343936666231353337636262333833386162373130306237366663373137313133323063
66613133636632353630363636373538336131663963663938393638393032303332636437616365
32326331353664313439363266626365396439613332616561323735353661663934343731326563
64336365623835363633306535376561616139373938663432633262306332393539353737623038
31303036356164313662396631633834363463636430316166633338303264393934383434656432
35386462633736633162313331346538633633376638353363666361633130346465373833353262
37643735333530346437313431366136316430626537336330303230616266626636383530313262
33623337616133356136383665623166393064633838363836633830383535326232303735646434
61306236383730633963343630613966326537616132393030623264643431333230373461323866
62393835616566616231616566386164356430656464666136306335343066643764393466663235
33386161353561396566376333363765643338393730373135313632313739383932623331386237
61333733356565656631633033343262326530653339313966626234383261643231333832623261
34336366663862313964303239303131653663373236366235363162343535306565383062373232
34633738663438353864303965626533633937306431366132613335386338633431653362376139
65626464663262393139303633623831656265633035373539303363366638333866373066376133
32353039393566623061643735346234616531623136313339336634653637643233623038373038
31303864623836333035653364366537303063303366346438666339643538366362363439366237
30336133383133343635353961636233366136663764346538366339333935353833643963353836
32623265396665613133613464333262656434393338336633626234646135316264653866623833
36653930313238353131636266623238313338306163356365346137656666653365333335393138
32393466616136323133633231363135333331333232656633373236356139643935643530366531
64363166393530306463643435313135303232653662383766383562303235303463616166333732
33663636363837663131656530353733633566366366313332346335336435333932393263613230
61623035316331626237356566643962613936383566333537613035323865363138393164346531
31313934313233633337326333356636636132356633656264323666306661386563323139626530
32396361376365623865373662386361653438363331636265626262656636303937323631336132
35396463656532396463666564323663653564313731363533626338313738613735393636643531
64353537343466633665306563636236313837653963633033383534656264323331396638613562
32393739353937656365626662393336633737633562393765313835663939343331643832383135
34653338623430336330343666613635303530333336313436393964333431386235383464623832
30313534306565326236653739353865646166653039323861623539643963643064346136343033
37353236646439386366393563363833333664353533373032343462376331623835363739393635
63613934313436326539383339653030646666363563653466623764313732393830343233383139
32313964383363366635303339323963633638373434306262353665646163663361663730333466
39353034333639656165383134396639316363383335303635633064343561616464643134323535
34356335616466316636396436386236613331653439316462663935653763316437366265373233
30356436666435373563623965333063663439356432383466626131663635386530326633336335
37356661336135653338326236666361373437393031346364303134333762363235346230326430
36643063636332333666653133663834333365346234383638323331356132326138666139313439
30373933343666663333636537313166396238303738313039666630326438623430636131666663
32656334323433326234323839663134376266653439366664393230393230656265346430333739
38366433323838343438363537636139336634396263656263323738643662346432663337633864
35383738393461613036383333633436306632303433343365323036383532633665373964313566
61653130336361613030613232623932623639346666353436626236303639373530373231613234
61336532623537346535656133376230363834373635393838656430333632386565633233386633
66636262633965636163633039303630346632386433633333356564323937616664383130663163
61643138373736303131666332623339623764336265393062356638633331633536353866323238
34363838366136373338326330643962393930356562346161313236663265656664353036323865
38363764306462346632326164343730333861663332636637383837646364333131346662663234
64613035643938643030373939643337303865346232643338323761306334333438663231316661
36653261636333656663373165346635623661616364633933646533343166383531326366323137
38623363376134323937636464616265376566373231643135363139333235393530623034636633
65366131313437623663643038653164316333366237643961636236306136316336653436356436
35343762626532613633386632326330376563323432303465373135616538653437383862643236
32333739306232613838363763646236623662613238316138666537646166303231333263653961
33643632363265323561396639363238323132656265666337363830393630343664613833376237
62396531353336653739636663373535616631623961366439656432626362386364636638316131
31316130326438316230623438376137323832643662363862663166363630373863313765303531
65316338643565643436656266326137663432613461653738373261386566346639353431613765
62653934386338383334636639376139393131393566643164376630333063383131366532326338
39383665356530393165663839303863346233613131313861313731356164643130353236393334
34356663336131383437656339333531323132393365633563353538353635356439623835383366
36363435636165626334613462393434663030666231343037366138396338623335353837643261
64366630663436316436336634383231363133353835633136643736613963313034316535376232
33383562663732376263653836623265633036313431393631363831313662303639626236653164
30646464323239613630663063653037613033396431396334363735376664646633343766316566
63633566333731396365353635316135326261316138303838626337303164613835383662356234
36363965643263396631393264363835643834356264303935336635323233353732663361313566
31393032316365623163393137623537633866333563393732323732306530373236306138313565
30376335356239373366383466383263393731663266626137396466336137636536383537623430
62646535303465633834333365363636663631633536366530323761666564366561306337643339
34613736393262306265363132643864306539663435396664323136313162343966633132343633
63613463363965616332653339626564616431626263393037323862663431323738656363653665
38386263326533626131616630653961306163373062653634666663376261353434643038396434
63383335306435623266666363323333356163303562323065313536353639343263313162366538
65313564653837653933343336626565356434666663313632346164363537376237643131663663
61346533613164643333323235336335393166613036626564653538313837653363613864663636
30323938663965643035313861316539613961326334323435363337613335653365386637396330
34313131393839316665626166333664663339646235666137363336356633323761393966373664
66333462323535366630616564336230653962646338643565343661343134326438386537623263
33373839373135626639373733323466636362323565353437663630323534306636313037363239
31626438306263623339613766666431343866323765333065303234646230643533376464646564
31306233396136633037393435366462303635313936336662616363393363383737653161663435
33623262313339666230643935376239656165633362373661623464653934383530323037646366
65333130373639636536616438393465383664306262623736623438346233623738653631666662
62643837333834326234616331303639626235326337383964633166656261633165336561666362
34636530323535343466393432626134613662663934366335383039613865316132313438303038
38666635343336303639353433366638323363376137616164333231653766616432646364316435
62343932346561316532663538363865353462316338326164643465313963323163393537646337
62313134386633316533663562653137663532636232303166393438623765626665616331653136
63346432656232663162653636373931383263346331343762363761666338343133353763666164
37663837326334666339303537653266303838613965396161383561336637646637666664343765
39313237646266643839323934386437643737393261343639373530623130343038356638613937
62363664663461326536353963393539633038313830666365643263383935366166316635323839
65313361373266613365636231366138346665326565643333633264646535643139366565383562
66303039636565303031373662353232376138383231623565343534623962633961383164306438
65323331623066383934663462333035656331336536393333396430333732326239643531393931
33316264636361613736353066643831386666643333346133396437666334633033393230633265
62613438306664326633656632303232323730313061306265656166343565336530333065613763
32653932663965366562393836633737613632343961346563623232366234323333623962363262
66373063643936386431323338323566633562366336333835636533313038386432646464663731
62633933396337663334643438386134383838326263326636393963323936396462663230643266
30666637303431373938656466343232653036643332356330616561393438353539373461356338
37613135633339316462323465303838643962633132396136666333366162313934383133623262
66343363666663316337643036383662353937656235633264663737636334333938666637383139
35616139643635653830316635393361666339323835613835653430396435653534306234306464
61323630663932666137373933353963306230363830353236376530326530386531386264633537
39656130653263353666623766643362613533303562633330373330656435643231363237636438
35326637303834616134643263386138346137633733356333313162356262666633383637663333
34303765336430613237393830376538333536333266303930373732386262333764363130323032
61343065343862393735376232316431383066326662306335326334373939623261363164303336
34613135323033656666666661353933626538333961653465646138346231643537376166636566
33336130633236383836363731616237323764333737313766636335376538636565613931613131
37383132353636323237396131383134633362623233336437626565396533303330373161613738
62353166353032643632623239353162353465316536343837613039323831616538323633613236
63313566353132346137306664383534666237303666356465383634616238643530633937343533
30663363633737353332633266326335336334323161346337616566633731363165333835616166
65346334646232353932373333623765613837336666653062363732643463386463356261623133
34316336313565646538626433363530313534636463646364353734616265393630333531326132
64336132653537373932313930373934333466363630363665303139363236666231343463363034
62313031633434623138383537333166623732306133306466613539396162303032646534616633
34663365663734363566353264313836343638633638373733383331383239656666386566613235
61653264383964303434363338326136396238626337653862653932323164663838353431623938
34396162393433333834366330383136386565383763326338316434316539656365356334396339
65333532376466653064643363613131663531343566393033356261663737623463303932396463
31356166653037363063656433323033633462323437363062386237373339636166646561346135
62356135386233316339666463323437656164306536373039323431373133626338613166366335
31353364623165366235326637323264383639643038623937333930363038616633333939383065
30333563623433666236303064333261643266393235623737303835643461386230643864666633
65396237313863316166636461663165333532376463306464653138393632393164363965393462
63363932653630323936616261386162356534623835313166633164666632336231333565336262
30646434653133373334356533643665376337383864616561303639616462396231393938396331
32336262633836303436303139626438366138343436376331373266343032323031623139663730
30356462343837643665343763613637663766376136393963346365613934306131343461363530
31383265393161313465633231363766653135356332353563386534323532333963373239626434
31336665353532636464653866386365386432633739313730306335326462626331313434353935
63346261363165633361626239626234313766353134303338636533396336633432323236636530
33633737393566663863663361366435646363393732643533333332383230383730663535613835
34646639663534353263376464316439313631616464306362303235653834323830633231386634
62363638336364333466653636343439353131663831656536343732336466646564323965373232
36383237363466373366373464656264653737396333323831343263646638613832396430643965
65633862636233373961646239363666346235643235356332626533303131313533353130656437
32303434346264376433363864323932656665383732333365383536323331316262663338616630
36313335633639623161346139323161633835333735663430393738373165306666326234303131
35333738626435366164613838333134393833363163373733313536616639633163636261343133
34326532306661316434303336363935653839313533303361366632613033353137303436363562
65636463666131626561326533343535616138313832663865646632353161616536303038626337
35656239326331623762643838303836333266366430313434306534623964633031346164306433
66303831353964663831303839666439323932326135323063653763633436636339343963383262
33646639303432646366646639356634356232323932353334333364303463656638326533653739
37363636376131326338663334643039666539376533326336383338653131306233373362313138
30363262666165656463333933336162323862613164313162323732633736613738373231353037
34636634663266326165343635316539313737313364663063303638306437666564616330303662
65623866353230616235636465363061323766383035356539636561666433623531306266333265
63393663616562376230383566613034386431376464366461316234373161356163396332656466
33316262393030656639306134643535396664356366616437313366623731653032386437396139
63643037666139343562613934613933646131336235653234623161383530356266303938653365
38366234393537393232663736396361663536386566643839373135323233643830326563373931
34616631376634623138333163343438326135383134373063306232346137636164646330333262
65393636376464393534393432386634373665643535656363663635646134313539393362393962
34363934376564373934346564323033613465333131386335663133633139383636313433636338
39653532653134363231663661663736376364653234393537333765376134323165386133643465
33633464376339666131613938313435396131626337303163363631663036353931646131646538
37303630373662396339386464613362616261396131323530346662656436636130313063643865
63396434333164363133363766306436386635653739636266383134346130613930343430323337
61313434303033393537643663343835653566653038386239613061316638386365633037376261
65396466646436333833366132643434373234303962356566363931353166383730313536383735
35323065643664386665653661653261613832366463363062393835313564366436646635636163
32373936616231356566323836373865313939653634373365363965663565663336343331373836
37646237393137613632393563356239633535656466343533353536613164656634373539633061
38626362633732626333336533313165636266623333393636343939666165373133346464373536
66383061313230383932306365643461653666353565356338626232313133656561316361653633
30303063343564626238373337306136373231303135383161303231343765313363663533393737
34383935623136646435306265663738383730633465306434356437376334386466316463393232
61343035306235326139386235346634616535376238643361333137663738303364316634386638
63383962303764303663323437366430623135303038623163646362323132613932363366633164
38613461323337373239663634333136643161653032326334656562313566646365663766646436
64326333303561653130656436303066383563333730633764366139623561323934306635663665
38336561646161363263626364313336663163316637313162383762386362343331313138613564
65623539656336326362323334336263346562643530303064346464643363376134666330653630
37316330323165373566353739663739333133643632363466346432633366663864633034316463
61343935663337373134

View File

@@ -18,6 +18,7 @@ vault_spelunker_db_password: changeme
# Neo4j
vault_neo4j_auth_password: changeme
vault_mnemosyne_neo4j_auth_password: changeme
# RabbitMQ
vault_rabbitmq_password: changeme
@@ -62,6 +63,7 @@ vault_mcp_switchboard_secret_key: changeme
# SearXNG
vault_searxng_secret_key: changeme
vault_searxng_brave_api_key: changeme
# PgAdmin
vault_pgadmin_email: admin@example.com
@@ -98,3 +100,25 @@ vault_ntth_token_1_app_secret: changeme
vault_ntth_token_2_app_secret: changeme
vault_ntth_token_3_app_secret: changeme
vault_ntth_token_4_app_secret: changeme
# Kottos (Pallas FastAgent runtime on puck)
# vault_kottos_openai_api_key — API key for the OpenAI-compatible LLM
# endpoint (nyx Qwen in Ouranos, varies
# per environment). Set to any string
# if the endpoint doesn't validate the key.
# vault_kottos_github_pat — GitHub personal access token passed
# into the github MCP Docker container
# via GITHUB_PERSONAL_ACCESS_TOKEN env.
# vault_kottos_angelia_bearer — Bearer token Kottos presents to the
# Angelia MCP server on outgoing requests.
# vault_kottos_mnemosyne_jwt — Long-lived team JWT minted in the
# Daedalus admin UI → Settings →
# Pallas Instances → kottos row →
# "Reveal" or "Rotate". Mnemosyne
# validates this on every search_memory
# call and scopes results to the
# workspaces attached to this team.
vault_kottos_openai_api_key: changeme
vault_kottos_github_pat: changeme
vault_kottos_angelia_bearer: changeme
vault_kottos_mnemosyne_jwt: changeme

View File

@@ -9,16 +9,14 @@ services:
# Alloy
alloy_log_level: "warn"
neo4j_syslog_port: 22011
neo4j_syslog_port: 51414
# Neo4j
neo4j_rel: master
neo4j_version: "5.26.0"
neo4j_user: neo4j
neo4j_group: neo4j
neo4j_directory: /srv/neo4j
neo4j_auth_user: neo4j
neo4j_auth_password: "{{ vault_neo4j_auth_password }}"
neo4j_http_port: 25554
neo4j_bolt_port: 7687
neo4j_password: "{{ vault_neo4j_cypher_password }}"
neo4j_http_port: 22084
neo4j_bolt_port: 22074
neo4j_metrics_port: 22094
neo4j_apoc_unrestricted: "apoc.*"

View File

@@ -20,32 +20,65 @@ principal_uid: 1000
alloy_log_level: "warn"
# Rommie MCP Server Configuration (Agent S GUI Automation)
rommie_port: 22061
rommie_port: 20361
rommie_host: "0.0.0.0"
rommie_display: ":10"
rommie_allowed_hosts: "caliban.incus,rommie.ouranos.helu.ca"
rommie_model: Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf
rommie_model_url: "http://nyx.helu.ca:22079"
rommie_model: Qwen3.6-27B-Q5_K_M
rommie_model_url: "http://nyx.helu.ca:29000"
rommie_provider: "openai"
rommie_ground_provider: "huggingface"
rommie_ground_url: "http://pan.helu.ca:22078"
rommie_ground_url: "http://pan.helu.ca:29000"
rommie_ground_model: "UI-TARS-7B-DPO-Q6_K_L.gguf"
rommie_grounding_width: 1024
rommie_grounding_height: 1024
# get_screenshot output for the parent agent (Agent S autonomous capture unaffected)
rommie_screenshot_jpeg_quality: 80
rommie_screenshot_max_kb: 512
# FreeCAD Robust MCP Server Configuration
freecad_mcp_user: harper
freecad_mcp_group: harper
freecad_mcp_directory: /srv/freecad-mcp
freecad_mcp_port: 22063
freecad_mcp_port: 22061
freecad_mcp_xmlrpc_port: 9875
freecad_mcp_socket_port: 9876
# FreeCAD MCP Bridge (GUI, runs as principal_user on the XRDP display)
freecad_mcp_bridge_directory: "/home/{{ principal_user }}/freecad-mcp-bridge"
freecad_mcp_bridge_display: ":10"
# JupyterLab Configuration
jupyterlab_user: robert
jupyterlab_group: robert
jupyterlab_notebook_dir: /home/robert/notebook
jupyterlab_venv_dir: /home/robert/env/jupyter
## Ports
jupyterlab_port: 22081 # JupyterLab (localhost only)
jupyterlab_proxy_port: 22071 # OAuth2-Proxy (exposed to HAProxy)
## OAuth2-Proxy Configuration
jupyterlab_oauth2_proxy_dir: /etc/oauth2-proxy-jupyter
jupyterlab_oauth2_proxy_version: "7.6.0"
jupyterlab_domain: "ouranos.helu.ca"
jupyterlab_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca"
jupyterlab_oauth2_redirect_url: "https://jupyterlab.ouranos.helu.ca/oauth2/callback"
## OAuth2 Credentials (from vault)
jupyterlab_oauth_client_id: "{{ vault_jupyterlab_oauth_client_id }}"
jupyterlab_oauth_client_secret: "{{ vault_jupyterlab_oauth_client_secret }}"
jupyterlab_oauth2_cookie_secret: "{{ vault_jupyterlab_oauth2_cookie_secret }}"
# Kernos MCP Shell Server Configuration
kernos_user: harper
kernos_group: harper
kernos_api_keys: "{{ vault_caliban_kernos_api_keys }}"
kernos_directory: /srv/kernos
kernos_port: 22062
kernos_port: 20261
kernos_host: "0.0.0.0"
kernos_log_level: INFO
kernos_log_format: json
kernos_environment: sandbox
kernos_allow_commands: "apt,awk,base64,bash,cat,chmod,cp,curl,cut,date,dd,df,dig,dmesg,du,echo,env,file,find,free,git,grep,gunzip,gzip,head,host,hostname,id,jq,kill,less,ln,ls,lsblk,lspci,lsusb,make,mkdir,mv,nc,node,nohup,npm,npx,ping,pip,pkill,pnpm,printenv,ps,pwd,python3,rm,rsync,run-captured,scp,sed,sleep,sort,source,ssh,ssh-keygen,ssh-keyscan,stat,sudo,tail,tar,tee,timeout,touch,tr,tree,uname,uniq,unzip,uptime,wc,wget,which,whoami,xargs,xz,zip"
kernos_allow_commands: "apt,awk,base64,bash,cat,chmod,cp,curl,cut,date,dd,df,dig,dmesg,docker,du,echo,env,file,find,free,git,grep,gunzip,gzip,head,host,hostname,id,ip,jq,kill,less,ln,ls,lsblk,lspci,lsusb,make,mkdir,mv,nc,node,nohup,npm,npx,ping,pip,pkill,pnpm,printenv,ps,pwd,python3,rm,rsync,run-captured,scp,sed,sleep,sort,source,ssh,ssh-keygen,ssh-keyscan,stat,sudo,tail,tar,tee,timeout,touch,tr,tree,uname,uniq,unzip,uptime,wc,wget,which,whoami,xargs,xz,zip"

View File

@@ -10,21 +10,23 @@ services:
- grafana_mcp
- mcpo
- neo4j_mcp
- searxng
# Alloy
alloy_log_level: "warn"
argos_syslog_port: 51434
neo4j_cypher_syslog_port: 51431
grafana_mcp_syslog_port: 51433
gitea_mcp_syslog_port: 51435
argos_syslog_port: 51418
neo4j_cypher_syslog_port: 51414
grafana_mcp_syslog_port: 51413
gitea_mcp_syslog_port: 51412
searxng_syslog_port: 51419
# Argos MCP Configuration
argos_user: argos
argos_group: argos
argos_directory: /srv/argos
argos_port: 25534
argos_port: 20861
argos_log_level: INFO
argos_searxng_instances: http://rosalind.incus:22089/
argos_searxng_instances: http://miranda.incus:22089/,http://rosalind.incus:22089/
argos_cache_ttl: 300
argos_max_results: 10
argos_request_timeout: 30.0
@@ -48,7 +50,7 @@ neo4j_mcp_directory: /srv/neo4j_mcp
grafana_mcp_user: grafana_mcp
grafana_mcp_group: grafana_mcp
grafana_mcp_directory: /srv/grafana_mcp
grafana_mcp_port: 25533
grafana_mcp_port: 22063
grafana_mcp_grafana_host: prospero.incus
grafana_mcp_grafana_port: 3000
grafana_service_account_token: "{{ vault_grafana_service_account_token }}"
@@ -57,21 +59,29 @@ grafana_service_account_token: "{{ vault_grafana_service_account_token }}"
gitea_mcp_user: gitea_mcp
gitea_mcp_group: gitea_mcp
gitea_mcp_directory: /srv/gitea_mcp
gitea_mcp_port: 25535
gitea_mcp_port: 22062
gitea_mcp_host: https://gitea.ouranos.helu.ca
gitea_mcp_access_token: "{{ vault_gitea_mcp_access_token }}"
# Neo4j Cypher MCP
neo4j_host: ariel.incus
neo4j_bolt_port: 7687
neo4j_auth_password: "{{ vault_neo4j_auth_password }}"
neo4j_cypher_mcp_port: 25531
# Nike MCP
nike_mcp_url: http://puck.incus:25576/mcp
neo4j_bolt_port: 22074
neo4j_cypher_password: "{{ vault_neo4j_cypher_password }}"
neo4j_cypher_mcp_port: 22064
neo4j_mcp_server_allowed_hosts: localhost,127.0.0.1,miranda.incus
# MCPO Config
mcpo_user: mcpo
mcpo_group: mcpo
mcpo_directory: /srv/mcpo
mcpo_port: 25530
# SearXNG Configuration
searxng_user: searxng
searxng_group: searxng
searxng_directory: /srv/searxng
searxng_port: 22089
searxng_base_url: http://miranda.incus:22089/
searxng_instance_name: "Ouranos Search"
searxng_secret_key: "{{ vault_searxng_secret_key }}"
searxng_brave_api_key: "{{ vault_searxng_brave_api_key }}"

View File

@@ -50,6 +50,15 @@ periplus_db_password: "{{ vault_periplus_db_password }}"
daedalus_db_name: daedalus
daedalus_db_user: daedalus
daedalus_db_password: "{{ vault_daedalus_db_password }}"
mnemosyne_db_name: mnemosyne
mnemosyne_db_user: mnemosyne
mnemosyne_db_password: "{{ vault_mnemosyne_db_password }}"
hold_slayer_db_name: hold_slayer
hold_slayer_db_user: hold_slayer
hold_slayer_db_password: "{{ vault_hold_slayer_db_password }}"
hecate_db_name: hecate
hecate_db_user: hecate
hecate_db_password: "{{ vault_hecate_db_password }}"
# PostgreSQL admin password
postgres_password: "{{ vault_postgres_password }}"

View File

@@ -72,6 +72,23 @@ prometheus_targets:
- 'sycorax.incus:9100'
- 'prospero.incus:9100'
- 'rosalind.incus:9100'
- 'umbriel.incus:9100'
# Neo4j scrape targets (neo4j-apoc-exporter sidecar on each Neo4j host)
neo4j_metrics_targets:
- 'ariel.incus:22094'
- 'umbriel.incus:22094'
# Pallas scrape targets — one entry per Pallas deployment (registry
# port). The `instance` label distinguishes deployments; the `agent`
# dimension comes from labels on the metrics themselves.
pallas_metrics_targets:
- targets: ['caliban.incus:24000']
labels: {instance: iolaus}
- targets: ['caliban.incus:24100']
labels: {instance: kottos}
- targets: ['caliban.incus:24200']
labels: {instance: mentor}
# Prometheus OAuth2-Proxy Sidecar
prometheus_proxy_port: 9091
@@ -126,10 +143,31 @@ pgadmin_oauth_client_id: "{{ vault_pgadmin_oauth_client_id }}"
pgadmin_oauth_client_secret: "{{ vault_pgadmin_oauth_client_secret }}"
# ============================================================================
# Casdoor Metrics (for Prometheus scraping)
# Prometheus Metrics Scraping
# ============================================================================
casdoor_metrics_host: "titania.incus"
# Casdoor
casdoor_metrics_host: titania.incus
casdoor_metrics_port: 22081
casdoor_prometheus_access_key: "{{ vault_casdoor_prometheus_access_key }}"
casdoor_prometheus_access_secret: "{{ vault_casdoor_prometheus_access_secret }}"
# Daedalus Metrics
daedalus_metrics_host: caliban.incus
daedalus_metrics_port: 23081
# Mnemosyne — two scrape targets:
# app: Django /metrics via nginx (django-prometheus + custom pipeline/MCP counters)
# web: nginx-prometheus-exporter sidecar (nginx stub_status → Prometheus format)
mnemosyne_app_metrics_host: caliban.incus
mnemosyne_app_metrics_port: 23181
mnemosyne_web_metrics_host: caliban.incus
mnemosyne_web_metrics_port: 23191
# Athena — two scrape targets (same shape as Mnemosyne):
# app: Django /metrics via nginx (django-prometheus)
# web: nginx-prometheus-exporter sidecar (nginx stub_status → Prometheus format)
athena_app_metrics_host: puck.incus
athena_app_metrics_port: 22481
athena_web_metrics_host: puck.incus
athena_web_metrics_port: 22491

View File

@@ -7,6 +7,7 @@ services:
- docker
- gitea_runner
- athena
- kottos
# Gitea Runner
gitea_runner_name: "puck-runner"
@@ -14,14 +15,90 @@ gitea_runner_name: "puck-runner"
# Alloy
alloy_log_level: "warn"
angelia_syslog_port: 51422
# mnemosyne_syslog_port retained for inventory-compatibility while the
# Alloy Docker-socket discovery block rolls out; no listener binds to it
# any more. Delete once the docker-socket pipeline is proven in prod.
mnemosyne_syslog_port: 51431
athena_syslog_port: 51424
kairos_syslog_port: 51425
icarlos_syslog_port: 51426
spelunker_syslog_port: 51428
jupyterlab_syslog_port: 51411
# daedalus_syslog_port retained for the same reason as mnemosyne above.
daedalus_syslog_port: 51430
# =============================================================================
# PPLG scrape targets on puck
# =============================================================================
# Consumed by ``ansible/pplg/prometheus.yml.j2`` on Prospero. Defining them
# here keeps the scrape config fully parametric so the same playbook runs
# unchanged against Ouranos / Virgo / Taurus — each environment sets its
# own puck-equivalent host in its host_vars.
# Daedalus (FastAPI on puck, behind nginx)
daedalus_metrics_host: "puck.incus"
daedalus_metrics_port: 23081
# Mnemosyne — two metrics surfaces:
# app (23181): /metrics served by nginx → Django app container, which owns
# the single prometheus_client process registry that both django-prometheus
# (HTTP / Celery) and the MCP server's tool-call counters write to.
# web (23191): nginx-prometheus-exporter sidecar scraping nginx stub_status.
mnemosyne_app_metrics_host: "puck.incus"
mnemosyne_app_metrics_port: 23181
mnemosyne_web_metrics_host: "puck.incus"
mnemosyne_web_metrics_port: 23191
# =============================================================================
# Kottos Configuration (Pallas FastAgent runtime)
# =============================================================================
# Engineering agents (Harper, Scotty, CASE, Research, Tech Research) running as a
# single systemd-managed ``pallas`` process. Logs land in journald via
# SyslogIdentifier=kottos, then Alloy's journal relabel block tags them as
# {service="pallas", project="kottos"} for Loki.
kottos_user: kottos
kottos_group: kottos
kottos_directory: /srv/kottos
kottos_host: "puck.incus"
kottos_namespace: "ca.helu.kottos"
# Ports — registry at 24100, agents 24101–24149, sub-agents 24150–24199
kottos_registry_port: 24100
kottos_harper_port: 24101
kottos_scotty_port: 24102
kottos_research_port: 24150
kottos_tech_research_port: 24151
kottos_case_port: 24152
# Log level — INFO surfaces lifecycle + failures, DEBUG adds per-request
# detail and successful health probe lines. Ouranos Lab convention:
# health-check 200 OKs live in DEBUG, never in INFO.
pallas_log_level: INFO
# fast-agent's own logger — keep at INFO in prod, bump to DEBUG alongside
# pallas_log_level when chasing MCP transport issues.
kottos_fastagent_log_level: info
# LLM provider — the same OpenAI-compatible Qwen endpoint Kottos uses today.
kottos_default_model: "openai.Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf"
kottos_openai_base_url: "http://nyx.helu.ca:22072/v1"
kottos_model_vision: true
kottos_model_context_window: 192000
kottos_model_max_output_tokens: 16384
kottos_timezone: "America/Toronto"
# Downstream MCP server URLs — each parametric so Virgo / Taurus override
# them in their own host_vars without touching the templates.
kottos_argos_url: "http://miranda.incus:25534/mcp"
kottos_neo4j_cypher_url: "http://circe.helu.ca:22034/mcp"
kottos_kernos_scotty_url: "http://caliban.incus:22062/mcp"
kottos_rommie_url: "http://caliban.incus:20361/mcp"
kottos_gitea_url: "http://miranda.incus:25535/mcp"
kottos_grafana_url: "http://miranda.incus:25533/mcp"
kottos_kernos_harper_url: "http://korax.helu.ca:20261/mcp"
kottos_angelia_url: "https://ouranos.helu.ca/mcp/"
kottos_mnemosyne_url: "https://mnemosyne.ouranos.helu.ca/mcp/"
# =============================================================================
# Athena Configuration
# =============================================================================
@@ -31,6 +108,12 @@ athena_directory: /srv/athena
athena_port: 22481
athena_domain: "ouranos.helu.ca"
# Prometheus scrape targets (see pplg/prometheus.yml.j2, athena job)
athena_app_metrics_host: "puck.incus"
athena_app_metrics_port: 22481
athena_web_metrics_host: "puck.incus"
athena_web_metrics_port: 22491
# Casdoor SSO Credentials (from vault)
athena_casdoor_client_id: "{{ vault_athena_oauth_client_id }}"
athena_casdoor_client_secret: "{{ vault_athena_oauth_client_secret }}"
@@ -39,26 +122,4 @@ athena_casdoor_client_secret: "{{ vault_athena_oauth_client_secret }}"
athena_secret_key: "{{ vault_athena_secret_key }}"
athena_db_password: "{{ vault_athena_db_password }}"
# =============================================================================
# JupyterLab Configuration
# =============================================================================
jupyterlab_user: robert
jupyterlab_group: robert
jupyterlab_notebook_dir: /home/robert
jupyterlab_venv_dir: /home/robert/env/jupyter
# Ports
jupyterlab_port: 22081 # JupyterLab (localhost only)
jupyterlab_proxy_port: 22071 # OAuth2-Proxy (exposed to HAProxy)
# OAuth2-Proxy Configuration
jupyterlab_oauth2_proxy_dir: /etc/oauth2-proxy-jupyter
jupyterlab_oauth2_proxy_version: "7.6.0"
jupyterlab_domain: "ouranos.helu.ca"
jupyterlab_oauth2_oidc_issuer_url: "https://id.ouranos.helu.ca"
jupyterlab_oauth2_redirect_url: "https://jupyterlab.ouranos.helu.ca/oauth2/callback"
# OAuth2 Credentials (from vault)
jupyterlab_oauth_client_id: "{{ vault_jupyterlab_oauth_client_id }}"
jupyterlab_oauth_client_secret: "{{ vault_jupyterlab_oauth_client_secret }}"
jupyterlab_oauth2_cookie_secret: "{{ vault_jupyterlab_oauth2_cookie_secret }}"

View File

@@ -7,6 +7,7 @@ services:
- anythingllm
- docker
- gitea
- jellyfin
- lobechat
- memcached
- nextcloud
@@ -223,6 +224,7 @@ searxng_port: 22089
searxng_base_url: http://rosalind.incus:22089/
searxng_instance_name: "Ouranos Search"
searxng_secret_key: "{{ vault_searxng_secret_key }}"
searxng_brave_api_key: "{{ vault_searxng_brave_api_key }}"
# SearXNG OAuth2-Proxy Sidecar
# Note: Each host supports at most one OAuth2-Proxy sidecar instance
@@ -237,3 +239,30 @@ searxng_oauth2_redirect_url: "https://searxng.ouranos.helu.ca/oauth2/callback"
searxng_oauth2_client_id: "{{ vault_searxng_oauth_client_id }}"
searxng_oauth2_client_secret: "{{ vault_searxng_oauth_client_secret }}"
searxng_oauth2_cookie_secret: "{{ vault_searxng_oauth_cookie_secret }}"
# Jellyfin Configuration
jellyfin_user: jellyfin
jellyfin_group: jellyfin
jellyfin_uid: 521
jellyfin_gid: 521
jellyfin_directory: /srv/jellyfin
jellyfin_port: 22086
jellyfin_syslog_port: 51426
# Storage paths
jellyfin_config_dir: /srv/jellyfin/config
jellyfin_cache_dir: /srv/jellyfin/cache
jellyfin_media_dir: /mnt/media
# Hardware transcoding (NVIDIA GPU passthrough)
jellyfin_enable_hwtranscode: true
# External access URL
jellyfin_published_server_url: "https://jellyfin.ouranos.helu.ca"
# SSO / OIDC Configuration (Casdoor)
jellyfin_sso_enabled: true
jellyfin_casdoor_client_id: "{{ vault_jellyfin_casdoor_client_id }}"
jellyfin_casdoor_client_secret: "{{ vault_jellyfin_casdoor_client_secret }}"
jellyfin_casdoor_issuer: "https://id.ouranos.helu.ca"
jellyfin_casdoor_redirect_uri: "https://jellyfin.ouranos.helu.ca/api/plugin/sso/callback"

View File

@@ -74,6 +74,12 @@ haproxy_backends:
backend_port: 22084
health_path: "/api/ping"
- subdomain: "jellyfin"
backend_host: "rosalind.incus"
backend_port: 22086
health_path: "/health"
timeout_server: 300s
- subdomain: "arke"
backend_host: "sycorax.incus"
backend_port: 25540
@@ -116,8 +122,8 @@ haproxy_backends:
health_path: "/api/healthz"
- subdomain: "daedalus"
backend_host: "puck.incus"
backend_port: 20080
backend_host: "caliban.incus"
backend_port: 20081
health_path: "/ready/"
timeout_server: 120s
@@ -127,8 +133,8 @@ haproxy_backends:
health_path: "/chat"
- subdomain: "mnemosyne"
backend_host: "puck.incus"
backend_port: 23181
backend_host: "caliban.incus"
backend_port: 23081
health_path: "/ready/"
- subdomain: "nextcloud"
@@ -163,13 +169,13 @@ haproxy_backends:
- subdomain: "nike"
backend_host: "puck.incus"
backend_port: 20681
backend_port: 20581
health_path: "/ready/"
- subdomain: "periplus"
backend_host: "puck.incus"
backend_port: 20581
health_path: "/ready/"
backend_port: 20681
health_path: "/ready"
- subdomain: "spelunker"
backend_host: "puck.incus"
@@ -182,26 +188,38 @@ haproxy_backends:
health_path: "/ready/"
- subdomain: "jupyterlab"
backend_host: "puck.incus"
backend_port: 22071 # OAuth2-Proxy port
backend_host: "caliban.incus"
backend_port: 22071
health_path: "/ping"
timeout_server: 300s # WebSocket support
timeout_server: 300s
- subdomain: "hass"
backend_host: "oberon.incus"
backend_port: 8123
health_path: "/api/"
timeout_server: 300s # WebSocket support for HA frontend
timeout_server: 300s
- subdomain: "hecate"
backend_host: "caliban.incus"
backend_port: 20881
health_path: "/live"
timeout_server: 300s
- subdomain: "freecad-mcp"
backend_host: "caliban.incus"
backend_port: 22032
backend_port: 22062
health_path: "/mcp"
timeout_server: 300s # SSE streaming support for MCP
- subdomain: "caliban"
backend_host: "caliban.incus"
backend_port: 20261
health_path: "/mcp"
timeout_server: 300s # SSE streaming support for MCP
- subdomain: "rommie"
backend_host: "caliban.incus"
backend_port: 22031
backend_port: 20361
health_path: "/mcp"
timeout_server: 300s # SSE streaming support for MCP

View File

@@ -0,0 +1,26 @@
---
# Umbriel Configuration - Graph Database Host (Mnemosyne)
# Services: alloy, docker, neo4j
#
# Dedicated Neo4j instance for Mnemosyne. Do not share with Spelunker or any
# other graph workload — Mnemosyne owns its Library/Collection/Item/Chunk/
# Concept labels and runs its own indexes and schema migrations.
# Service list for this host. Deployment playbooks gate on membership here
# (e.g. `'jellyfin' in services`) and end the play early for other hosts.
services:
- alloy
- docker
- neo4j
# Alloy
alloy_log_level: "warn"
neo4j_syslog_port: 51414
# Neo4j
neo4j_user: neo4j
neo4j_group: neo4j
neo4j_directory: /srv/neo4j
neo4j_password: "{{ vault_neo4j_mnemosyne_password }}"
neo4j_http_port: 22084
neo4j_bolt_port: 22074
# Keep in sync with the 'umbriel.incus:22094' entry in neo4j_metrics_targets,
# which is what Prometheus scrapes.
neo4j_metrics_port: 22094
# NOTE(review): "apoc.*" presumably marks every APOC procedure as unrestricted
# in the Neo4j config — confirm that blanket scope is intended for this host.
neo4j_apoc_unrestricted: "apoc.*"

View File

@@ -17,6 +17,7 @@ ubuntu:
rosalind.incus:
sycorax.incus:
titania.incus:
umbriel.incus:
# Service-specific groups for targeted deployments
agent_s:

149
ansible/jellyfin/README.md Normal file
View File

@@ -0,0 +1,149 @@
# Jellyfin Deployment for Ouranos
Jellyfin media server deployed on Rosalind Incus container.
## Overview
Jellyfin is an open-source media server for organizing, streaming, and managing media content. This deployment includes:
- Docker containerized deployment
- NVIDIA GPU passthrough for hardware-accelerated transcoding
- Prometheus metrics collection
- Syslog integration with Grafana Alloy
- Casdoor OIDC SSO support (via plugin)
## Deployment
### Prerequisites
1. Rosalind Incus container must be running with Docker installed
2. `/mnt/media` must be accessible from the Incus host
3. NVIDIA GPU must be passed through to the Rosalind container
4. Casdoor application must be configured for Jellyfin OIDC
### Installation
```bash
# From ansible directory
cd /home/robert/git/ouranos/ansible
# Deploy Jellyfin to Rosalind
ansible-playbook jellyfin/deploy.yml --limit rosalind.incus
```
### Updating
```bash
# Update Jellyfin container
ansible-playbook jellyfin/deploy.yml --limit rosalind.incus
```
## Configuration
### Variables
| Variable | Description | Default |
|----------|-------------|---------|
| `jellyfin_user` | Service username | `jellyfin` |
| `jellyfin_group` | Service group name | `jellyfin` |
| `jellyfin_uid` | Service UID | `521` |
| `jellyfin_gid` | Service GID | `521` |
| `jellyfin_directory` | Base directory | `/srv/jellyfin` |
| `jellyfin_port` | HTTP port | `22086` |
| `jellyfin_syslog_port` | Syslog port | `51426` |
| `jellyfin_config_dir` | Config directory | `/srv/jellyfin/config` |
| `jellyfin_cache_dir` | Cache directory | `/srv/jellyfin/cache` |
| `jellyfin_media_dir` | Media bind mount | `/mnt/media` |
| `jellyfin_published_server_url` | External URL | `https://jellyfin.ouranos.helu.ca` |
### SSO Configuration
Jellyfin uses the `jellyfin-plugin-sso` community plugin for Casdoor OIDC authentication:
1. **Create Casdoor Application**:
- Application type: OIDC
- Callback URL: `https://jellyfin.ouranos.helu.ca/api/plugin/sso/callback`
- Enable PKCE
2. **Plugin Configuration**:
- Install manifest in `/config/plugins`
- Configure with Casdoor OIDC endpoints
3. **Casdoor Endpoints**:
- Authorization: `https://id.ouranos.helu.ca/oauth2/authorize`
- Token: `https://id.ouranos.helu.ca/oauth2/token`
- Userinfo: `https://id.ouranos.helu.ca/oauth2/userinfo`
## Monitoring
### Prometheus Metrics
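Jellyfin exposes metrics at `http://localhost:8096/metrics` inside the container; from the Rosalind host the same endpoint is reachable on the published port, `http://localhost:22086/metrics` (`jellyfin_port`). These are collected by Prospero's Prometheus via: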
- cAdvisor container metrics
- Process exporter
### Grafana Dashboard
Add a new data source in Grafana:
- Type: Prometheus
- URL: `http://prospero.incus:9090`
### Logs
View Jellyfin logs:
```bash
# Via Docker
docker logs -f jellyfin
# Via systemd
journalctl -u jellyfin -f
# Via Grafana Loki
https://loki.ouranos.helu.ca/explore?orgId=1&left=%5B%22now-1h%22,%22now%22,%22jellyfin%22,%7B%22job%22%3A%22jellyfin%22%7D%5D
```
## Troubleshooting
### Container won't start
```bash
# Check Docker status
docker ps -a | grep jellyfin
# Check logs
docker logs jellyfin
# Verify GPU passthrough
ls -la /dev/dri/
```
### Transcoding fails
1. Verify GPU is accessible: `nvidia-smi`
2. Check container has device access: `docker inspect jellyfin | grep Devices`
3. Review logs for transcoding errors
### SSO not working
1. Verify plugin is installed in `/config/plugins`
2. Check Casdoor application configuration
3. Verify redirect URLs match exactly
4. Check the browser console for OAuth errors
## Files
| Path | Description |
|------|-------------|
| `/srv/jellyfin/docker-compose.yml` | Generated Docker Compose config |
| `/etc/systemd/system/jellyfin.service` | Systemd wrapper service |
| `/srv/jellyfin/config` | Jellyfin configuration |
| `/srv/jellyfin/cache` | Transcode cache |
| `/srv/jellyfin/logs` | Application logs (via syslog) |
## References
- [Jellyfin Official Docs](https://jellyfin.org/docs/)
- [Jellyfin Docker Image](https://hub.docker.com/r/jellyfin/jellyfin)
- [SSO Plugin GitHub](https://github.com/9p4/jellyfin-plugin-sso)

View File

@@ -0,0 +1,86 @@
---
# Deploy Jellyfin as a Docker Compose project wrapped in a systemd unit.
#
# The play targets every host in the `ubuntu` group but only acts on hosts
# whose `services` list contains `jellyfin`; all other hosts end the play early.
- name: Deploy Jellyfin
  hosts: ubuntu
  become: true
  vars:
    ansible_python_interpreter: /usr/bin/python3

  tasks:
    - name: Check if host has jellyfin service
      ansible.builtin.set_fact:
        has_jellyfin: "{{ 'jellyfin' in services | default([]) }}"

    - name: Skip hosts without jellyfin service
      ansible.builtin.meta: end_host
      when: not has_jellyfin

    # Fixed UID/GID so the `user:` mapping in the Compose file matches the
    # ownership of the bind-mounted config and cache directories.
    - name: Create jellyfin group
      ansible.builtin.group:
        name: "{{ jellyfin_group }}"
        gid: "{{ jellyfin_gid }}"

    - name: Create jellyfin user
      ansible.builtin.user:
        name: "{{ jellyfin_user }}"
        comment: "Jellyfin service account"
        group: "{{ jellyfin_group }}"
        uid: "{{ jellyfin_uid }}"
        home: "{{ jellyfin_directory }}"
        system: true
        shell: /bin/bash

    # Both the systemd unit (User=<jellyfin_user>) and the "Restart Jellyfin"
    # handler (become_user) invoke Docker as this account, so it needs access
    # to the Docker daemon socket. Mirrors the equivalent task in
    # kottos/deploy.yml. Assumes the `docker` group already exists (Docker is
    # a documented prerequisite for this host).
    - name: Add jellyfin user to docker group
      ansible.builtin.user:
        name: "{{ jellyfin_user }}"
        groups: docker
        append: true

    - name: Add keeper_user to jellyfin group
      ansible.builtin.user:
        name: "{{ keeper_user }}"
        groups: "{{ jellyfin_group }}"
        append: true

    - name: Create Jellyfin directories
      ansible.builtin.file:
        path: "{{ item }}"
        owner: "{{ jellyfin_user }}"
        group: "{{ jellyfin_group }}"
        state: directory
        mode: '0750'
      loop:
        - "{{ jellyfin_directory }}"
        - "{{ jellyfin_config_dir }}"
        - "{{ jellyfin_cache_dir }}"

    - name: Deploy Docker Compose configuration
      ansible.builtin.template:
        src: docker-compose.yml.j2
        dest: "{{ jellyfin_directory }}/docker-compose.yml"
        owner: "{{ jellyfin_user }}"
        group: "{{ jellyfin_group }}"
        mode: '0644'
      notify:
        - Restart Jellyfin

    - name: Create systemd service for Docker Compose
      ansible.builtin.template:
        src: jellyfin.service.j2
        dest: /etc/systemd/system/jellyfin.service
        mode: '0644'
      notify:
        - Reload systemd
        - Enable Jellyfin

  # Handlers run in the order they are declared here, not the order in which
  # they were notified.
  handlers:
    - name: Reload systemd
      ansible.builtin.systemd:
        daemon_reload: true

    # Pulls the image and brings the Compose project up as the service account.
    - name: Restart Jellyfin
      community.docker.docker_compose_v2:
        project_src: "{{ jellyfin_directory }}"
        pull: always
        state: present
      become: true
      become_user: "{{ jellyfin_user }}"

    - name: Enable Jellyfin
      ansible.builtin.systemd:
        name: jellyfin
        enabled: true
        state: started
        daemon_reload: true

View File

@@ -0,0 +1,32 @@
---
# Jellyfin Docker Compose project (Jinja2 template rendered by
# jellyfin/deploy.yml into the Jellyfin base directory).
services:
  jellyfin:
    image: jellyfin/jellyfin:latest
    container_name: jellyfin
    # Run as the host service account so bind-mounted files keep its ownership.
    user: "{{ jellyfin_uid }}:{{ jellyfin_gid }}"
    ports:
      # Web UI / API: host port from inventory -> container port 8096.
      - "{{ jellyfin_port }}:8096/tcp"
      - "7359:7359/udp"
    volumes:
      - "{{ jellyfin_config_dir }}:/config"
      - "{{ jellyfin_cache_dir }}:/cache"
      # Media library is mounted read-only.
      - "{{ jellyfin_media_dir }}:/media:ro"
    restart: unless-stopped
    # NOTE(review): the README describes NVIDIA GPU passthrough, but only the
    # DRM device nodes are mapped here — confirm whether the NVIDIA container
    # runtime / GPU reservation is also required for hardware transcoding.
    devices:
      - /dev/dri:/dev/dri
    healthcheck:
      # Probe Jellyfin's health endpoint — the same path HAProxy checks for
      # this backend — instead of /dashboard, which is not a health route.
      test: ["CMD", "curl", "-f", "http://localhost:8096/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    logging:
      driver: syslog
      options:
        # NOTE(review): this address is fixed and does not use
        # jellyfin_syslog_port from host_vars — confirm which is intended.
        syslog-address: "udp://prospero.incus:1514"
        tag: "jellyfin"
    environment:
      - TZ=America/Toronto
      - JELLYFIN_PublishedServerUrl={{ jellyfin_published_server_url }}
    extra_hosts:
      - "host.docker.internal:host-gateway"

View File

@@ -0,0 +1,17 @@
# Jellyfin — systemd wrapper around the Docker Compose project.
# Rendered by jellyfin/deploy.yml to /etc/systemd/system/jellyfin.service.
# (The YAML document marker that used to open this file was removed: it is
# not valid unit-file syntax.)
[Unit]
Description=Jellyfin Docker Compose Service
After=docker.service
Requires=docker.service

[Service]
# `docker compose up` stays in the foreground, so systemd tracks it directly.
Type=simple
WorkingDirectory={{ jellyfin_directory }}
# This account must be able to reach the Docker daemon socket.
User={{ jellyfin_user }}
ExecStart=/usr/bin/docker compose up --remove-orphans
ExecStop=/usr/bin/docker compose down
Restart=on-failure
RestartSec=30

[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,62 @@
# Kottos — Deployment Configuration
# Single source of truth for agent topology, ports, and registry metadata.
# Read by Pallas at startup.
#
# Namespace and agent ports are taken from the kottos_* inventory variables.
# Each default below reproduces the value that was previously hard-coded, so
# rendering is unchanged wherever a variable is not set.
#
# The Jinja block tags are kept at column 0 on purpose: with trim_blocks on and
# lstrip_blocks off, leading whitespace before a tag would be emitted into the
# output and shift the indentation of the following YAML line.
name: kottos
version: "1.0.0"
host: {{ kottos_bind_host | default(inventory_hostname) }}
namespace: {{ kottos_namespace | default('ca.helu.kottos') }}
registry_port: {{ kottos_registry_port }}

agents:
  harper:
    module: agents.harper
    port: {{ kottos_harper_port | default(24101) }}
    title: Harper
    description: "Scrappy engineer — rapid prototyping, hacking, and creative problem-solving"
    depends_on: [research, tech_research]
{% if kottos_harper_model is defined %}
    model: {{ kottos_harper_model }}
{% endif %}

  scotty:
    module: agents.scotty
    port: {{ kottos_scotty_port | default(24102) }}
    title: Scotty
    description: "Systems administration expert — infrastructure diagnostics, security hardening, and keeping everything running"
    depends_on: [tech_research]
{% if kottos_scotty_model is defined %}
    model: {{ kottos_scotty_model }}
{% endif %}

  research:
    module: agents.research
    port: {{ kottos_research_port | default(24150) }}
    title: Research Agent
    description: "Web search via Argos and knowledge graph via Neo4j"
{% if kottos_research_model is defined %}
    model: {{ kottos_research_model }}
    # Per-agent capability overrides, emitted only with a per-agent model.
    model_capabilities:
      vision: {{ kottos_research_model_vision | default(true) }}
      context_window: {{ kottos_research_model_context_window | default(16384) }}
      max_output_tokens: {{ kottos_research_model_max_output_tokens | default(8192) }}
{% endif %}

  tech_research:
    module: agents.tech_research
    port: {{ kottos_tech_research_port | default(24151) }}
    title: Tech Research
    description: "Technical investigation — library comparisons, API docs, framework patterns, code examples"
{% if kottos_tech_research_model is defined %}
    model: {{ kottos_tech_research_model }}
{% endif %}

  case:
    module: agents.case
    port: {{ kottos_case_port | default(24152) }}
    title: CASE
    description: "Field systems agent — SD card imaging, LAN scanning, and storage operations on korax.helu.ca"
    depends_on: []
{% if kottos_case_model is defined %}
    model: {{ kottos_case_model }}
{% endif %}

219
ansible/kottos/deploy.yml Normal file
View File

@@ -0,0 +1,219 @@
---
- name: Deploy Kottos AI Agent Platform
hosts: ubuntu
vars:
ansible_common_remote_group: "{{ kottos_group | default([]) }}"
allow_world_readable_tmpfiles: true
handlers:
- name: restart kottos
become: true
ansible.builtin.systemd:
name: kottos
state: restarted
tasks:
- name: Check if host has kottos service
ansible.builtin.set_fact:
has_kottos_service: "{{ 'kottos' in services | default([]) }}"
- name: Skip hosts without kottos service
ansible.builtin.meta: end_host
when: not has_kottos_service
- name: Install required packages
become: true
ansible.builtin.apt:
name:
- acl
- npm
- curl
state: present
update_cache: true
- name: Create Kottos group
become: true
ansible.builtin.group:
name: "{{ kottos_group }}"
state: present
- name: Create Kottos user
become: true
ansible.builtin.user:
name: "{{ kottos_user }}"
group: "{{ kottos_group }}"
home: "{{ kottos_directory }}"
shell: /bin/bash
system: true
create_home: false
- name: Add keeper_user to kottos group
become: true
ansible.builtin.user:
name: "{{ keeper_user }}"
groups: "{{ kottos_group }}"
append: true
- name: Add kottos user to docker group
become: true
ansible.builtin.user:
name: "{{ kottos_user }}"
groups: docker
append: true
notify: restart kottos
- name: Reset connection to pick up new group membership
ansible.builtin.meta: reset_connection
- name: Create Kottos directory
become: true
ansible.builtin.file:
path: "{{ kottos_directory }}"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
state: directory
mode: '750'
- name: Create vendored Pallas directory
become: true
ansible.builtin.file:
path: "{{ kottos_directory }}/vendor/pallas"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
state: directory
mode: '750'
- name: Ensure tar is installed for unarchive task
become: true
ansible.builtin.apt:
name:
- tar
state: present
update_cache: true
- name: Ensure Python 3.13, venv, dev headers, and ACL are installed
become: true
ansible.builtin.apt:
name:
- python3.13
- python3.13-venv
- python3.13-dev
- acl
state: present
update_cache: true
# Release tarballs are taken from ~/rel and unpacked into the Kottos directory;
# kottos_rel / pallas_rel select the versions and must be supplied by the
# inventory or on the command line (they are not set in this play).
- name: Transfer and unarchive Kottos release
become: true
ansible.builtin.unarchive:
src: "~/rel/kottos_{{ kottos_rel }}.tar"
dest: "{{ kottos_directory }}"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '550'
notify: restart kottos
# Pallas is vendored next to the application rather than fetched by pip.
- name: Transfer and unarchive vendored Pallas source
become: true
ansible.builtin.unarchive:
src: "~/rel/pallas_{{ pallas_rel }}.tar"
dest: "{{ kottos_directory }}/vendor/pallas"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '550'
notify: restart kottos
# Replaces the quoted `pallas-mcp @ git+ssh://…` requirement in pyproject.toml
# with a file:// reference to the vendored copy unpacked above.
# NOTE(review): this edits a file that comes from the release archive, so a
# later run's unarchive will likely detect the difference, re-extract, and
# re-trigger this replace and the restart handler — confirm that is acceptable.
- name: Rewrite pallas-mcp dependency to use vendored local path
become: true
ansible.builtin.replace:
path: "{{ kottos_directory }}/pyproject.toml"
regexp: '"pallas-mcp @ git\+ssh://[^"]+"'
replace: '"pallas-mcp @ file://{{ kottos_directory }}/vendor/pallas"'
notify: restart kottos
- name: Create virtual environment for Kottos (Python 3.13)
become: true
become_user: "{{ kottos_user }}"
ansible.builtin.command:
cmd: "python3.13 -m venv {{ kottos_directory }}/.venv/"
creates: "{{ kottos_directory }}/.venv/bin/activate"
- name: Install wheel and mcp-server-time in virtualenv
become: true
become_user: "{{ kottos_user }}"
ansible.builtin.pip:
name:
- wheel
- mcp-server-time
state: latest
virtualenv: "{{ kottos_directory }}/.venv"
- name: Install Kottos (and its rewritten local pallas-mcp) in virtualenv
become: true
become_user: "{{ kottos_user }}"
ansible.builtin.pip:
chdir: "{{ kottos_directory }}"
name: .
virtualenv: "{{ kottos_directory }}/.venv"
virtualenv_command: python3.13 -m venv
notify: restart kottos
- name: Template agents.yaml
become: true
ansible.builtin.template:
src: agents.yaml.j2
dest: "{{ kottos_directory }}/agents.yaml"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '640'
notify: restart kottos
- name: Template fastagent.config.yaml
become: true
ansible.builtin.template:
src: fastagent.config.yaml.j2
dest: "{{ kottos_directory }}/fastagent.config.yaml"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '640'
notify: restart kottos
- name: Template fastagent.secrets.yaml
become: true
ansible.builtin.template:
src: fastagent.secrets.yaml.j2
dest: "{{ kottos_directory }}/fastagent.secrets.yaml"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '640'
notify: restart kottos
- name: Template systemd service file
become: true
ansible.builtin.template:
src: kottos.service.j2
dest: /etc/systemd/system/kottos.service
owner: root
group: root
mode: '644'
notify: restart kottos
- name: Enable and start kottos service
become: true
ansible.builtin.systemd:
name: kottos
enabled: true
state: started
daemon_reload: true
- name: Flush handlers to restart service before validation
ansible.builtin.meta: flush_handlers
- name: Validate Kottos registry liveness
ansible.builtin.uri:
url: "http://localhost:{{ kottos_registry_port }}/live"
status_code: 200
return_content: true
register: kottos_live
retries: 10
delay: 5
until: kottos_live.status == 200

View File

@@ -0,0 +1,131 @@
# Kottos — Configuration
# LLM provider and MCP server settings.
# Secrets (api_key, tokens) live in fastagent.secrets.yaml (gitignored)
#
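# This template is intended to be byte-identical between environments
# (Virgo dev, Taurus prod). All environment-specific values come from
# host_vars or group_vars/all/vars.yml. Do NOT introduce environment-
# specific literals here.
#
# NOTE(review): the server URLs below are read from variables named
# kottos_*_mcp_url, while the puck host_vars define kottos_argos_url,
# kottos_gitea_url, kottos_grafana_url, etc. Confirm the *_mcp_url names are
# defined elsewhere (e.g. group_vars); otherwise rendering fails on an
# undefined variable.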
# Default Model Definition
default_model: {{ kottos_default_model }}
# Declares capabilities for models not in fast-agent's ModelDatabase.
# vision: true adds image/jpeg, image/png, image/webp to the tokenizer list.
model_capabilities:
vision: {{ kottos_model_vision }}
context_window: {{ kottos_model_context_window }}
max_output_tokens: {{ kottos_model_max_output_tokens }}
# LLM Providers
anthropic:
base_url: {{ kottos_anthropic_base_url }}
generic:
base_url: {{ kottos_generic_base_url }}
openai:
base_url: {{ kottos_openai_base_url }}
# MCP Servers — alphabetical to match the dev sample (kottos/fastagent.config.yaml)
mcp:
servers:
## Andromeda Shell & File Operations — Kernos for Harper
### Auth header provided by fastagent.secrets.yaml (per-agent Kernos token)
andromeda:
transport: http
url: "{{ kottos_andromeda_mcp_url }}"
## Argos Web Search & Page Fetch
### No Auth
argos:
transport: http
url: "{{ kottos_argos_mcp_url }}"
## Argus Shell & File Operations — Kernos for Scotty
### Auth header provided by fastagent.secrets.yaml (per-agent Kernos token)
argus:
transport: http
url: "{{ kottos_argus_mcp_url }}"
## CASE Field Systems — LAN, SD Card, Provisioning
### No Auth
case:
transport: http
url: "http://{{ kottos_case_host }}:{{ kottos_case_port }}"
## Context7 Library/framework documentation (local stdio)
context7:
command: "npx"
args: ["-y", "@upstash/context7-mcp"]
## Gitea Git Repository Management
### No client auth (server-side auth only)
gitea:
transport: http
url: "{{ kottos_gitea_mcp_url }}"
## GitHub MCP Server (local Docker, stdio)
### GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml
github:
command: "docker"
args:
- "run"
- "-i"
- "--rm"
- "-e"
- "GITHUB_PERSONAL_ACCESS_TOKEN"
- "ghcr.io/github/github-mcp-server"
## Grafana Observability
### No Auth
grafana:
transport: http
url: "{{ kottos_grafana_mcp_url }}"
## Korax Shell & File Operations — Kernos for CASE
### Auth header provided by fastagent.secrets.yaml (per-agent Kernos token)
korax:
transport: http
url: "{{ kottos_korax_mcp_url }}"
load_on_start: false
## Mnemosyne Knowledge Library — workspace-scoped
### Auth is a long-lived team JWT rendered into fastagent.secrets.yaml from
### the OCI Vault entry {env}-mnemosyne-kottos-token.
mnemosyne:
transport: http
url: "{{ kottos_mnemosyne_mcp_url }}"
## Neo4j Cypher Memory Graph
neo4j_cypher:
transport: http
url: "{{ kottos_neo4j_mcp_url }}"
## Kottos internal sub-agents
### Research (Web, Knowledge)
research:
transport: http
url: "{{ kottos_research_mcp_url }}"
## Rommie Agent S Computer Use Agent
rommie:
transport: http
url: "{{ kottos_rommie_mcp_url }}"
load_on_start: false
### Research (Web, Context7)
tech_research:
transport: http
url: "{{ kottos_tech_research_mcp_url }}"
## Current time and time calculator (local stdio)
time:
command: "{{ kottos_directory }}/.venv/bin/mcp-server-time"
args: ["--local-timezone={{ kottos_timezone | default('America/Toronto') }}"]
# fast-agent's own logger. The level follows kottos_fastagent_log_level from
# the inventory and falls back to the previously fixed value, info, when that
# variable is not set.
logger:
  type: console
  level: {{ kottos_fastagent_log_level | default('info') }}
  progress_display: true
  show_chat: true
  show_tools: true
  truncate_tools: true

View File

@@ -0,0 +1,35 @@
# Kottos — Secrets
# Managed by Ansible. Values fetched from OCI Vault at deploy time.
# Merges with fastagent.config.yaml (secrets take precedence).
openai:
api_key: "{{ kottos_openai_api_key }}"
anthropic:
api_key: "{{ kottos_anthropic_api_key }}"
mcp:
servers:
# Per-agent Kernos MCP bearer tokens so Kernos can distinguish callers.
# Kottos itself does not consume these — they are surfaced to each agent
# module via fast-agent's server auth headers below.
argus:
headers:
Authorization: "Bearer {{ scotty_kernos_mcp_token }}"
andromeda:
headers:
Authorization: "Bearer {{ harper_kernos_mcp_token }}"
korax:
headers:
Authorization: "Bearer {{ case_kernos_mcp_token }}"
# Downstream MCP bearer tokens
arke:
headers:
Authorization: "Bearer {{ kottos_arke_mcp_token }}"
mnemosyne:
headers:
Authorization: "Bearer {{ mnemosyne_kottos_token }}"
github:
env:
GITHUB_PERSONAL_ACCESS_TOKEN: "{{ kottos_github_pa_token }}"

View File

@@ -0,0 +1,24 @@
# systemd unit template for the Kottos service. The {{ kottos_* }} placeholders
# are filled in when the template is rendered.
[Unit]
Description=Kottos AI Agent Platform
# Start after network-online.target, and pull that target in.
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
# Run as the dedicated service account, from the install directory.
User={{ kottos_user }}
Group={{ kottos_group }}
WorkingDirectory={{ kottos_directory }}
# The project virtualenv's bin directory is placed first on PATH.
Environment="PATH={{ kottos_directory }}/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
# The service process is the `pallas` entry point installed in the virtualenv.
ExecStart={{ kottos_directory }}/.venv/bin/pallas
# Restart on every exit, waiting 10 seconds between attempts.
Restart=always
RestartSec=10
# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
# NOTE(review): ProtectHome=true hides /home, /root and /run/user from the
# service — confirm kottos_directory is never located under one of those.
ProtectHome=true
# With ProtectSystem=strict the file system is read-only for the service;
# this re-opens the install directory for writes.
ReadWritePaths={{ kottos_directory }}
[Install]
WantedBy=multi-user.target

34
ansible/kottos/remove.yml Normal file
View File

@@ -0,0 +1,34 @@
---
# Removes Kottos from a host: stops and disables the systemd unit, deletes the
# unit file, reloads systemd, then deletes the install directory.
- name: Remove Kottos AI Agent Platform
hosts: ubuntu
become: true
tasks:
# True only when the host's `services` list contains 'kottos'; an undefined
# `services` is treated as an empty list.
- name: Check if host has kottos service
ansible.builtin.set_fact:
has_kottos_service: "{{ 'kottos' in services | default([]) }}"
# end_host ends the play for this host only; other hosts keep going.
- name: Skip hosts without kottos service
ansible.builtin.meta: end_host
when: not has_kottos_service
# ignore_errors: a failure to stop/disable the unit does not abort the removal
# (presumably covers hosts where the unit was never installed — confirm).
- name: Stop and disable kottos service
ansible.builtin.systemd:
name: kottos
state: stopped
enabled: false
ignore_errors: true
- name: Remove systemd service file
ansible.builtin.file:
path: /etc/systemd/system/kottos.service
state: absent
# Make systemd forget the unit file that was just deleted.
- name: Reload systemd daemon
ansible.builtin.systemd:
daemon_reload: true
# Deletes the whole install directory, including everything inside it.
- name: Remove Kottos directory
ansible.builtin.file:
path: "{{ kottos_directory }}"
state: absent

84
ansible/kottos/stage.yml Normal file
View File

@@ -0,0 +1,84 @@
---
# Runs on the control host (localhost). For both Kottos and Pallas it clones or
# refreshes the git repository under repo_dir and writes a `git archive`
# tarball of the requested ref into rel_dir.
# rel_dir, repo_dir, kottos_rel and pallas_rel are not defined in this file —
# presumably supplied by inventory or extra vars; confirm.
- name: Stage Kottos and Pallas release tarballs
hosts: localhost
gather_facts: false
vars:
kottos_archive_path: "{{ rel_dir }}/kottos_{{ kottos_rel }}.tar"
kottos_repo_url: "ssh://git@git.helu.ca:22022/r/kottos.git"
kottos_repo_dir: "{{ repo_dir }}/kottos"
pallas_archive_path: "{{ rel_dir }}/pallas_{{ pallas_rel }}.tar"
pallas_repo_url: "ssh://git@git.helu.ca:22022/r/pallas.git"
pallas_repo_dir: "{{ repo_dir }}/pallas"
tasks:
- name: Ensure release directory exists
ansible.builtin.file:
path: "{{ rel_dir }}"
state: directory
mode: '755'
- name: Ensure repo directory exists
ansible.builtin.file:
path: "{{ repo_dir }}"
state: directory
mode: '755'
# --- Kottos ------------------------------------------------------------
# The result is registered so the follow-up tasks can tell whether this task
# changed anything. ignore_errors lets the play continue when the task fails.
- name: Clone Kottos repository if not present
ansible.builtin.git:
repo: "{{ kottos_repo_url }}"
dest: "{{ kottos_repo_dir }}"
version: "{{ kottos_rel }}"
accept_hostkey: true
register: kottos_clone
ignore_errors: true
# Runs only when the clone task reported no change.
# NOTE(review): a failed clone presumably also reports "not changed", so this
# would run against a possibly missing directory — confirm.
- name: Fetch all remote branches and tags (kottos)
ansible.builtin.command: git fetch --all
args:
chdir: "{{ kottos_repo_dir }}"
when: kottos_clone is not changed
changed_when: false
# NOTE(review): `git pull` needs a checked-out branch; if kottos_rel is a tag
# or commit the working tree is presumably on a detached HEAD and this may
# fail — confirm against the refs actually used for releases.
- name: Pull latest changes (kottos)
ansible.builtin.command: git pull
args:
chdir: "{{ kottos_repo_dir }}"
when: kottos_clone is not changed
changed_when: false
# Runs on every play and is always reported as changed (changed_when: true).
- name: Create Kottos archive for specified release
ansible.builtin.command: git archive -o "{{ kottos_archive_path }}" "{{ kottos_rel }}"
args:
chdir: "{{ kottos_repo_dir }}"
changed_when: true
# --- Pallas (kottos runtime dependency) --------------------------------
# Same clone / fetch / pull / archive sequence as the Kottos section, using the
# pallas_* variables; the review notes there apply here as well.
- name: Clone Pallas repository if not present
ansible.builtin.git:
repo: "{{ pallas_repo_url }}"
dest: "{{ pallas_repo_dir }}"
version: "{{ pallas_rel }}"
accept_hostkey: true
register: pallas_clone
ignore_errors: true
- name: Fetch all remote branches and tags (pallas)
ansible.builtin.command: git fetch --all
args:
chdir: "{{ pallas_repo_dir }}"
when: pallas_clone is not changed
changed_when: false
- name: Pull latest changes (pallas)
ansible.builtin.command: git pull
args:
chdir: "{{ pallas_repo_dir }}"
when: pallas_clone is not changed
changed_when: false
# Runs on every play and is always reported as changed (changed_when: true).
- name: Create Pallas archive for specified release
ansible.builtin.command: git archive -o "{{ pallas_archive_path }}" "{{ pallas_rel }}"
args:
chdir: "{{ pallas_repo_dir }}"
changed_when: true

View File

@@ -4,47 +4,17 @@
"command": "/srv/mcpo/.venv/bin/python",
"args": ["/srv/mcpo/.venv/bin/mcp-server-time", "--local-timezone=America/Toronto"]
},
"upstash-context7": {
"command": "npx",
"args": [
"-y",
"@upstash/context7-mcp"
]
},
"angelia": {
"url": "https://ouranos.helu.ca/mcp/sse/",
"headers": {
"Authorization": "Bearer LmDTU1OoQm7nk8-T7NtGwwA5aut7LqcpVYpLxRKUS51klljJkFUbmu3KYnR8V6Ww"
}
},
"argos": {
"type": "streamable_http",
"url": "{{argos_mcp_url}}"
},
"athena": {
"url": "https://athena.ouranos.helu.ca/mcp/sse/",
"headers": {
"Authorization": "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIyIiwidXNlcl9pZCI6MiwidXNlcm5hbWUiOiJyQGhlbHUuY2EiLCJpc3MiOiJhdGhlbmEiLCJhdWQiOiJhdGhlbmEtbWNwIiwiaWF0IjoxNzczODc4MDgwLCJrZXlfbmFtZSI6Ik1pcmFuZGEgTUNQTyBLZXkiLCJ0ZW5hbnRfaWQiOjF9.bpFKRbfEygKOW6_UlfQ7H5ZZZ5-LgMJ2UP653GhpZ5A"
},
"caliban": {
"type": "streamable_http",
"url": "{{caliban_mcp_url}}"
},
"gitea": {
"type": "streamable_http",
"url": "{{gitea_mcp_url}}"
},
"korax": {
"type": "streamable_http",
"url": "{{korax_mcp_url}}"
},
"neo4j-cypher": {
"type": "streamable_http",
"url": "{{neo4j_mcp_url}}"
},
"nike": {
"type": "streamable_http",
"url": "{{nike_mcp_url}}"
}
}
}

View File

@@ -24,9 +24,9 @@
group: "{{neo4j_group}}"
system: true
- name: Add group neo4j to keeper_user
- name: Add group neo4j to user ponos
ansible.builtin.user:
name: "{{keeper_user}}"
name: ponos
groups: "{{neo4j_group}}"
append: true
@@ -38,6 +38,14 @@
state: directory
mode: '750'
- name: Create neo4j data directory
ansible.builtin.file:
path: "{{neo4j_directory}}/data"
owner: "{{neo4j_user}}"
group: "{{neo4j_group}}"
state: directory
mode: '750'
- name: Template docker-compose file
ansible.builtin.template:
src: docker-compose.yml.j2

View File

@@ -1,6 +1,7 @@
services:
neo4j:
image: neo4j:{{neo4j_version}}
pull_policy: always
container_name: neo4j
restart: unless-stopped
ports:
@@ -11,13 +12,16 @@ services:
- neo4j_logs:/logs
- neo4j_plugins:/plugins
environment:
NEO4J_AUTH: "{{neo4j_auth_user}}/{{neo4j_auth_password}}"
# APOC Plugin
NEO4J_PLUGINS: '["apoc"]'
NEO4J_AUTH: "{{neo4j_user}}/{{neo4j_password}}"
# APOC Plugin — core ("apoc") is required by apoc-extended.
# Listing only apoc-extended fails to expose apoc.version(),
# apoc.coll.*, apoc.date.* — declare both.
NEO4J_PLUGINS: '["apoc", "apoc-extended"]'
NEO4J_apoc_export_file_enabled: "true"
NEO4J_apoc_import_file_enabled: "true"
NEO4J_apoc_import_file_use__neo4j__config: "true"
NEO4J_dbms_security_procedures_unrestricted: "{{neo4j_apoc_unrestricted}}"
NEO4J_server_default__listen__address: "0.0.0.0"
logging:
driver: syslog
options:
@@ -25,7 +29,31 @@ services:
syslog-format: "{{syslog_format}}"
tag: "neo4j"
neo4j-exporter:
image: stscoundrel/neo4j-apoc-exporter:v0.1.0
restart: unless-stopped
ports:
- "{{neo4j_metrics_port}}:17687"
environment:
- NEO4J_URI=bolt://neo4j:7687
- NEO4J_USER={{neo4j_user}}
- NEO4J_PASSWORD={{neo4j_password}}
- EXPORTER_PORT=17687
depends_on:
- neo4j
logging:
driver: syslog
options:
syslog-address: "tcp://127.0.0.1:{{neo4j_syslog_port}}"
syslog-format: "{{syslog_format}}"
tag: "neo4j-exporter"
volumes:
neo4j_data:
driver: local
driver_opts:
type: none
device: {{neo4j_directory}}/data
o: bind
neo4j_logs:
neo4j_plugins:

View File

@@ -1,7 +1,7 @@
# Generated by Ansible - do not edit manually
services:
neo4j-cypher:
image: mcp/neo4j-cypher:latest
pull_policy: always
image: mcp/neo4j-cypher:{{ neo4j_mcp_image_version }}
container_name: neo4j-cypher
restart: unless-stopped
ports:
@@ -9,14 +9,14 @@ services:
environment:
- NEO4J_URI=bolt://{{neo4j_host}}:{{neo4j_bolt_port}}
- NEO4J_USERNAME=neo4j
- NEO4J_PASSWORD={{neo4j_auth_password}}
- NEO4J_PASSWORD={{neo4j_cypher_password}}
- NEO4J_DATABASE=neo4j
- NEO4J_TRANSPORT=http
- NEO4J_MCP_SERVER_HOST=0.0.0.0
- NEO4J_MCP_SERVER_PORT=8000
- NEO4J_MCP_SERVER_PATH=/mcp
- NEO4J_NAMESPACE=local
- NEO4J_MCP_SERVER_ALLOWED_HOSTS=localhost,127.0.0.1,miranda.incus,rosalind.incus,miranda.incus:{{neo4j_cypher_mcp_port}}
- NEO4J_MCP_SERVER_ALLOWED_HOSTS={{neo4j_mcp_server_allowed_hosts}}
- NEO4J_MCP_SERVER_ALLOW_ORIGINS=
- NEO4J_READ_TIMEOUT=30
logging:

View File

@@ -203,6 +203,9 @@
- { user: "{{ nike_db_user }}", password: "{{ nike_db_password }}" }
- { user: "{{ periplus_db_user }}", password: "{{ periplus_db_password }}" }
- { user: "{{ daedalus_db_user }}", password: "{{ daedalus_db_password }}" }
- { user: "{{ mnemosyne_db_user }}", password: "{{ mnemosyne_db_password }}" }
- { user: "{{ hold_slayer_db_user }}", password: "{{ hold_slayer_db_password }}" }
- { user: "{{ hecate_db_user }}", password: "{{ hecate_db_password }}" }
no_log: true
- name: Create application databases with owners
@@ -226,6 +229,9 @@
- { name: "{{ nike_db_name }}", owner: "{{ nike_db_user }}" }
- { name: "{{ periplus_db_name }}", owner: "{{ periplus_db_user }}" }
- { name: "{{ daedalus_db_name }}", owner: "{{ daedalus_db_user }}" }
- { name: "{{ mnemosyne_db_name }}", owner: "{{ mnemosyne_db_user }}" }
- { name: "{{ hold_slayer_db_name }}", owner: "{{ hold_slayer_db_user }}" }
- { name: "{{ hecate_db_name }}", owner: "{{ hecate_db_user }}" }
- name: Enable postgis and pg_trgm extensions in periplus database
community.postgresql.postgresql_ext:
@@ -254,6 +260,7 @@
- "{{ spelunker_db_name }}"
- "{{ anythingllm_db_name }}"
- "{{ daedalus_db_name }}"
- "{{ hold_slayer_db_name }}"
handlers:
- name: restart postgresql

View File

@@ -244,6 +244,23 @@ groups:
summary: "High log ingestion rate"
description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging"
# ============================================================================
# Django Application Alerts (generic — any Django app exporting the counter)
# ============================================================================
# Apps emit django_superuser_logins_total from a user_logged_in signal when
# the authenticating user is a superuser. The job/component labels identify
# which app fired; forensic detail (user, IP) is in the matching Loki line.
- name: django_alerts
rules:
- alert: DjangoSuperuserLogin
expr: increase(django_superuser_logins_total[5m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: "Superuser login on {{ $labels.job }}"
description: "A superuser account just logged in to {{ $labels.job }} (component {{ $labels.component }}). This account is rarely used — confirm it was expected. Forensic detail (user, IP) in Loki: {service=\"{{ $labels.job }}\"} |= \"event=superuser_login\"."
# ============================================================================
# Daedalus Application Alerts
# ============================================================================
@@ -312,6 +329,120 @@ groups:
summary: "Daedalus S3 error rate above 1%"
description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
# ============================================================================
# Mnemosyne Application Alerts
# ============================================================================
# One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint.
# The Django app container hosts the single prometheus_client registry that
# both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool
# call counters) write to, so "MCP is broken" signals show up as
# ``mcp_tool_invocations_total{status="error"}`` on the same job rather
# than a separate up{} series.
- name: mnemosyne_alerts
rules:
- alert: MnemosyneDown
expr: up{job="mnemosyne"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Mnemosyne is down"
description: "The Mnemosyne /metrics endpoint has been unreachable for more than 2 minutes. Both the Django app and the MCP server (same container family) are presumed unavailable."
- alert: MnemosyneHighErrorRate
expr: |
sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m]))
/ sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05
for: 5m
labels:
severity: warning
annotations:
summary: "Mnemosyne HTTP 5xx error rate above 5%"
description: "Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes."
- alert: MnemosyneSlowResponses
expr: |
histogram_quantile(0.95,
sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m]))
) > 5
for: 5m
labels:
severity: warning
annotations:
summary: "Mnemosyne p95 response time above 5s"
description: "Mnemosyne p95 response latency is {{ $value | printf \"%.2f\" }}s over the last 5 minutes."
# MCP tool-call error surface — owned by mcp_server.metrics on the
# same /metrics endpoint. This complements MnemosyneDown by catching
# "app is up but the MCP layer is sick" — e.g. auth token lookups are
# failing, or Neo4j vector search is 500-ing.
- alert: MnemosyneMCPToolErrors
expr: |
sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m]))
/ sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10
for: 5m
labels:
severity: warning
annotations:
summary: "Mnemosyne MCP tool error rate above 10%"
description: "MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service=\"mnemosyne\", component=\"mcp\"})."
# Celery queue depth — high pending count usually means the embedding
# worker is stuck or throttled by the embedding provider. Requires
# ``celery-prometheus-exporter`` or similar to emit ``celery_queue_length``;
# if that is not deployed yet, this rule simply never fires.
- alert: MnemosyneCeleryBacklog
expr: |
sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100
for: 10m
labels:
severity: warning
annotations:
summary: "Mnemosyne Celery backlog on {{ $labels.queue }}"
description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
# ============================================================================
# Neo4j Alerts (neo4j-apoc-exporter sidecar)
# ============================================================================
# Metrics come from stscoundrel/neo4j-apoc-exporter, which connects to
# Neo4j over Bolt and surfaces apoc.monitor.* gauges plus standard JVM
# metrics. "Exporter down" therefore covers both "exporter container
# crashed" and "exporter cannot reach Bolt" — either way Neo4j is
# effectively unobservable. Hostname-only — purpose of each instance
# is implied by the host (e.g. ariel = LLM memory, umbriel = Mnemosyne).
- name: neo4j_alerts
rules:
- alert: Neo4jExporterDown
expr: up{job="neo4j"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Neo4j exporter down on {{ $labels.instance }}"
description: "The neo4j-apoc-exporter on {{ $labels.instance }} has been unreachable for more than 5 minutes. Either the sidecar container is down or it cannot connect to Neo4j over Bolt — check `docker ps` and `docker logs neo4j-exporter` on the host."
- alert: Neo4jHighRollbackRate
expr: |
rate(neo4j_monitor_tx_rolledBackTx[10m])
/ clamp_min(rate(neo4j_monitor_tx_totalOpenedTx[10m]), 1) > 0.10
for: 10m
labels:
severity: warning
annotations:
summary: "Neo4j transaction rollback rate above 10% on {{ $labels.instance }}"
description: "More than 10% of transactions on {{ $labels.instance }} have rolled back over the last 10 minutes — check application logs in Loki ({job=\"neo4j\", hostname=\"{{ $labels.instance }}\"})."
- alert: Neo4jStoreGrowthStalled
expr: |
rate(neo4j_monitor_tx_totalOpenedTx[15m]) == 0
and neo4j_monitor_tx_currentOpenedTx > 0
for: 15m
labels:
severity: warning
annotations:
summary: "Neo4j has open transactions but zero throughput on {{ $labels.instance }}"
description: "{{ $labels.instance }} shows {{ $value }} currently-open transactions but no new transactions opened in 15 minutes — possible Bolt-side hang or stuck query."
# Red Panda Seal of Approval 🐼
# "If the metrics aren't red, go back to bed"
{% endraw %}

View File

@@ -200,14 +200,6 @@
# Grafana
# ===========================================================================
- name: Create dashboards directory
ansible.builtin.file:
path: /var/lib/grafana/dashboards
state: directory
owner: grafana
group: grafana
mode: '750'
- name: Template Grafana main configuration
ansible.builtin.template:
src: "grafana.ini.j2"

View File

@@ -47,8 +47,63 @@ scrape_configs:
- job_name: 'daedalus'
static_configs:
- targets: ['puck.incus:22181']
- targets: ['{{ daedalus_metrics_host }}:{{ daedalus_metrics_port }}']
metrics_path: '/metrics'
scrape_interval: 15s
# Mnemosyne — app exposes /metrics on the Django container (proxied via
# nginx); a single prometheus_client process registry serves both
# django-prometheus (HTTP/Celery) and the MCP server's tool-call counters
# (the mcp container itself does not expose /metrics). Web is an
# nginx-prometheus-exporter sidecar that scrapes the web container's
# stub_status and re-exposes it in Prometheus format.
- job_name: 'mnemosyne'
metrics_path: '/metrics'
scrape_interval: 15s
static_configs:
- targets: ['{{ mnemosyne_app_metrics_host }}:{{ mnemosyne_app_metrics_port }}']
labels:
component: app
- targets: ['{{ mnemosyne_web_metrics_host }}:{{ mnemosyne_web_metrics_port }}']
labels:
component: web
# Athena — same shape as Mnemosyne: the Django container exposes /metrics
# (django-prometheus) proxied via nginx on the app port; a separate
# nginx-prometheus-exporter sidecar re-exposes the web container's
# stub_status in Prometheus format on the web-metrics port.
- job_name: 'athena'
metrics_path: '/metrics'
scrape_interval: 15s
static_configs:
- targets: ['{{ athena_app_metrics_host }}:{{ athena_app_metrics_port }}']
labels:
component: app
- targets: ['{{ athena_web_metrics_host }}:{{ athena_web_metrics_port }}']
labels:
component: web
# Pallas — each deployment is one scrape target (registry port).
# Pallas uses a single process-global registry, so per-agent /metrics
# endpoints serve the same snapshot; the `agent` dimension is carried
# as a metric label, not a target. Targets are defined per
# environment in pallas_metrics_targets (host_vars on the Prometheus
# host); instances are differentiated by the `instance` label.
{% if pallas_metrics_targets | default([]) %}
- job_name: 'pallas'
metrics_path: '/metrics'
scrape_interval: 15s
static_configs: {{ pallas_metrics_targets | to_json }}
{% endif %}
# Neo4j — stscoundrel/neo4j-apoc-exporter sidecar connects to the local
# Neo4j over Bolt and exposes apoc.monitor.* (tx/ids/store) plus JVM
# metrics. Targets are listed per-environment in neo4j_metrics_targets
# (host_vars on the Prometheus host) — instances are differentiated by
# hostname only.
# The job is emitted only when neo4j_metrics_targets is defined and non-empty,
# the same guard the pallas job uses. Without it, an environment that has no
# Neo4j exporter targets fails to render the whole Prometheus configuration
# because of the undefined variable.
{% if neo4j_metrics_targets | default([]) %}
- job_name: 'neo4j'
  static_configs:
    - targets: {{ neo4j_metrics_targets | to_json }}
  metrics_path: '/metrics'
  scrape_interval: 15s
{% endif %}
# Red Panda Approved Prometheus Configuration

View File

@@ -29,4 +29,15 @@ ROMMIE_GROUNDING_HEIGHT={{ rommie_grounding_height | default(1024) }}
# ============================================================================
ROMMIE_HOST={{ rommie_host | default('0.0.0.0') }}
ROMMIE_PORT={{ rommie_port }}
ROMMIE_ALLOWED_HOSTS={{ rommie_allowed_hosts }}
# Idle MCP sessions are reaped after this many seconds (<=0 disables).
# Prevents unbounded StreamableHTTP transport accumulation from clients
# that drop their connection without sending an explicit DELETE.
ROMMIE_SESSION_IDLE_TIMEOUT={{ rommie_session_idle_timeout | default(1800) }}
# ============================================================================
# get_screenshot (parent-agent) output
# JPEG-encode and refuse if over the cap (asks operator to lower RDP resolution)
# ============================================================================
ROMMIE_SCREENSHOT_JPEG_QUALITY={{ rommie_screenshot_jpeg_quality | default(80) }}
ROMMIE_SCREENSHOT_MAX_KB={{ rommie_screenshot_max_kb | default(512) }}

View File

@@ -52,6 +52,8 @@
src: .env.j2
dest: "{{rommie_repo}}/.env"
mode: '0600'
notify:
- Restart rommie
- name: Deploy Rommie systemd service
template:

View File

@@ -57,78 +57,3 @@
project_src: "{{searxng_directory}}"
state: present
pull: always
# ===========================================================================
# OAuth2-Proxy Sidecar
# Note: Each host supports at most one OAuth2-Proxy sidecar instance
# (binary shared at /usr/local/bin/oauth2-proxy, unique systemd unit per service)
# ===========================================================================
- name: Create oauth2-proxy directory
ansible.builtin.file:
path: "{{ searxng_oauth2_proxy_dir }}"
owner: root
group: root
state: directory
mode: '0755'
- name: Download oauth2-proxy binary
ansible.builtin.get_url:
url: "https://github.com/oauth2-proxy/oauth2-proxy/releases/download/v{{ searxng_oauth2_proxy_version }}/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.linux-amd64.tar.gz"
dest: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.tar.gz"
mode: '0644'
- name: Extract oauth2-proxy binary
ansible.builtin.unarchive:
src: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.tar.gz"
dest: /tmp
remote_src: true
creates: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.linux-amd64/oauth2-proxy"
- name: Install oauth2-proxy binary
ansible.builtin.copy:
src: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.linux-amd64/oauth2-proxy"
dest: /usr/local/bin/oauth2-proxy
owner: root
group: root
mode: '0755'
remote_src: true
- name: Template oauth2-proxy configuration
ansible.builtin.template:
src: oauth2-proxy-searxng.cfg.j2
dest: "{{ searxng_oauth2_proxy_dir }}/oauth2-proxy.cfg"
owner: root
group: root
mode: '0600'
notify: restart oauth2-proxy-searxng
- name: Template oauth2-proxy systemd service
ansible.builtin.template:
src: oauth2-proxy-searxng.service.j2
dest: /etc/systemd/system/oauth2-proxy-searxng.service
owner: root
group: root
mode: '0644'
notify:
- reload systemd
- restart oauth2-proxy-searxng
# ===========================================================================
# Service Management
# ===========================================================================
- name: Enable and start OAuth2-Proxy service
ansible.builtin.systemd:
name: oauth2-proxy-searxng
enabled: true
state: started
daemon_reload: true
handlers:
- name: reload systemd
ansible.builtin.systemd:
daemon_reload: true
- name: restart oauth2-proxy-searxng
ansible.builtin.systemd:
name: oauth2-proxy-searxng
state: restarted

View File

@@ -0,0 +1,86 @@
---
- name: Deploy OAuth2-Proxy sidecar for SearXNG
hosts: ubuntu
become: true
tasks:
# Decide whether this host gets the SearXNG OAuth2-Proxy sidecar: it must list
# 'searxng' in `services` AND have a non-empty searxng_oauth2_client_id.
# `services | default([])` keeps hosts that define no `services` variable from
# failing on an undefined variable; they are skipped by the following
# end_host task instead. The Kottos removal playbook uses the same guard.
- name: Check if host has searxng service with OAuth2 configured
  ansible.builtin.set_fact:
    has_searxng_oauth2: >-
      {{ 'searxng' in (services | default([]))
      and (searxng_oauth2_client_id | default('')) | length > 0 }}
- name: Skip hosts without SearXNG OAuth2-Proxy configuration
ansible.builtin.meta: end_host
when: not has_searxng_oauth2
# ===========================================================================
# OAuth2-Proxy Sidecar
# Note: Each host supports at most one OAuth2-Proxy sidecar instance
# (binary shared at /usr/local/bin/oauth2-proxy, unique systemd unit per service)
# ===========================================================================
- name: Create oauth2-proxy directory
ansible.builtin.file:
path: "{{ searxng_oauth2_proxy_dir }}"
owner: root
group: root
state: directory
mode: '0755'
- name: Download oauth2-proxy binary
ansible.builtin.get_url:
url: "https://github.com/oauth2-proxy/oauth2-proxy/releases/download/v{{ searxng_oauth2_proxy_version }}/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.linux-amd64.tar.gz"
dest: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.tar.gz"
mode: '0644'
- name: Extract oauth2-proxy binary
ansible.builtin.unarchive:
src: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.tar.gz"
dest: /tmp
remote_src: true
creates: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.linux-amd64/oauth2-proxy"
- name: Install oauth2-proxy binary
ansible.builtin.copy:
src: "/tmp/oauth2-proxy-v{{ searxng_oauth2_proxy_version }}.linux-amd64/oauth2-proxy"
dest: /usr/local/bin/oauth2-proxy
owner: root
group: root
mode: '0755'
remote_src: true
- name: Template oauth2-proxy configuration
ansible.builtin.template:
src: oauth2-proxy-searxng.cfg.j2
dest: "{{ searxng_oauth2_proxy_dir }}/oauth2-proxy.cfg"
owner: root
group: root
mode: '0600'
notify: restart oauth2-proxy-searxng
- name: Template oauth2-proxy systemd service
ansible.builtin.template:
src: oauth2-proxy-searxng.service.j2
dest: /etc/systemd/system/oauth2-proxy-searxng.service
owner: root
group: root
mode: '0644'
notify:
- reload systemd
- restart oauth2-proxy-searxng
- name: Enable and start OAuth2-Proxy service
ansible.builtin.systemd:
name: oauth2-proxy-searxng
enabled: true
state: started
daemon_reload: true
handlers:
- name: reload systemd
ansible.builtin.systemd:
daemon_reload: true
- name: restart oauth2-proxy-searxng
ansible.builtin.systemd:
name: oauth2-proxy-searxng
state: restarted

View File

@@ -18,7 +18,7 @@ server:
bind_address: "0.0.0.0"
secret_key: "{{ searxng_secret_key }}"
base_url: "{{ searxng_base_url }}"
limiter: true
limiter: false
public_instance: false
method: "GET"
image_proxy: true
@@ -32,11 +32,40 @@ ui:
# Red Panda Approved Search Configuration
engines:
# --- General web ---
- name: google
disabled: false
disabled: true
- name: brave
disabled: true
- name: duckduckgo
disabled: false
- name: bing
disabled: false
- name: startpage
disabled: false
- name: mojeek
disabled: false
- name: braveapi
engine: braveapi
api_key: "{{ searxng_brave_api_key }}"
results_per_page: 20
inactive: false
disabled: false
# --- Images: disable engines returning suspended / access denied ---
- name: brave.images
disabled: true
- name: duckduckgo images
disabled: true
- name: pexels
disabled: true
# --- Videos: disable engines returning suspended / access denied ---
- name: brave.videos
disabled: true
- name: vimeo
disabled: true
# --- News: disable engines returning suspended / parsing errors ---
- name: brave.news
disabled: true
- name: bing news
disabled: true

View File

@@ -33,6 +33,9 @@
- name: Deploy SearXNG
import_playbook: searxng/deploy.yml
- name: Deploy SearXNG OAuth2-Proxy sidecar
import_playbook: searxng/deploy_oauth2.yml
- name: Deploy HAProxy
import_playbook: haproxy/deploy.yml
@@ -44,3 +47,12 @@
- name: Deploy Agent S
import_playbook: agent_s/deploy.yml
- name: Deploy Rommie MCP Server
import_playbook: rommie/deploy.yml
- name: Stage Kottos (Pallas FastAgent runtime)
import_playbook: kottos/stage.yml
- name: Deploy Kottos
import_playbook: kottos/deploy.yml

307
dashboards/argos.json Normal file
View File

@@ -0,0 +1,307 @@
{
"title": "Argos",
"uid": "argos",
"tags": ["argos", "mcp", "searxng", "ouranos"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"editable": true,
"fiscalYearStartMonth": 0,
"weekStart": "",
"refresh": "30s",
"time": {"from": "now-1h", "to": "now"},
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [],
"targetBlank": true,
"title": "SearXNG dashboard",
"tooltip": "SearXNG instance probes (miranda, rosalind)",
"type": "link",
"url": "/d/searxng"
}
],
"templating": {
"list": [
{
"name": "prom",
"type": "datasource",
"query": "prometheus",
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
"hide": 0,
"label": "Prometheus datasource"
},
{
"name": "loki",
"type": "datasource",
"query": "loki",
"current": {"selected": false, "text": "Loki", "value": "Loki"},
"hide": 0,
"label": "Loki datasource"
},
{
"name": "instance",
"type": "query",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"query": "label_values(up{job=\"argos\"}, instance)",
"refresh": 1,
"includeAll": true,
"multi": true,
"current": {"selected": true, "text": "All", "value": "$__all"},
"label": "Argos host"
}
]
},
"panels": [
{
"id": 1,
"type": "row",
"title": "Health",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
},
{
"id": 2,
"type": "stat",
"title": "Argos up",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
"targets": [
{"refId": "A", "expr": "up{job=\"argos\", instance=~\"$instance\"}", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 3,
"type": "stat",
"title": "SearXNG instances healthy (per Argos)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 4, "y": 1},
"targets": [
{"refId": "A", "expr": "sum by (instance) (argos_searxng_instance_up{instance=~\"$instance\"})", "legendFormat": "{{instance}}"},
{"refId": "B", "expr": "count by (instance) (argos_searxng_instance_up{instance=~\"$instance\"})", "legendFormat": "{{instance}} total", "hide": true}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name", "colorMode": "value"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 1}, {"color": "green", "value": 2}]}}}
},
{
"id": 4,
"type": "stat",
"title": "Tool error ratio (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 10, "y": 1},
"targets": [
{"refId": "A", "expr": "sum(rate(argos_tool_calls_total{status=\"error\", instance=~\"$instance\"}[5m])) / clamp_min(sum(rate(argos_tool_calls_total{instance=~\"$instance\"}[5m])), 0.0001)", "legendFormat": "errors"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.20}]}}}
},
{
"id": 5,
"type": "stat",
"title": "Tool calls/sec (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 14, "y": 1},
"targets": [
{"refId": "A", "expr": "sum(rate(argos_tool_calls_total{instance=~\"$instance\"}[5m]))", "legendFormat": "calls/s"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "ops"}}
},
{
"id": 6,
"type": "stat",
"title": "Build",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 1},
"targets": [
{"refId": "A", "expr": "argos_build_info{instance=~\"$instance\"}", "legendFormat": "{{instance}} v{{version}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "name", "colorMode": "none"},
"fieldConfig": {"defaults": {"unit": "none"}}
},
{
"id": 10,
"type": "row",
"title": "Tools",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}
},
{
"id": 11,
"type": "timeseries",
"title": "Tool calls/sec by tool (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
"targets": [
{"refId": "A", "expr": "sum by (tool) (rate(argos_tool_calls_total{instance=~\"$instance\"}[5m]))", "legendFormat": "{{tool}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 12,
"type": "timeseries",
"title": "Tool error ratio by tool (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
"targets": [
{"refId": "A", "expr": "sum by (tool) (rate(argos_tool_calls_total{status=\"error\", instance=~\"$instance\"}[5m])) / clamp_min(sum by (tool) (rate(argos_tool_calls_total{instance=~\"$instance\"}[5m])), 0.0001)", "legendFormat": "{{tool}}"}
],
"fieldConfig": {"defaults": {"unit": "percentunit"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 13,
"type": "timeseries",
"title": "Tool latency p50 / p95 / p99 (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.50, sum by (le) (rate(argos_tool_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "p50"},
{"refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(argos_tool_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "p95"},
{"refId": "C", "expr": "histogram_quantile(0.99, sum by (le) (rate(argos_tool_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "p99"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 14,
"type": "timeseries",
"title": "Tool latency p95 by tool (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, tool) (rate(argos_tool_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "{{tool}}"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 20,
"type": "row",
"title": "Upstream SearXNG",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 22}
},
{
"id": 21,
"type": "table",
"title": "SearXNG instances (per-Argos view)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 23},
"targets": [
{"refId": "A", "expr": "argos_searxng_instance_up{instance=~\"$instance\"}", "legendFormat": "{{searxng_instance}}", "format": "table", "instant": true}
],
"transformations": [
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true, "job": true, "environment": true, "hostname": true}}}
],
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "custom": {"cellOptions": {"type": "color-background"}}}}
},
{
"id": 22,
"type": "timeseries",
"title": "Upstream SearXNG requests/sec by instance (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 6, "w": 12, "x": 12, "y": 23},
"targets": [
{"refId": "A", "expr": "sum by (instance, searxng_instance) (rate(argos_searxng_requests_total{instance=~\"$instance\"}[5m]))", "legendFormat": "{{instance}} → {{searxng_instance}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 23,
"type": "timeseries",
"title": "Upstream SearXNG error ratio by instance (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 6, "w": 12, "x": 0, "y": 29},
"targets": [
{"refId": "A", "expr": "sum by (searxng_instance) (rate(argos_searxng_requests_total{status=\"error\", instance=~\"$instance\"}[5m])) / clamp_min(sum by (searxng_instance) (rate(argos_searxng_requests_total{instance=~\"$instance\"}[5m])), 0.0001)", "legendFormat": "{{searxng_instance}}"}
],
"fieldConfig": {"defaults": {"unit": "percentunit"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 24,
"type": "timeseries",
"title": "Upstream SearXNG latency p95 by instance (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 6, "w": 12, "x": 12, "y": 29},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, searxng_instance) (rate(argos_searxng_request_duration_seconds_bucket{instance=~\"$instance\"}[5m])))", "legendFormat": "{{searxng_instance}} p95"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 30,
"type": "row",
"title": "Cache & webpage fetch",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 35}
},
{
"id": 31,
"type": "stat",
"title": "Cache hit ratio (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 36},
"targets": [
{"refId": "A", "expr": "sum(rate(argos_cache_operations_total{operation=\"get\", result=\"hit\", instance=~\"$instance\"}[5m])) / clamp_min(sum(rate(argos_cache_operations_total{operation=\"get\", instance=~\"$instance\"}[5m])), 0.0001)", "legendFormat": "hits"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.10}, {"color": "green", "value": 0.30}]}}}
},
{
"id": 32,
"type": "timeseries",
"title": "Cache ops/sec by result (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 9, "x": 6, "y": 36},
"targets": [
{"refId": "A", "expr": "sum by (operation, result) (rate(argos_cache_operations_total{instance=~\"$instance\"}[5m]))", "legendFormat": "{{operation}}/{{result}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 33,
"type": "timeseries",
"title": "Webpage fetch outcomes/sec (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 9, "x": 15, "y": 36},
"targets": [
{"refId": "A", "expr": "sum by (status) (rate(argos_webpage_fetch_total{instance=~\"$instance\"}[5m]))", "legendFormat": "{{status}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 90,
"type": "row",
"title": "Logs",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 44}
},
{
"id": 91,
"type": "logs",
"title": "argos (Loki)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 12, "w": 24, "x": 0, "y": 45},
"targets": [
{"refId": "A", "expr": "{job=\"argos\"}"}
],
"options": {"showTime": true, "wrapLogMessage": true, "enableLogDetails": true, "dedupStrategy": "none"}
}
]
}

View File

@@ -0,0 +1,702 @@
{
"title": "Daedalus Stack",
"uid": "daedalus-stack",
"tags": ["daedalus", "mnemosyne", "pallas", "ouranos"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"editable": true,
"fiscalYearStartMonth": 0,
"weekStart": "",
"refresh": "30s",
"time": {"from": "now-1h", "to": "now"},
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [],
"targetBlank": true,
"title": "Neo4j dashboard",
"tooltip": "Detailed Neo4j metrics (ariel, umbriel)",
"type": "link",
"url": "/d/neo4j"
},
{
"asDropdown": false,
"icon": "doc",
"includeVars": true,
"keepTime": true,
"tags": [],
"targetBlank": true,
"title": "Explore Logs",
"tooltip": "Loki: daedalus + mnemosyne + pallas",
"type": "link",
"url": "/explore?orgId=1&left=%7B%22datasource%22:%22Loki%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bservice%3D~%5C%22daedalus%7Cmnemosyne%7Cpallas%5C%22%7D%22%7D%5D%7D"
}
],
"templating": {
"list": [
{
"name": "prom",
"type": "datasource",
"query": "prometheus",
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
"hide": 0,
"label": "Prometheus datasource"
},
{
"name": "loki",
"type": "datasource",
"query": "loki",
"current": {"selected": false, "text": "Loki", "value": "Loki"},
"hide": 0,
"label": "Loki datasource"
},
{
"name": "pallas_inst",
"type": "query",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"query": "label_values(up{job=\"pallas\"}, instance)",
"refresh": 1,
"includeAll": true,
"multi": true,
"current": {"selected": true, "text": "All", "value": "$__all"},
"label": "Pallas instance"
},
{
"name": "agent",
"type": "query",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"query": "label_values(pallas_send_message_total{instance=~\"$pallas_inst\"}, agent)",
"refresh": 2,
"includeAll": true,
"multi": true,
"current": {"selected": true, "text": "All", "value": "$__all"},
"label": "Agent"
}
]
},
"panels": [
{
"id": 100,
"type": "row",
"title": "Summary",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
},
{
"id": 101,
"type": "stat",
"title": "Daedalus",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 0, "y": 1},
"targets": [
{"refId": "A", "expr": "up{job=\"daedalus\"}", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 102,
"type": "stat",
"title": "Mnemosyne app",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 3, "y": 1},
"targets": [
{"refId": "A", "expr": "up{job=\"mnemosyne\", component=\"app\"}", "legendFormat": "app"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 103,
"type": "stat",
"title": "Mnemosyne web",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 6, "y": 1},
"targets": [
{"refId": "A", "expr": "up{job=\"mnemosyne\", component=\"web\"}", "legendFormat": "web"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 104,
"type": "stat",
"title": "Pallas up ratio",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 9, "y": 1},
"targets": [
{"refId": "A", "expr": "sum(up{job=\"pallas\"}) / count(up{job=\"pallas\"})", "legendFormat": "up ratio"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.67}, {"color": "green", "value": 1}]}}}
},
{
"id": 105,
"type": "stat",
"title": "Agents healthy",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 12, "y": 1},
"targets": [
{"refId": "A", "expr": "sum(daedalus_agents_by_health{status=\"ok\"}) / clamp_min(sum(daedalus_agents_total), 1)", "legendFormat": "healthy"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.7}, {"color": "green", "value": 1}]}}}
},
{
"id": 106,
"type": "stat",
"title": "Chat p95 (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 3, "x": 15, "y": 1},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_agent_response_duration_seconds_bucket{source=\"chat\"}[5m])))", "legendFormat": "chat p95"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 10}, {"color": "red", "value": 30}]}}}
},
{
"id": 107,
"type": "timeseries",
"title": "Stack up (last hour)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 1},
"targets": [
{"refId": "A", "expr": "up{job=~\"daedalus|mnemosyne|pallas\"}", "legendFormat": "{{job}} {{instance}} {{component}}"}
],
"fieldConfig": {"defaults": {"unit": "short", "min": 0, "max": 1, "custom": {"drawStyle": "line", "lineInterpolation": "stepBefore", "fillOpacity": 10}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 200,
"type": "row",
"title": "Daedalus",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}
},
{
"id": 201,
"type": "stat",
"title": "Daedalus up",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 6},
"targets": [
{"refId": "A", "expr": "daedalus_up", "legendFormat": "daedalus"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 202,
"type": "stat",
"title": "5xx error rate (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 6},
"targets": [
{"refId": "A", "expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)", "legendFormat": "5xx"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
"id": 203,
"type": "stat",
"title": "MCP connections active",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 6},
"targets": [
{"refId": "A", "expr": "sum(daedalus_mcp_connections_active)", "legendFormat": "active"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 204,
"type": "stat",
"title": "Avg context window %",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 6},
"targets": [
{"refId": "A", "expr": "avg(daedalus_chat_context_pct)", "legendFormat": "avg"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percent", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 70}, {"color": "red", "value": 90}]}}}
},
{
"id": 205,
"type": "stat",
"title": "Tokens/sec (5m, total)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 6},
"targets": [
{"refId": "A", "expr": "sum(rate(daedalus_chat_tokens_total[5m]))", "legendFormat": "tok/s"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 210,
"type": "timeseries",
"title": "Chat latency (p50/p95/p99)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 10},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.50, sum by (le) (rate(daedalus_agent_response_duration_seconds_bucket{source=\"chat\"}[5m])))", "legendFormat": "p50"},
{"refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_agent_response_duration_seconds_bucket{source=\"chat\"}[5m])))", "legendFormat": "p95"},
{"refId": "C", "expr": "histogram_quantile(0.99, sum by (le) (rate(daedalus_agent_response_duration_seconds_bucket{source=\"chat\"}[5m])))", "legendFormat": "p99"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 211,
"type": "timeseries",
"title": "Voice pipeline p95 (STT / agent / TTS / total)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 10},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_voice_stt_duration_seconds_bucket[5m])))", "legendFormat": "stt"},
{"refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_voice_agent_duration_seconds_bucket[5m])))", "legendFormat": "agent"},
{"refId": "C", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_voice_tts_duration_seconds_bucket[5m])))", "legendFormat": "tts"},
{"refId": "D", "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_voice_pipeline_duration_seconds_bucket[5m])))", "legendFormat": "total"}
],
"fieldConfig": {"defaults": {"unit": "s", "custom": {"stacking": {"mode": "none"}}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 220,
"type": "timeseries",
"title": "Pallas reach — MCP error ratio by server (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 18},
"targets": [
{"refId": "A", "expr": "sum by (server) (rate(daedalus_mcp_requests_total{status=\"error\"}[5m])) / clamp_min(sum by (server) (rate(daedalus_mcp_requests_total[5m])), 0.0001)", "legendFormat": "{{server}}"}
],
"fieldConfig": {"defaults": {"unit": "percentunit"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 221,
"type": "timeseries",
"title": "Mnemosyne reach — p95 latency by operation (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 18},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(daedalus_mnemosyne_request_duration_seconds_bucket[5m])))", "legendFormat": "{{operation}} p95"},
{"refId": "B", "expr": "sum(rate(daedalus_mnemosyne_requests_total{status=\"error\"}[5m]))", "legendFormat": "errors/s (right)"}
],
"fieldConfig": {"defaults": {"unit": "s"}, "overrides": [{"matcher": {"id": "byName", "options": "errors/s (right)"}, "properties": [{"id": "unit", "value": "ops"}, {"id": "custom.axisPlacement", "value": "right"}]}]},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 230,
"type": "timeseries",
"title": "Token burn by direction (tokens/sec, 5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 26},
"targets": [
{"refId": "A", "expr": "sum by (direction) (rate(daedalus_chat_tokens_total[5m]))", "legendFormat": "{{direction}}"}
],
"fieldConfig": {"defaults": {"unit": "short", "custom": {"drawStyle": "line", "fillOpacity": 20, "stacking": {"mode": "normal"}}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 231,
"type": "timeseries",
"title": "Mnemosyne ingest jobs (status, 5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 26},
"targets": [
{"refId": "A", "expr": "sum by (status) (rate(daedalus_mnemosyne_ingest_jobs_total[5m]))", "legendFormat": "{{status}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 300,
"type": "row",
"title": "Mnemosyne",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 34}
},
{
"id": 301,
"type": "stat",
"title": "App",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 35},
"targets": [
{"refId": "A", "expr": "up{job=\"mnemosyne\", component=\"app\"}", "legendFormat": "app"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 302,
"type": "stat",
"title": "Web (nginx)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 35},
"targets": [
{"refId": "A", "expr": "up{job=\"mnemosyne\", component=\"web\"}", "legendFormat": "web"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 303,
"type": "stat",
"title": "Search rate (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 35},
"targets": [
{"refId": "A", "expr": "sum(rate(mnemosyne_search_requests_total[5m]))", "legendFormat": "req/s"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "reqps"}}
},
{
"id": 304,
"type": "stat",
"title": "Embedding queue depth",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 35},
"targets": [
{"refId": "A", "expr": "mnemosyne_embedding_queue_size", "legendFormat": "queue"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 10}, {"color": "red", "value": 100}]}}}
},
{
"id": 305,
"type": "stat",
"title": "Pipeline in-progress",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 35},
"targets": [
{"refId": "A", "expr": "mnemosyne_pipeline_items_in_progress", "legendFormat": "in-flight"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 306,
"type": "stat",
"title": "MCP tool error rate (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 4, "x": 20, "y": 35},
"targets": [
{"refId": "A", "expr": "sum(rate(mcp_tool_invocations_total{status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total[5m])), 0.0001)", "legendFormat": "err"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
"id": 310,
"type": "timeseries",
"title": "Search rate by type (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 39},
"targets": [
{"refId": "A", "expr": "sum by (search_type) (rate(mnemosyne_search_requests_total[5m]))", "legendFormat": "{{search_type}}"}
],
"fieldConfig": {"defaults": {"unit": "reqps"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 311,
"type": "timeseries",
"title": "Search latency p95 by type (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 39},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, search_type) (rate(mnemosyne_search_duration_seconds_bucket[5m])))", "legendFormat": "{{search_type}} p95"},
{"refId": "B", "expr": "histogram_quantile(0.95, sum by (le) (rate(mnemosyne_search_total_duration_seconds_bucket[5m])))", "legendFormat": "end-to-end p95"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 320,
"type": "timeseries",
"title": "Embedding queue depth over time",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 47},
"targets": [
{"refId": "A", "expr": "mnemosyne_embedding_queue_size", "legendFormat": "queue"}
],
"fieldConfig": {"defaults": {"unit": "short", "custom": {"drawStyle": "line", "fillOpacity": 20}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 321,
"type": "timeseries",
"title": "Embeddings generated (per sec, by model)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 47},
"targets": [
{"refId": "A", "expr": "sum by (model_name) (rate(mnemosyne_embeddings_generated_total[5m]))", "legendFormat": "{{model_name}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 322,
"type": "timeseries",
"title": "Pipeline items (per sec, by status)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 47},
"targets": [
{"refId": "A", "expr": "sum by (status) (rate(mnemosyne_pipeline_items_total[5m]))", "legendFormat": "{{status}}"},
{"refId": "B", "expr": "sum by (error_type) (rate(mnemosyne_embedding_api_errors_total[5m]))", "legendFormat": "api err: {{error_type}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 330,
"type": "timeseries",
"title": "Neo4j @ umbriel — transactions (rate / open)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 55},
"targets": [
{"refId": "A", "expr": "rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"umbriel.*\"}[5m])", "legendFormat": "{{instance}} open rate"},
{"refId": "B", "expr": "neo4j_monitor_tx_currentOpenedTx{instance=~\"umbriel.*\"}", "legendFormat": "{{instance}} current open"}
],
"fieldConfig": {"defaults": {"unit": "short"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 331,
"type": "stat",
"title": "Neo4j @ umbriel — rollback ratio (10m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 55},
"targets": [
{"refId": "A", "expr": "rate(neo4j_monitor_tx_rolledBackTx{instance=~\"umbriel.*\"}[10m]) / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"umbriel.*\"}[10m]), 0.0001)", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
},
{
"id": 332,
"type": "stat",
"title": "Neo4j @ umbriel — store size",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 55},
"targets": [
{"refId": "A", "expr": "neo4j_monitor_store_totalStoreSize{instance=~\"umbriel.*\"}", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "bytes"}}
},
{
"id": 400,
"type": "row",
"title": "Pallas Agents",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 63}
},
{
"id": 401,
"type": "stat",
"title": "Instance up",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 64},
"targets": [
{"refId": "A", "expr": "up{job=\"pallas\", instance=~\"$pallas_inst\"}", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": ""}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 402,
"type": "stat",
"title": "Aggregate agent health (min)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 64},
"targets": [
{"refId": "A", "expr": "min by (instance) (pallas_agent_health_status{instance=~\"$pallas_inst\"})", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "ERROR", "color": "red"}, "0.5": {"text": "DEGRADED", "color": "orange"}, "1": {"text": "OK", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.5}, {"color": "green", "value": 1}]}}}
},
{
"id": 403,
"type": "stat",
"title": "Downstream MCPs up (ratio)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 64},
"targets": [
{"refId": "A", "expr": "sum by (instance) (pallas_downstream_up{instance=~\"$pallas_inst\"}) / clamp_min(count by (instance) (pallas_downstream_up{instance=~\"$pallas_inst\"}), 1)", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.5}, {"color": "green", "value": 1}]}}}
},
{
"id": 404,
"type": "stat",
"title": "Turn error ratio (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 64},
"targets": [
{"refId": "A", "expr": "sum by (instance) (rate(pallas_send_message_total{outcome=\"error\", instance=~\"$pallas_inst\"}[5m])) / clamp_min(sum by (instance) (rate(pallas_send_message_total{instance=~\"$pallas_inst\"}[5m])), 0.0001)", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
"id": 410,
"type": "timeseries",
"title": "Turn latency p95 by agent (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 68},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, agent, instance) (rate(pallas_send_message_duration_seconds_bucket{instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m])))", "legendFormat": "{{instance}}/{{agent}}"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 411,
"type": "table",
"title": "Long-running agents — p99 turn (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 68},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.99, sum by (agent, instance, le) (rate(pallas_send_message_duration_seconds_bucket{instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m])))", "legendFormat": "", "format": "table", "instant": true}
],
"fieldConfig": {"defaults": {"unit": "s", "custom": {"align": "auto"}, "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 30}, {"color": "red", "value": 60}]}, "color": {"mode": "thresholds"}}, "overrides": [{"matcher": {"id": "byName", "options": "Value"}, "properties": [{"id": "displayName", "value": "p99 (s)"}, {"id": "custom.cellOptions", "value": {"type": "color-background"}}]}]},
"options": {"showHeader": true, "sortBy": [{"displayName": "p99 (s)", "desc": true}]},
"transformations": [{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true, "job": true}}}]
},
{
"id": 420,
"type": "timeseries",
"title": "Turn errors per agent (15m increase)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 76},
"targets": [
{"refId": "A", "expr": "sum by (agent, instance) (increase(pallas_send_message_total{outcome=\"error\", instance=~\"$pallas_inst\", agent=~\"$agent\"}[15m]))", "legendFormat": "{{instance}}/{{agent}}"}
],
"fieldConfig": {"defaults": {"unit": "short"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 421,
"type": "timeseries",
"title": "Tokens/sec by kind (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 76},
"targets": [
{"refId": "A", "expr": "sum by (kind) (rate(pallas_llm_tokens_total{instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m]))", "legendFormat": "{{kind}}"}
],
"fieldConfig": {"defaults": {"unit": "short", "custom": {"drawStyle": "line", "fillOpacity": 20, "stacking": {"mode": "normal"}}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 422,
"type": "table",
"title": "Top-burning agents (24h, input+output tokens)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 84},
"targets": [
{"refId": "A", "expr": "topk(10, sum by (agent, model, instance) (increase(pallas_llm_tokens_total{kind=~\"input|output\", instance=~\"$pallas_inst\", agent=~\"$agent\"}[24h])))", "legendFormat": "", "format": "table", "instant": true}
],
"fieldConfig": {"defaults": {"unit": "short"}, "overrides": [{"matcher": {"id": "byName", "options": "Value"}, "properties": [{"id": "displayName", "value": "tokens (24h)"}, {"id": "custom.cellOptions", "value": {"type": "gauge", "mode": "gradient"}}]}]},
"options": {"showHeader": true, "sortBy": [{"displayName": "tokens (24h)", "desc": true}]},
"transformations": [{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true, "job": true}}}]
},
{
"id": 423,
"type": "stat",
"title": "Cache effectiveness (1h)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 84},
"targets": [
{"refId": "A", "expr": "sum(rate(pallas_llm_tokens_total{kind=\"cache_read\", instance=~\"$pallas_inst\"}[1h])) / clamp_min(sum(rate(pallas_llm_tokens_total{kind=~\"input|cache_read\", instance=~\"$pallas_inst\"}[1h])), 0.0001)", "legendFormat": "cache hit"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "orange", "value": 0.2}, {"color": "green", "value": 0.5}]}}}
},
{
"id": 424,
"type": "stat",
"title": "LLM turns/sec (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 84},
"targets": [
{"refId": "A", "expr": "sum(rate(pallas_llm_turns_total{instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m]))", "legendFormat": "turns/s"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "ops"}}
},
{
"id": 430,
"type": "timeseries",
"title": "Cypher tool calls — rate by outcome (Pallas → ariel)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 92},
"targets": [
{"refId": "A", "expr": "sum by (outcome) (rate(pallas_tool_calls_total{server=\"neo4j_cypher\", instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m]))", "legendFormat": "{{outcome}}"}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 431,
"type": "timeseries",
"title": "Cypher tool calls — p95 latency by agent",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 92},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, agent, instance) (rate(pallas_tool_call_duration_seconds_bucket{server=\"neo4j_cypher\", instance=~\"$pallas_inst\", agent=~\"$agent\"}[5m])))", "legendFormat": "{{instance}}/{{agent}}"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 440,
"type": "timeseries",
"title": "Neo4j @ ariel — transactions (rate / open)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 100},
"targets": [
{"refId": "A", "expr": "rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"ariel.*\"}[5m])", "legendFormat": "{{instance}} open rate"},
{"refId": "B", "expr": "neo4j_monitor_tx_currentOpenedTx{instance=~\"ariel.*\"}", "legendFormat": "{{instance}} current open"}
],
"fieldConfig": {"defaults": {"unit": "short"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 441,
"type": "stat",
"title": "Neo4j @ ariel — rollback ratio (10m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 100},
"targets": [
{"refId": "A", "expr": "rate(neo4j_monitor_tx_rolledBackTx{instance=~\"ariel.*\"}[10m]) / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"ariel.*\"}[10m]), 0.0001)", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
},
{
"id": 442,
"type": "stat",
"title": "Neo4j @ ariel — store size",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 100},
"targets": [
{"refId": "A", "expr": "neo4j_monitor_store_totalStoreSize{instance=~\"ariel.*\"}", "legendFormat": "{{instance}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "bytes"}}
}
]
}

351
dashboards/neo4j.json Normal file
View File

@@ -0,0 +1,351 @@
{
"title": "Neo4j",
"uid": "neo4j",
"tags": ["neo4j", "graph"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"editable": true,
"fiscalYearStartMonth": 0,
"weekStart": "",
"refresh": "30s",
"time": {"from": "now-1h", "to": "now"},
"templating": {
"list": [
{
"name": "loki",
"type": "datasource",
"query": "loki",
"current": {"selected": false, "text": "Loki", "value": "Loki"},
"hide": 0,
"label": "Loki datasource"
},
{
"name": "prom",
"type": "datasource",
"query": "prometheus",
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
"hide": 0,
"label": "Prometheus datasource"
},
{
"name": "instance",
"type": "query",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"query": "label_values(up{job=\"neo4j\"}, instance)",
"refresh": 1,
"includeAll": true,
"multi": true,
"current": {"selected": true, "text": "All", "value": "$__all"},
"label": "Instance"
}
]
},
"panels": [
{
"id": 1,
"type": "row",
"title": "Overview",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
},
{
"id": 2,
"type": "stat",
"title": "Exporter up",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 1},
"targets": [
{
"refId": "A",
"expr": "up{job=\"neo4j\", instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 3,
"type": "stat",
"title": "Nodes",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 1},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_ids_nodeIds{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 4,
"type": "stat",
"title": "Relationships",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 1},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_ids_relIds{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 5,
"type": "stat",
"title": "Total store size",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 1},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_store_totalStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "bytes"}}
},
{
"id": 10,
"type": "row",
"title": "Transactions",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}
},
{
"id": 11,
"type": "timeseries",
"title": "Transaction open rate (per second)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
"targets": [
{
"refId": "A",
"expr": "rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"$instance\"}[5m])",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {"defaults": {"unit": "ops"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 12,
"type": "timeseries",
"title": "Currently open transactions",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_tx_currentOpenedTx{instance=~\"$instance\"}",
"legendFormat": "{{instance}} current"
},
{
"refId": "B",
"expr": "neo4j_monitor_tx_peakTx{instance=~\"$instance\"}",
"legendFormat": "{{instance}} peak"
}
],
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 13,
"type": "stat",
"title": "Rollback ratio (10m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 12, "x": 0, "y": 14},
"targets": [
{
"refId": "A",
"expr": "rate(neo4j_monitor_tx_rolledBackTx{instance=~\"$instance\"}[10m]) / clamp_min(rate(neo4j_monitor_tx_totalOpenedTx{instance=~\"$instance\"}[10m]), 0.0001)",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
},
{
"id": 14,
"type": "stat",
"title": "Last tx ID",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 12, "x": 12, "y": 14},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_tx_lastTxId{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "short"}}
},
{
"id": 20,
"type": "row",
"title": "Store breakdown",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 18}
},
{
"id": 21,
"type": "timeseries",
"title": "Store size by component",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 19},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_store_nodeStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} nodes"
},
{
"refId": "B",
"expr": "neo4j_monitor_store_relStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} rels"
},
{
"refId": "C",
"expr": "neo4j_monitor_store_propStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} props"
},
{
"refId": "D",
"expr": "neo4j_monitor_store_stringStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} strings"
},
{
"refId": "E",
"expr": "neo4j_monitor_store_arrayStoreSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}} arrays"
}
],
"fieldConfig": {"defaults": {"unit": "bytes"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 22,
"type": "timeseries",
"title": "Transaction log size",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 19},
"targets": [
{
"refId": "A",
"expr": "neo4j_monitor_store_logSize{instance=~\"$instance\"}",
"legendFormat": "{{instance}}"
}
],
"fieldConfig": {"defaults": {"unit": "bytes"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 30,
"type": "row",
"title": "Exporter JVM (sidecar health)",
"collapsed": true,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 27}
},
{
"id": 31,
"type": "timeseries",
"title": "Exporter JVM heap used / max",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 28},
"targets": [
{
"refId": "A",
"expr": "jvm_memory_used_bytes{job=\"neo4j\", area=\"heap\", instance=~\"$instance\"}",
"legendFormat": "{{instance}} used"
},
{
"refId": "B",
"expr": "jvm_memory_max_bytes{job=\"neo4j\", area=\"heap\", instance=~\"$instance\"}",
"legendFormat": "{{instance}} max"
}
],
"fieldConfig": {"defaults": {"unit": "bytes"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 32,
"type": "timeseries",
"title": "Exporter GC time",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 28},
"targets": [
{
"refId": "A",
"expr": "rate(jvm_gc_collection_seconds_sum{job=\"neo4j\", instance=~\"$instance\"}[5m])",
"legendFormat": "{{instance}} {{gc}}"
}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 40,
"type": "row",
"title": "Logs",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 36}
},
{
"id": 41,
"type": "timeseries",
"title": "Neo4j log rate by host",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 37},
"targets": [
{
"refId": "A",
"expr": "sum by (hostname) (rate({job=\"neo4j\"}[5m]))",
"legendFormat": "{{hostname}}"
}
],
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 42,
"type": "logs",
"title": "Neo4j — last 50 error/warning lines",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 37},
"targets": [
{
"refId": "A",
"expr": "{job=\"neo4j\"} |~ \"(?i)error|warn|exception\"",
"maxLines": 50
}
],
"options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
},
{
"id": 43,
"type": "logs",
"title": "Neo4j — all logs (live tail)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 45},
"targets": [
{
"refId": "A",
"expr": "{job=\"neo4j\"}",
"maxLines": 100
}
],
"options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
}
]
}

202
dashboards/searxng.json Normal file
View File

@@ -0,0 +1,202 @@
{
"title": "SearXNG",
"uid": "searxng",
"tags": ["searxng", "argos", "ouranos"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"editable": true,
"fiscalYearStartMonth": 0,
"weekStart": "",
"refresh": "30s",
"time": {"from": "now-1h", "to": "now"},
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [],
"targetBlank": true,
"title": "Argos dashboard",
"tooltip": "Argos MCP server using these SearXNG instances",
"type": "link",
"url": "/d/argos"
},
{
"asDropdown": false,
"icon": "doc",
"includeVars": true,
"keepTime": true,
"tags": [],
"targetBlank": true,
"title": "SearXNG logs",
"tooltip": "Loki: {job=\"searxng\"}",
"type": "link",
"url": "/explore?orgId=1&left=%7B%22datasource%22:%22Loki%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bjob%3D%5C%22searxng%5C%22%7D%22%7D%5D%7D"
}
],
"templating": {
"list": [
{
"name": "prom",
"type": "datasource",
"query": "prometheus",
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
"hide": 0,
"label": "Prometheus datasource"
},
{
"name": "loki",
"type": "datasource",
"query": "loki",
"current": {"selected": false, "text": "Loki", "value": "Loki"},
"hide": 0,
"label": "Loki datasource"
},
{
"name": "host",
"type": "query",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"query": "label_values(probe_success{service=\"searxng\"}, hostname)",
"refresh": 1,
"includeAll": true,
"multi": true,
"current": {"selected": true, "text": "All", "value": "$__all"},
"label": "SearXNG host"
}
]
},
"panels": [
{
"id": 1,
"type": "row",
"title": "Independent probe (Alloy blackbox /healthz)",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
},
{
"id": 2,
"type": "stat",
"title": "SearXNG /healthz",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 5, "w": 8, "x": 0, "y": 1},
"targets": [
{"refId": "A", "expr": "probe_success{service=\"searxng\", hostname=~\"$host\"}", "legendFormat": "{{hostname}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "background", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}], "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 1}]}}}
},
{
"id": 3,
"type": "stat",
"title": "Last probe HTTP status",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 5, "w": 8, "x": 8, "y": 1},
"targets": [
{"refId": "A", "expr": "probe_http_status_code{service=\"searxng\", hostname=~\"$host\"}", "legendFormat": "{{hostname}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name"},
"fieldConfig": {"defaults": {"unit": "short", "thresholds": {"mode": "absolute", "steps": [{"color": "red"}, {"color": "green", "value": 200}, {"color": "orange", "value": 300}, {"color": "red", "value": 400}]}}}
},
{
"id": 4,
"type": "stat",
"title": "Probe duration (last)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 5, "w": 8, "x": 16, "y": 1},
"targets": [
{"refId": "A", "expr": "probe_duration_seconds{service=\"searxng\", hostname=~\"$host\"}", "legendFormat": "{{hostname}}"}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}, "colorMode": "value", "textMode": "value_and_name", "graphMode": "area"},
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 1}, {"color": "red", "value": 3}]}}}
},
{
"id": 5,
"type": "timeseries",
"title": "Probe success over time",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
"targets": [
{"refId": "A", "expr": "probe_success{service=\"searxng\", hostname=~\"$host\"}", "legendFormat": "{{hostname}}"}
],
"fieldConfig": {"defaults": {"unit": "short", "min": 0, "max": 1, "custom": {"drawStyle": "line", "lineWidth": 2, "fillOpacity": 20}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 6,
"type": "timeseries",
"title": "Probe duration over time",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
"targets": [
{"refId": "A", "expr": "probe_duration_seconds{service=\"searxng\", hostname=~\"$host\"}", "legendFormat": "{{hostname}}"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 10,
"type": "row",
"title": "Argos's view of these instances",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 14}
},
{
"id": 11,
"type": "timeseries",
"title": "argos_searxng_instance_up by SearXNG instance",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 15},
"targets": [
{"refId": "A", "expr": "argos_searxng_instance_up", "legendFormat": "{{searxng_instance}}"}
],
"fieldConfig": {"defaults": {"unit": "short", "min": 0, "max": 1, "custom": {"drawStyle": "line", "lineWidth": 2, "fillOpacity": 20}}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 12,
"type": "timeseries",
"title": "Search latency p95 from Argos (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 15},
"targets": [
{"refId": "A", "expr": "histogram_quantile(0.95, sum by (le, searxng_instance) (rate(argos_searxng_request_duration_seconds_bucket[5m])))", "legendFormat": "{{searxng_instance}} p95"}
],
"fieldConfig": {"defaults": {"unit": "s"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 13,
"type": "timeseries",
"title": "Search request error ratio from Argos (5m)",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 23},
"targets": [
{"refId": "A", "expr": "sum by (searxng_instance) (rate(argos_searxng_requests_total{status=\"error\"}[5m])) / clamp_min(sum by (searxng_instance) (rate(argos_searxng_requests_total[5m])), 0.0001)", "legendFormat": "{{searxng_instance}}"}
],
"fieldConfig": {"defaults": {"unit": "percentunit"}},
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 90,
"type": "row",
"title": "Logs",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 31}
},
{
"id": 91,
"type": "logs",
"title": "searxng (Loki)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 12, "w": 24, "x": 0, "y": 32},
"targets": [
{"refId": "A", "expr": "{job=\"searxng\"}"}
],
"options": {"showTime": true, "wrapLogMessage": true, "enableLogDetails": true, "dedupStrategy": "none"}
}
]
}

View File

@@ -0,0 +1,508 @@
# Red Panda Approval™ — Django Addendum
**Owner:** Robert Helewka <r@helu.ca>
**Version:** 1.02
**Last reviewed:** 2026-04-20
**Parent document:** [Red_Panda_Standards_V1-00.md](Red_Panda_Standards_V1-00.md)
This document extends the main Red Panda Standards with Django-specific conventions. Where the two documents overlap, the **main standard governs** — this addendum only adds Django-specific detail or explicitly-noted exceptions.
## 🐾 Red Panda Approval™
This project follows Red Panda Approval standards — our gold standard for Django application quality. Code must be elegant, reliable, and maintainable to earn the approval of our adorable red panda judges.
### The 5 Sacred Django Criteria
1. **Fresh Migration Test** — Clean migrations from empty database
2. **Elegant Simplicity** — No unnecessary complexity
3. **Observable & Debuggable** — Proper logging and error handling
4. **Consistent Patterns** — Follow Django conventions
5. **Actually Works** — Passes all checks and serves real user needs
## Environment Standards
- Virtual environment: ~/env/PROJECT/bin/activate
- Use pyproject.toml for project configuration (no setup.py, no requirements.txt)
- **Build backend: `setuptools`** — use `setuptools` (not Hatchling or Flit) as the build backend in all Python projects. Reason: C extension modules require setuptools; standardizing on one backend eliminates backend-switching when native modules are added.
- Python version: specified in pyproject.toml
- Dependencies: floor-pinned with ceiling (e.g. `Django>=5.2,<6.0`)
### pyproject.toml build backend
```toml
[build-system]
requires = ["setuptools>=70"]
build-backend = "setuptools.build_meta"
```
### Dependency Pinning
```toml
# Correct — floor pin with ceiling
dependencies = [
"Django>=5.2,<6.0",
"djangorestframework>=3.14,<4.0",
"cryptography>=41.0,<45.0",
]
# Wrong — exact pins in library packages
dependencies = [
"Django==5.2.7", # too strict, breaks downstream
]
```
Exact pins (`==`) are only appropriate in application-level lock files, not in reusable library packages.
## Directory Structure
myproject/ # Git repository root
├── .gitignore
├── README.md
├── pyproject.toml # Project configuration (moved to repo root)
├── docker-compose.yml
├── .env # Docker Compose environment
│ # ANGELIA_DB_ENGINE=postgresql
│ # ANGELIA_DB_NAME=angelia2
│ # ANGELIA_DB_USER=angelia
│ # ANGELIA_DB_PASSWORD=changeme
│ # ANGELIA_DB_HOST=db
│ # ANGELIA_DB_PORT=5432
├── .env.example
├── project/ # Django project root (manage.py lives here)
│ ├── manage.py
│ ├── Dockerfile
│ ├── .env # Local development environment
│ │ # ANGELIA_DB_ENGINE=sqlite
│ ├── .env.example
│ ├── config/ # Django configuration module
│ │ ├── __init__.py
│ │ ├── settings.py
│ │ ├── urls.py
│ │ ├── wsgi.py
│ │ └── asgi.py
│ │
│ ├── accounts/ # Django app
│ │ ├── __init__.py
│ │ ├── models.py
│ │ ├── views.py
│ │ └── urls.py
│ │
│ ├── blog/ # Django app
│ │ ├── __init__.py
│ │ ├── models.py
│ │ ├── views.py
│ │ └── urls.py
│ │
│ ├── static/
│ │ ├── css/
│ │ └── js/
│ │
│ └── templates/
│ └── base.html
├── web/ # Nginx configuration
│ └── nginx.conf
├── db/ # PostgreSQL configuration
│ └── postgresql.conf
└── docs/ # Project documentation
└── index.md
## Settings Structure
- Use a single settings.py file
- Use django-environ or python-dotenv for environment variables
- Never commit .env files to version control
- Provide .env.example with all required variables documented
- Create .gitignore file
- Create a .dockerignore file
## Environment Variables
All env vars in `.env` MUST use the `SERVICENAME_` prefix (per main standard). The examples below use `ANGELIA_` — substitute the actual service name for your app.
### PostgreSQL settings (only if `SERVICENAME_DB_ENGINE=postgresql`)
```
ANGELIA_DB_NAME=angelia2
ANGELIA_DB_USER=angelia
ANGELIA_DB_PASSWORD=changeme
ANGELIA_DB_HOST=db
ANGELIA_DB_PORT=5432
```
### Rules
- Never use `DATABASE_URL` or `dj-database-url` — always individual vars
- Never use unprefixed `DB_HOST` / `APP_DB_NAME` — always service-prefixed
- `settings.py` reads each prefixed var explicitly so the full config is documented in one place
- `.env` is gitignored; `.env.example` with placeholder values is committed
## Code Organization
- Imports: PEP 8 ordering (stdlib, third-party, local)
- Type hints on function parameters
- CSS: External .css files only (no inline styles, no embedded `<style>` tags)
- JS: External .js files only (no inline handlers, no embedded `<script>` blocks)
- Maximum file length: 1000 lines
- If a file exceeds 500 lines, consider splitting by domain concept
## Database Conventions
- Migrations run cleanly from empty database
- Never edit deployed migrations
- Use meaningful migration names: --name add_email_to_profile
- One logical change per migration when possible
- Test migrations both forward and backward
### Development vs Production
- Development: SQLite
- Production: PostgreSQL
## Caching
- Expensive queries are cached
- Cache keys follow naming convention
- TTLs are appropriate (not infinite)
- Invalidation is documented
- Key Naming Pattern: {app}:{model}:{identifier}:{field}
## Model Naming
- Model names: singular PascalCase (User, BlogPost, OrderItem)
- Correct English pluralization on related names
- All models have created_at and updated_at
- All models define `__str__` and `get_absolute_url`
- TextChoices used for status fields
- related_name defined on ForeignKey fields
- Related names: plural snake_case with proper English pluralization
## Forms
- Use ModelForm with explicit fields list (never `__all__`)
## Field Naming
- Foreign keys: singular without _id suffix (author, category, parent)
- Boolean fields: use prefixes (is_active, has_permission, can_edit)
- Date fields: use suffixes (created_at, updated_at, published_on)
- Avoid abbreviations (use description, not desc)
## Required Model Fields
- All models should include:
- created_at = models.DateTimeField(auto_now_add=True)
- updated_at = models.DateTimeField(auto_now=True)
- Consider adding:
- id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) for public-facing models
- is_active = models.BooleanField(default=True) for soft deletes
## Indexing
- Add db_index=True to frequently queried fields
- Use Meta.indexes for composite indexes
- Document why each index exists
## Queries
- Use select_related() for foreign keys
- Use prefetch_related() for reverse relations and M2M
- Avoid queries in loops (N+1 problem)
- Use .only() and .defer() for large models
- Add comments explaining complex querysets
## Docstrings
- Use Sphinx style docstrings
- Document all public functions, classes, and modules
- Skip docstrings for obvious one-liners and standard Django overrides
## Views
- Use Function-Based Views (FBVs) exclusively
- Explicit logic is preferred over implicit inheritance
- Extract shared logic into utility functions
## URLs & Identifiers
- Public URLs use short UUIDs (12 characters) via `shortuuid`
- Never expose sequential IDs in URLs (security/enumeration risk)
- Internal references may use standard UUIDs or PKs
## URL Patterns
- Resource-based URLs (RESTful style)
- Namespaced URL names per app
- Trailing slashes (Django default)
- Flat structure preferred over deep nesting
## Background Tasks
- All tasks are run synchronously unless the design specifies background tasks are needed for long operations
- Long operations use Celery tasks
- Store task progress in Memcached under the key pattern {app}:task:{task_id}:progress
- Tasks are idempotent
- Tasks include retry logic
- Tasks live in app/tasks.py
- RabbitMQ is the Message Broker
- Flower Monitoring: Use for debugging failed tasks
### Celery Observability (per main standard)
Celery workers are "long-running background workers" under the main standard and MUST comply with its Background Worker & Queue Monitoring section:
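- **Heartbeat**: every 60 seconds at INFO level, e.g. `logger.info("celery worker alive, processed %d tasks in last 5m, queue depth: %d", n, depth)`. Implement as a Celery beat task or a dedicated heartbeat thread. Because app loggers default to `WARNING` in production, set the heartbeat's logger to `INFO` explicitly — otherwise the heartbeat is dropped and the staleness alert fires continuously.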
- **Startup / shutdown / crash-exit** logged at INFO — hook `worker_ready`, `worker_shutdown`, `worker_process_init` signals.
- **Queue depth** exposed as a Prometheus metric (via `celery-exporter` or equivalent) so a growing-queue-with-no-consumers alert can fire at ERROR severity.
- **Grafana staleness alert**: `absent_over_time({service_name="celery_worker_<app>"}[10m])` → ERROR → email via AlertManager.
- **Crash-on-start**: rely on the systemd unit or Docker restart policy to log the exit — do not assume the crashing Celery worker will log its own death.
## Logging (per main standard)
Django apps follow the main standard's [Log Level Standards](Red_Panda_Standards_V1-00.md#log-level-standards). Django-specific implementation notes:
- **Default level: `WARNING`** for app loggers in production. Business logic only surfaces when degraded or broken.
- **Level casing: UPPERCASE** (`INFO`, `WARNING`, `ERROR`, `DEBUG`) — Python/Django convention.
- **Never use `print()`** — always `logger = logging.getLogger(__name__)`.
- **Client telemetry** received at `POST /api/v1/telemetry` MUST be logged at `WARNING` level (browser-side errors are user-facing problems, not server failures).
- **Access log filtering**: Gunicorn AND the upstream reverse proxy (nginx) must not emit 2xx/3xx entries for `/live`, `/ready`, `/metrics`, `/nginx_status`, `/health*`, `/ping`, or service-specific probes like `/mcp/health`. Filter these in the access-log handler. Both trailing-slash and non-trailing-slash forms MUST be matched. Implementation recipes are in the Gunicorn and nginx subsections under Health Check Endpoints below.
- **Structured output**: log to stdout in a format Alloy can parse (JSON preferred). Every log line MUST carry a `level` label downstream.
- **Expected conditions are not ERROR**: failed logins, form validation errors, 404s on user-supplied slugs → WARNING or INFO. Reserve ERROR for things that are actually broken.
## Health Check Endpoints (per main standard)
Every Django service MUST expose:
| Endpoint | Purpose | Auth |
|----------|---------|------|
| `GET /live/` | Liveness — process is running | None |
| `GET /ready/` | Readiness — DB, cache, upstream deps all healthy | None |
| `GET /metrics` | Prometheus metrics | IP-restricted, no JWT |
- **Trailing slash**: standard is `/live/` and `/ready/`. Django's `APPEND_SLASH` redirects un-slashed requests to the canonical slashed form — document as an exception only if you disable that behavior.
- **Readiness logic** MUST actually probe dependencies: `connection.ensure_connection()` for the DB, a Memcached `ping`, a minimal RabbitMQ connection check. A bare `return HttpResponse(status=200)` fails the main standard.
- **Do NOT require authentication** on health endpoints — HAProxy and Prometheus scrapers cannot authenticate.
- **`/metrics`** is exposed via `django-prometheus` (preferred) and IP-restricted to internal networks per the main standard.
### Internal-network allowlist (nginx)
Any endpoint restricted to "internal networks only" (`/metrics`, `/nginx_status`, `nginx-prometheus-exporter` scrape targets, etc.) MUST use the full RFC1918 + loopback allowlist — **all four ranges**, in this order:
```nginx
allow 127.0.0.0/8; # loopback
allow 10.0.0.0/8; # RFC1918 — primary internal range
allow 172.16.0.0/12; # RFC1918 — Docker default bridge range
allow 192.168.0.0/16; # RFC1918
deny all;
```
Omitting `10.0.0.0/8` is the most common mistake and will silently break Prometheus scrapes from hosts on that network. Do not copy a shorter allowlist from older configs.
### Gunicorn configuration
Gunicorn MUST:
- Log access AND error output to **stdout/stderr** — never a file inside the container. The Docker logging driver (syslog → Alloy in our stack) is the single collection point.
- Use a `gunicorn.conf.py` referenced via `--config` so configuration lives in version control rather than a growing CMD string.
- Filter probe paths out of the access log via a `logging.Filter` attached to the `gunicorn.access` logger in BOTH `on_starting` (master) AND `post_worker_init` (workers — Gunicorn re-applies logger config per worker, so a master-only filter is silently stripped).
Canonical launch command:
```dockerfile
CMD ["gunicorn", \
"--config", "/srv/<app>/gunicorn.conf.py", \
"--bind", ":8080", \
"--workers", "3", \
"--timeout", "120", \
"--keep-alive", "5", \
"--access-logfile", "-", \
"--error-logfile", "-", \
"<app>.wsgi:application"]
```
Canonical `gunicorn.conf.py` probe filter:
```python
import logging
import re

# Probe endpoints whose access-log entries are dropped. The pattern is anchored
# at the start of the request *path* and tolerates an optional trailing slash
# and an optional query string.
_PROBE_PATH = re.compile(
    r"^(?:/live|/ready|/metrics|/nginx_status|/health[^ ]*|/ping|/mcp/health)/?(?:\?|$)"
)

# Pulls the request target out of an HTTP request line such as
# 'GET /live HTTP/1.1', whether the line stands alone (the 'r' atom) or is
# embedded in a fully formatted access-log message.
_REQUEST_TARGET = re.compile(r"\b[A-Z]+ (\S+) HTTP/\d")


def _request_path(text: str) -> str:
    """Return the request target found in ``text``.

    :param text: a request line, a formatted access-log message, or a bare path.
    :returns: the target of the first ``METHOD target HTTP/x`` sequence, or
        ``text`` unchanged when no request line is present.
    """
    found = _REQUEST_TARGET.search(text)
    return found.group(1) if found else text


class _ProbePathFilter(logging.Filter):
    """Drop access-log records that belong to health/metrics probe requests."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Decide whether ``record`` should be emitted.

        :param record: the access-log record under consideration.
        :returns: ``False`` for probe paths (record is dropped), ``True`` otherwise.
        """
        atoms = getattr(record, "args", None)
        if isinstance(atoms, dict):
            # Gunicorn access log atoms: 'U' = URL path, 'r' = full request line.
            # 'r' carries the method and protocol, so the path must be extracted
            # before it can be compared against the path-anchored pattern.
            path = atoms.get("U") or _request_path(str(atoms.get("r") or ""))
        else:
            # No atom dict available: fall back to the formatted message, which
            # embeds the request line rather than starting with the path.
            path = _request_path(record.getMessage())
        return not _PROBE_PATH.search(path)


# One shared instance: Logger.addFilter() skips a filter that is already attached.
_filter = _ProbePathFilter()


def on_starting(server):
    """Gunicorn server hook: attach the probe filter to the access logger."""
    logging.getLogger("gunicorn.access").addFilter(_filter)


def post_worker_init(worker):
    """Gunicorn server hook: attach the probe filter to the access logger."""
    logging.getLogger("gunicorn.access").addFilter(_filter)
```
Update the probe-path regex if the service exposes additional health endpoints (e.g. sidecar servers). Do NOT special-case by status code — a 500 on `/ready/` is noise in Gunicorn's access log but is already surfaced via the readiness probe failing and the error log.
### Nginx access-log filtering
The reverse proxy sees the same probe traffic and will log it unless filtered. Use a `map` + conditional `access_log`:
```nginx
http {
map $request_uri $loggable {
default 1;
~^/live(/|\?|$) 0;
~^/ready(/|\?|$) 0;
~^/metrics(/|\?|$) 0;
~^/nginx_status(/|\?|$) 0;
~^/health 0;
~^/ping(/|\?|$) 0;
~^/mcp/health(/|\?|$) 0;
}
access_log /var/log/nginx/access.log combined if=$loggable;
# ...
}
```
This is an nginx-wide switch — do not duplicate per `location` block. Error logging is unaffected; genuine 4xx/5xx on probe paths still surface via the error log and the probe itself failing.
See [Red_Panda_Standards_V1-00.md §Health Check Endpoints](Red_Panda_Standards_V1-00.md#health-check-endpoints) for the full definition.
## Testing
- Framework: Django TestCase (not pytest)
- Separate test files per module: test_models.py, test_views.py, test_forms.py
## Frontend Standards
### New Projects (DaisyUI + Tailwind)
- DaisyUI 4 via CDN for component classes
- Tailwind CSS via CDN for utility classes
- Theme management via Themis (DaisyUI `data-theme` attribute)
- All apps extend `themis/base.html` for consistent navigation
- No inline styles or scripts
### Existing Projects (Bootstrap 5)
- Bootstrap 5 via CDN
- Bootstrap Icons via CDN
- Bootswatch for theme variants (if applicable)
- django-bootstrap5 and crispy-bootstrap5 for form rendering
## Preferred Packages
### Core Django
- django>=5.2,<6.0
- django-environ — Environment variables
### Authentication & Security
- django-allauth — User management
- django-allauth-2fa — Two-factor authentication
### API Development
- djangorestframework>=3.14,<4.0 — REST APIs
- drf-spectacular — OpenAPI/Swagger documentation
### Encryption
- cryptography — Fernet encryption for secrets/API keys
### Background Tasks
- celery — Async task queue
- django-celery-progress — Progress bars
- flower — Celery monitoring
### Caching
- pymemcache — Memcached backend
### Observability
- django-prometheus — `/metrics` endpoint in Prometheus exposition format
- celery-exporter (or equivalent) — queue depth metrics for Celery workers
### Database
- psycopg[binary] — PostgreSQL adapter
- shortuuid — Short UUIDs for public URLs
### Production
- gunicorn — WSGI server
### Shared Apps
- django-heluca-themis — User preferences, themes, key management, navigation
### Deprecated / Removed
- ~~pytz~~ — Use stdlib `zoneinfo` (Python 3.9+, Django 4+)
- ~~Pillow~~ — Only add if your app needs ImageField
- ~~django-heluca-core~~ — Replaced by Themis
- ~~dj-database-url~~ — Use individual Django DB env vars instead
## Anti-Patterns to Avoid
### Models
- Don't use `Model.objects.get()` without handling `DoesNotExist`
- Don't use `null=True` on `CharField` or `TextField` (use `blank=True, default=""`)
- Don't use `related_name='+'` unless you have a specific reason
- Don't override `save()` for business logic (use signals or service functions)
- Don't use `auto_now=True` on fields you might need to manually set
- Don't use `ForeignKey` without specifying `on_delete` explicitly
- Don't use `Meta.ordering` on large tables (specify ordering in queries)
### Queries
- Don't query inside loops (N+1 problem)
- Don't use `.all()` when you need a subset
- Don't use raw SQL unless absolutely necessary
- Don't forget `select_related()` and `prefetch_related()`
### Views
- Don't put business logic in views
- Don't use `request.POST.get()` without validation (use forms)
- Don't return sensitive data in error messages
- Don't forget `login_required` decorator on protected views
### Forms
- Don't use `fields = '__all__'` in ModelForm
- Don't trust client-side validation alone
- Don't use `exclude` in ModelForm (use explicit `fields`)
### Templates
- Don't use `{{ variable }}` for URLs (use `{% url %}` tag)
- Don't put logic in templates
- Don't use inline CSS or JavaScript (external files only)
- Don't forget `{% csrf_token %}` in forms
### Security
- Don't store secrets in `settings.py` (use environment variables)
- Don't commit `.env` files to version control
- Don't use `DEBUG=True` in production
- Don't expose sequential IDs in public URLs
- Don't use `mark_safe()` on user-supplied content
- Don't disable CSRF protection
### Imports & Code Style
- Don't use `from module import *`
- Don't use mutable default arguments
- Don't use bare `except:` clauses
- Don't ignore linter warnings without documented reason
### Migrations
- Don't edit migrations that have been deployed
- Don't use `RunPython` without a reverse function
- Don't add non-nullable fields without a default value
### Celery Tasks
- Don't pass model instances to tasks (pass IDs and re-fetch)
- Don't assume tasks run immediately
- Don't forget retry logic for external service calls
- Don't run a Celery worker without a heartbeat (see Celery Observability)
### Logging
- Don't use `print()` — always use `logging.getLogger(__name__)`
- Don't log at ERROR for expected conditions (failed logins, 404s, validation errors)
- Don't log at INFO for successful probes of `/live`, `/ready`, `/metrics`
- Don't log passwords, tokens, API keys, session cookies, or PII at any level
- Don't use lowercase level names in Python code (UPPERCASE for Django/Python)
---
## Exceptions
Per the main standard, deviations from Red Panda requirements MUST be recorded rather than hidden. Third-party Django packages, framework defaults, or deliberate trade-offs all go here.
| Service | Standard waived | Reason | Reviewed |
|---------|-----------------|--------|----------|
| _(add as discovered)_ | | | |
Exceptions MUST be re-reviewed every time this document is reviewed (i.e. whenever the `Last reviewed` date is updated). Remove entries whose underlying reason has gone away.

View File

@@ -2,6 +2,10 @@
Quality and observability standards for the Ouranos Lab. All infrastructure code, application code, and LLM-generated code deployed into this environment must meet these standards.
**Owner:** Robert Helewka <r@helu.ca>
**Version:** 1.00
**Last reviewed:** 2026-04-18
---
## 🐾 Red Panda Approval™
@@ -24,7 +28,7 @@ All sensitive information is encrypted using Ansible Vault with AES256 encryptio
- Database passwords (PostgreSQL, Neo4j)
- API keys (OpenAI, Anthropic, Mistral, Groq)
- Application secrets (Grafana, SearXNG, Arke)
- Monitoring alerts (Pushover integration)
- Monitoring alerts (AlertManager email integration)
**Security rules:**
- AES256 encryption with `ansible-vault`
@@ -42,7 +46,7 @@ All services in the Ouranos Lab MUST follow these log level conventions. These r
| Level | When to Use | What MUST Be Included | Loki / Grafana Role |
|-------|-------------|----------------------|---------------------|
| **ERROR** | Something is broken and requires human intervention. The service cannot fulfil the current request or operation. | Exception class, message, stack trace, and relevant context (request ID, user, resource identifier). Never a bare `"something failed"`. | AlertManager rules fire on `level=~"error\|fatal\|critical"`. These trigger Pushover notifications. |
| **ERROR** | Something is broken and requires human intervention. The service cannot fulfil the current request or operation. | Exception class, message, stack trace, and relevant context (request ID, user, resource identifier). Never a bare `"something failed"`. | AlertManager rules fire on `level=~"error\|fatal\|critical"`. These trigger email notifications. |
| **WARNING** | Degraded but self-recovering: retries succeeding, fallback paths taken, thresholds approaching, deprecated features invoked. | What degraded, what recovery action was taken, current metric value vs. threshold. | Grafana dashboard panels. Rate-based alerting (e.g., >N warnings/min). |
| **INFO** | Significant lifecycle and business events: service start/stop, configuration loaded, deployment markers, user authentication, job completion, schema migrations. | The event and its outcome. This level tells the *story* of what the system did. | Default production visibility. The go-to level for post-incident timelines. |
| **DEBUG** | Diagnostic detail for active troubleshooting: request/response payloads, SQL queries, internal state, variable values. | **Actionable context is mandatory.** A DEBUG line with no detail is worse than no line at all. Include variable values, object states, or decision paths. | Never enabled in production by default. Used on-demand via per-service level override. |
@@ -53,19 +57,34 @@ These are explicit violations of Ouranos logging standards:
| ❌ Anti-Pattern | Why It's Wrong | ✅ Correct Approach |
|----------------|---------------|-------------------|
| Health checks logged at INFO (`GET /health → 200 OK`) | Routine HAProxy/Prometheus probes flood syslog with thousands of identical lines per hour, burying real events. | Suppress health endpoints from access logs entirely, or demote to DEBUG. |
| Health/metrics checks logged at INFO (`GET /live → 200 OK`, `GET /metrics → 200 OK`) | Routine HAProxy/Prometheus probes flood syslog with thousands of identical lines per hour, burying real events. | Suppress successful probes to `/live`, `/ready`, `/metrics`, `/health*`, `/ping` from access logs entirely. Non-2xx responses MUST still log. |
| DEBUG with no context (`logger.debug("error occurred")`) | Provides zero diagnostic value. If DEBUG is noisy *and* useless, nobody will ever enable it. | `logger.debug("PaymentService.process failed: order_id=%s, provider=%s, response=%r", oid, provider, resp)` |
| ERROR without exception details (`logger.error("task failed")`) | Cannot be triaged without reproduction steps. Wastes on-call time. | `logger.error("Celery task invoice_gen failed: order_id=%s", oid, exc_info=True)` |
| Logging sensitive data at any level | Passwords, tokens, API keys, and PII in Loki are a security incident. | Mask or redact: `api_key=sk-...a3f2`, `password=*****`. |
| Inconsistent level casing | Breaks LogQL filters and Grafana label selectors. | **Python / Django**: UPPERCASE (`INFO`, `WARNING`, `ERROR`, `DEBUG`). **Go / infrastructure** (HAProxy, Alloy, Gitea): lowercase (`info`, `warn`, `error`, `debug`). |
| Logging expected conditions as ERROR | A user entering a wrong password is not an error — it is normal business logic. | Use WARNING or INFO for expected-but-notable conditions. Reserve ERROR for things that are actually broken. |
### Health Check Rule
### Health Check & Monitoring Endpoint Rule
> All services exposed through HAProxy MUST suppress or demote health check endpoints (`/health`, `/healthz`, `/api/health`, `/metrics`, `/ping`) to DEBUG or below. Health check success is the *absence* of errors, not the presence of 200s. If your syslog shows a successful health probe, your log level is wrong.
> All services MUST suppress successful (2xx/3xx) access log entries for health and monitoring endpoints: `/live`, `/ready`, `/health`, `/healthz`, `/api/health`, `/metrics`, `/ping`. Health check success is the *absence* of errors, not the presence of 200s. If your syslog shows a successful probe of one of these endpoints, your log level is wrong.
>
> Non-2xx responses to these paths MUST still be logged — a failing `/ready` is a real signal.
**Implementation guidance:**
- **Django / Gunicorn**: Filter health paths in the access log handler or use middleware that skips logging for probe user-agents.
- **FastAPI / Uvicorn**: Add a `logging.Filter` on the `uvicorn.access` logger that matches health paths in the access log message. Uvicorn's access log format includes the full request line in quotes (e.g., `"GET /live HTTP/1.1"`), so filter regexes must account for that. See also the structured logging notes below.
- **nginx containers**: nginx does not log through Python loggers, so app-level filters do not apply. Suppress probe access lines at the nginx config level using `map` on `$request_uri` or `$status`:
```nginx
map $request_uri $loggable {
~^/(live|ready|metrics|health|healthz|ping)(/|$|\?) 0;
default 1;
}
server {
access_log /var/log/nginx/access.log combined if=$loggable;
# NOTE: error_log records only nginx-level errors; a 4xx/5xx returned on a probe path is suppressed here too and surfaces only via the failing probe
}
```
Applies to every nginx-based container (static frontends, reverse proxies, sidecars).
- **Docker services**: Configure the application's internal logging to exclude health routes — the syslog driver forwards everything it receives.
- **HAProxy**: HAProxy's own health check logs (`option httpchk`) should remain at the HAProxy level for connection debugging, but backend application responses to those probes must not surface at INFO.
@@ -77,13 +96,13 @@ When a background worker (Celery task consumer, RabbitMQ subscriber, Gitea Runne
**Required practices:**
1. **Heartbeat logging** — Every long-running background worker MUST emit a periodic INFO-level heartbeat (e.g., `"worker alive, processed N jobs in last 5m, queue depth: M"`). The *absence* of this heartbeat is the alertable condition.
1. **Heartbeat logging** — Every long-running background worker MUST emit a periodic INFO-level heartbeat (e.g., `"worker alive, processed N jobs in last 5m, queue depth: M"`). **Cadence: every 60 seconds.** The staleness alert fires after 10 minutes of silence (= 10 consecutive missed heartbeats), which gives enough margin to absorb transient Loki ingestion lag without flapping. The *absence* of this heartbeat is the alertable condition.
2. **Startup and shutdown at INFO** — Worker start, ready, graceful shutdown, and crash-exit are significant lifecycle events. These MUST log at INFO.
3. **Queue depth as a metric** — RabbitMQ queue depths and any application-level task queues MUST be exposed as Prometheus metrics. A growing queue with zero consumer activity is an **ERROR**-level alert, not a warning.
4. **Grafana "last seen" alerts** — For every background worker, configure a Grafana alert using `absent_over_time()` or equivalent staleness detection: *"Worker X has not logged a heartbeat in >10 minutes"* → ERROR severity → Pushover notification.
4. **Grafana "last seen" alerts** — For every background worker, configure a Grafana alert using `absent_over_time()` or equivalent staleness detection: *"Worker X has not logged a heartbeat in >10 minutes"* → ERROR severity → email notification via AlertManager.
5. **Crash-on-start is ERROR** — If a worker exits within seconds of starting (missing config, failed DB connection, import error), the exit MUST be captured at ERROR level by the service manager (`systemd OnFailure=`, Docker restart policy logs). Do not rely on the crashing application to log its own death — it may never get the chance.
@@ -92,7 +111,8 @@ When a background worker (Celery task consumer, RabbitMQ subscriber, Gitea Runne
| Service Category | Default Level | Rationale |
|-----------------|---------------|-----------|
| Django apps (Angelia, Athena, Kairos, Icarlos, Spelunker, Peitho, MCP Switchboard) | `WARNING` | Business logic — only degraded or broken conditions surface. Lifecycle events (start/stop/deploy) still log at INFO via Gunicorn and systemd. |
| Gunicorn access logs | Suppress 2xx/3xx health probes | Routine request logging deferred to HAProxy access logs in Loki. |
| FastAPI apps (Periplus) | `WARNING` | Same rationale as Django. Uvicorn lifecycle events (start/stop) are pinned to INFO via the `uvicorn.error` logger regardless of app log level. |
| Gunicorn / Uvicorn / nginx access logs | Suppress successful probes to `/live`, `/ready`, `/metrics`, `/health*`, `/ping` | Routine request logging deferred to HAProxy access logs in Loki. |
| Infrastructure agents (Alloy, Prometheus, Node Exporter) | `warn` | Stable — do not change without cause. |
| HAProxy (Titania) | `warning` | Connection-level logging handled by HAProxy's own log format → Alloy → Loki. |
| Databases (PostgreSQL, Neo4j) | `warning` | Query-level logging only enabled for active troubleshooting. |
@@ -100,6 +120,20 @@ When a background worker (Celery task consumer, RabbitMQ subscriber, Gitea Runne
| LLM Proxy (Arke) | `info` | Token usage tracking and provider routing decisions justify INFO. Review periodically for noise. |
| Observability stack (Grafana, Loki, AlertManager) | `warn` | Should be quiet unless something is wrong with observability itself. |
### Structured Logging — FastAPI / Uvicorn
FastAPI apps using uvicorn require special handling to achieve JSON-structured log output for the Alloy → Loki pipeline. Uvicorn manages its own loggers aggressively, and naive approaches will fail silently.
**Required practices:**
1. **Override uvicorn's handlers, don't just add to root** — Uvicorn's `config.load()` creates its own `StreamHandler` instances on `uvicorn`, `uvicorn.error`, and `uvicorn.access`. You must remove these handlers and set `propagate = True` so log records flow to the root logger where your JSON formatter lives.
2. **Re-apply logging config in the lifespan** — Configuring logging at module import time is not sufficient. Uvicorn's `config.load()` runs *after* your module is imported but *before* the ASGI lifespan starts. Call your logging configuration function again inside the FastAPI `lifespan` context manager to recapture control.
3. **Remap uvicorn logger names** — Uvicorn uses `uvicorn.error` for all lifecycle messages (startup, shutdown, errors) despite the misleading name. Remap it to `uvicorn` in your JSON formatter's output for clarity in Loki queries.
4. **Use `pydantic-settings` with `extra = "ignore"`** — When loading config from `.env` files that contain variables for other services (e.g., oauth2-proxy), pydantic-settings will reject unknown fields by default. Always set `extra = "ignore"` in the model config.
### Loki & Grafana Alignment
**Label normalization**: Alloy pipelines (syslog listeners and journal relabeling) MUST extract and forward a `level` label on every log line. Without a `level` label, the log entry is invisible to level-based dashboard filters and alert rules.
@@ -117,11 +151,20 @@ When a background worker (Celery task consumer, RabbitMQ subscriber, Gitea Runne
```
**Alerting rules** — Grafana alert rules MUST key off the normalized `level` label:
- `level=~"error|fatal|critical"` → Immediate Pushover notification via AlertManager
- `level=~"error|fatal|critical"` → Immediate email notification via AlertManager
- `absent_over_time({service_name="celery_worker"}[10m])` → Worker heartbeat staleness → ERROR severity
- Rate-based: `rate({service_name="arke"} | json | level="error" [5m]) > 0.1` → Sustained error rate
**Retention alignment**: Loki retention policies should preserve ERROR and WARNING logs longer than DEBUG. DEBUG-level logs generated during troubleshooting sessions should have a short TTL or be explicitly cleaned up.
**Retention alignment**: Loki retention policies MUST preserve higher-severity logs at least as long as lower-severity ones. Target retention:
| Level | Retention | Rationale |
|-------|-----------|-----------|
| DEBUG | 7 days | Troubleshooting context only — stale debug data is noise. |
| INFO | 30 days | Post-incident timelines and lifecycle review. |
| WARNING | 90 days | Degradation trend analysis across release cycles. |
| ERROR / FATAL / CRITICAL | 90 days | Incident review, root-cause investigation, compliance. |
DEBUG-level logs generated during troubleshooting sessions should be explicitly cleaned up if they would blow past the 7-day budget.
---
@@ -135,10 +178,12 @@ All services MUST expose Kubernetes-style health endpoints at these paths:
| `GET /ready` | **Readiness** — process is running AND all dependencies (DB, cache, upstream APIs) are healthy | None |
| `GET /metrics` | Prometheus metrics | IP-restricted (no JWT) |
- HAProxy uses `health_path: /ready/` for backend health checks — return HTTP 200 when ready
- HAProxy uses `health_path: /ready/` (trailing slash) for backend health checks — return HTTP 200 when ready
- Health endpoints MUST NOT require authentication
- Third-party services use their native paths (`/api/health`, `/api/healthz`, `/-/healthy`, etc.)
**Trailing slash**: The standard path is `/ready/` with a trailing slash. Django's `APPEND_SLASH` handling, FastAPI route declarations, and nginx `location` blocks all differ in how they treat the slash. Services that cannot comply (framework redirects, third-party apps) MUST be recorded in the Exceptions section below. Access-log suppression filters MUST match both `/ready` and `/ready/` forms.
### Docker Compose Healthchecks
Use `curl -f` (install curl in images if needed). Do not use `wget --spider`.
@@ -171,7 +216,7 @@ healthcheck:
## Prometheus Metrics
All services SHOULD expose `GET /metrics` in Prometheus exposition format, scraped by Prospero's Prometheus at 15s intervals.
All services MUST expose `GET /metrics` in Prometheus exposition format, scraped by Prospero's Prometheus at 15s intervals.
- **IP-restricted** to internal networks: `10.10.0.0/24`, `172.16.0.0/12`, `127.0.0.0/8`
- No JWT required — HAProxy and Prometheus scrapers cannot authenticate
@@ -190,6 +235,19 @@ Frontend/browser code MUST report errors and performance data back to the server
---
## Environment Variable Naming
All environment variables for an application MUST use a consistent prefix matching the service name (e.g., `PERIPLUS_`, `ARKE_`, `ANGELIA_`). This applies to every variable in the `.env` file, including those consumed by sidecar services like oauth2-proxy.
**Rules:**
- All vars in `.env` use the `SERVICENAME_` prefix — no exceptions
- `compose.yaml` maps prefixed vars to the sidecar's expected names (e.g., `OAUTH2_PROXY_CLIENT_ID: ${PERIPLUS_CASDOOR_CLIENT_ID}`)
- The application's Settings model SHOULD declare all prefixed vars, even those only consumed by sidecars, so the full configuration is documented in one place
- Every repo MUST include a `.env.example` with placeholder values for all required variables. Add `!.env.example` to `.gitignore` if a broad `.env.*` pattern would otherwise exclude it
- `.env` files with real secrets are ALWAYS gitignored — no exceptions
---
## Docker Networking
- Use the **default Docker bridge network** for simple deployments
@@ -206,8 +264,29 @@ Place documentation in the `/docs/` directory of the repository.
HTML documents must follow [docs/documentation_style_guide.html](documentation_style_guide.html).
- Use Bootstrap CDN with Bootswatch theme **Flatly**
- Include a dark mode toggle button in the navbar
- Use Bootstrap Icons for icons
- Use Bootstrap CSS for styles — avoid custom CSS
- Include a dark mode that follows the system automatically and include a toggle button in the navbar
- Avoid custom CSS
- Use **Mermaid** for diagrams
---
## Exceptions
Third-party services and vendor containers frequently cannot comply with every standard in this document (health endpoint paths, access-log filtering, log level semantics, env var prefixes). Rather than force non-compliance into a binary pass/fail, record deviations here so the gap is visible and intentional.
**Rules for exceptions:**
- Every exception MUST name the service, the standard being waived, and the reason (vendor constraint, upstream bug, deliberate trade-off).
- Exceptions MUST be reviewed on the doc's `Last reviewed` date. If the underlying reason has gone away (vendor fixed it, we forked, we replaced the service), remove the exception.
- A missing exception for a known-non-compliant service is itself a Red Panda violation — the point is transparency.
| Service | Standard waived | Reason | Reviewed |
|---------|-----------------|--------|----------|
| _(example)_ Gitea | `/live`, `/ready` paths — uses `/api/healthz` | Upstream does not expose K8s-style endpoints | 2026-04-18 |
| _(example)_ Nextcloud | Env var prefix `NEXTCLOUD_` — uses vendor-defined `NC_*` and unprefixed vars | Vendor container ignores renamed vars | 2026-04-18 |
| _(add real exceptions as they are discovered)_ | | | |
**Health path trailing-slash exceptions** — services that serve `/ready` without the trailing slash (framework default, cannot be reconfigured without breaking routing):
| Service | Actual path | Reason |
|---------|-------------|--------|
| _(add as discovered)_ | | |

289
docs/alloy.md Normal file
View File

@@ -0,0 +1,289 @@
# Alloy Log & Metric Collection
Grafana Alloy runs as a **native systemd service** (never in Docker) on every
Ouranos host with `alloy` in its `services` list. It collects logs and forwards
them to **Loki on Prospero** (`http://prospero.incus:3100/loki/api/v1/push`),
and scrapes host/container metrics that it **remote-writes** to **Prometheus on
Prospero** (`http://prospero.incus:9090/api/v1/write`).
## Overview
- **Default config:** [`ansible/alloy/config.alloy.j2`](../ansible/alloy/config.alloy.j2) — journal-only fallback for hosts without a dedicated config.
- **Per-host config:** [`ansible/alloy/<hostname_short>/config.alloy.j2`](../ansible/alloy/) — overrides the default when present.
- **Selection:** [`alloy/deploy.yml`](../ansible/alloy/deploy.yml) stat-checks `<hostname_short>/config.alloy.j2` on the controller; if it exists, that template is rendered, otherwise the default is used.
- **Log destination:** Loki on `prospero.incus:3100` via `loki.write "default"`.
- **Metric destination:** Prometheus on `prospero.incus:9090` via `prometheus.remote_write "default"`.
- **Environment:** every stream is labelled `environment="{{ deployment_environment }}"` (`ouranos`) and `hostname="{{ inventory_hostname }}"`.
- **Deploy:** `ansible-playbook alloy/deploy.yml` (optionally `--limit <host>`).
`deploy.yml` also adds the `alloy` user to the host's `docker` group when the
host has `docker` in its services — this is what lets Alloy read
`/var/run/docker.sock` for the Docker discovery and cAdvisor blocks below.
## Log Sources
Ouranos collects logs through three mechanisms. New Dockerised services should
use the **Docker socket discovery** path (preferred); the per-service syslog
listener is the older pattern, still in use on several hosts.
### 1. Systemd journal (native services)
Every host includes a `loki.source.journal` component capturing all systemd
unit output. By default journal entries are labelled `job="systemd"`; a
`loki.relabel` component can promote specific units to a richer label set (see
[Journal relabeling](#journal-relabeling-native-services)).
This is the correct path for **native systemd services** (binaries managed by a
`.service` unit) — they write to stdout/stderr, systemd captures it in the
journal, and Alloy forwards it. No syslog port or log file needed.
### 2. Docker socket discovery (preferred for containers)
> **Reference implementation:** [`ansible/alloy/puck/config.alloy.j2`](../ansible/alloy/puck/config.alloy.j2).
> Puck is currently the lead host for this pattern; other Docker hosts still use
> per-service syslog listeners and should migrate to this model over time.
A **single** pair of `discovery.docker` + `loki.source.docker` blocks collects
stdout from **every Compose project on the host**, current and future — no
per-service configuration. Container log streams are labelled from Docker's own
Compose metadata:
- `service` ← Compose **project** name (e.g. `athena`, `mnemosyne`, `daedalus`)
- `component` ← Compose **service** name (e.g. `app`, `mcp`, `nginx`, `worker`)
- `container` ← raw container name (for non-Compose `docker run` containers)
```alloy
discovery.docker "containers" {
host = "unix:///var/run/docker.sock"
refresh_interval = "30s"
}
discovery.relabel "containers" {
targets = discovery.docker.containers.targets
rule { // Compose project → service
source_labels = ["__meta_docker_container_label_com_docker_compose_project"]
target_label = "service"
}
rule { // Compose service → component
source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
target_label = "component"
}
rule { // container name (non-Compose)
source_labels = ["__meta_docker_container_name"]
regex = "/(.*)"
target_label = "container"
}
rule { // fall back to container name as service
source_labels = ["service", "container"]
separator = "@"
regex = "@(.+)"
target_label = "service"
}
}
loki.source.docker "containers" {
host = "unix:///var/run/docker.sock"
targets = discovery.relabel.containers.output
forward_to = [loki.write.default.receiver]
labels = {
hostname = "{{ inventory_hostname }}",
environment = "{{ deployment_environment }}",
}
}
```
**Why this is preferred over syslog listeners:**
- **Zero per-service wiring.** Adding a new Compose project requires no Alloy
change — it is discovered automatically and labelled by its project name.
- **No startup ordering hazard.** It scrapes Docker's default `json-file` log
driver, so containers never block on an Alloy listener being up (contrast the
syslog driver, below).
- **Consistent `{service, component}` schema** across apps, matching the
Prometheus `component` label used by multi-target scrape jobs (app vs web).
**Requirements:**
- The Compose project must use the default **`json-file`** log driver (i.e. it
must *not* set `logging: { driver: syslog }`). The app must log to **stdout**.
- The `alloy` user needs read access to `/var/run/docker.sock` (handled by
`deploy.yml` adding it to the `docker` group on Docker hosts).
- The `service` label is the **Compose project name**, which defaults to the
deploy directory's basename. Confirm it (`docker compose config` → `name:`)
when an alert or dashboard depends on a specific `service=` selector.
### 3. Docker syslog driver (legacy, per-service)
The older pattern: each container ships logs via Docker's `syslog` driver to a
dedicated Alloy `loki.source.syslog` listener on a localhost port, labelled with
a static `job`.
```alloy
loki.source.syslog "kairos_logs" {
listener {
address = "127.0.0.1:{{ kairos_syslog_port }}"
protocol = "tcp"
syslog_format = "{{ syslog_format }}" // rfc3164
labels = {
job = "kairos",
hostname = "{{ inventory_hostname }}",
environment = "{{ deployment_environment }}",
}
}
forward_to = [loki.write.default.receiver]
}
```
Container side, in the service's `docker-compose.yml.j2`:
```yaml
logging:
driver: syslog
options:
syslog-address: "tcp://127.0.0.1:{{ kairos_syslog_port }}"
syslog-format: "{{ syslog_format | default('rfc3164') }}"
```
Ports follow the `514XX` convention and live in the host's `host_vars`.
> ⚠️ **Ordering hazard.** The listener must exist before the container starts.
> If `docker compose up` runs while the Alloy listener is not bound, the
> container fails immediately with `failed to initialize logging driver: dial
> tcp 127.0.0.1:<port>: connect: connection refused`. Deploy/verify Alloy on the
> host *before* deploying a syslog-driver service. This hazard is the main
> reason new services should prefer the Docker-socket path instead.
> **Note — labels differ between the two Docker paths.** The syslog listener
> sets `job="<service>"` (no `service`/`component`). The Docker-socket block
> sets `service="<project>"` + `component="<compose service>"` (no `job`). When
> migrating a service off syslog, update any dashboards or alert annotations
> that filter on `{job="…"}` to use `{service="…"}`.
## Journal relabeling (native services)
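By default all journal entries share `job="systemd"`, making per-service
filtering impossible. A `loki.relabel` component overrides labels based on
journal metadata (the systemd unit or syslog identifier). The journal source
pulls in that component's rules through `relabel_rules` and still forwards
directly to `loki.write`; the relabel component exists only to hold the rules,
which is why its own `forward_to` is empty.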
```alloy
loki.source.journal "systemd_logs" {
forward_to = [loki.write.default.receiver]
relabel_rules = loki.relabel.journal_puck.rules
labels = {
hostname = "{{ inventory_hostname }}",
environment = "{{ deployment_environment }}",
}
}
loki.relabel "journal_puck" {
forward_to = []
rule { // Pallas runtime → service/project schema
source_labels = ["__journal_syslog_identifier"]
regex = "kottos"
target_label = "service"
replacement = "pallas"
}
rule { // default fallback
source_labels = ["__journal__systemd_unit"]
regex = ".+"
target_label = "job"
replacement = "systemd"
}
}
```
Rules run top-to-bottom; the first match per `target_label` wins, so the
generic `systemd` fallback stays **last**. Escape dots in unit regexes
(`alloy\\.service`). The `__journal_*` fields are hidden metadata — used for
relabeling, not shipped to Loki.
## Metrics
On Docker hosts the per-host config also scrapes host and container metrics and
**remote-writes** them to Prometheus (Alloy is the push agent; Prometheus does
not scrape these hosts directly):
- `prometheus.exporter.unix` — node metrics (Incus-safe collectors only).
- `prometheus.exporter.process` → `namedprocess_namegroup_*` per command.
- `prometheus.exporter.cadvisor` → `container_*` metrics via the Docker socket.
These feed `prometheus.scrape` (`job_name` = the host, e.g. `puck`) →
`prometheus.relabel` (adds `instance=<hostname>`) →
`prometheus.remote_write` → `prospero.incus:9090`.
> Application `/metrics` endpoints (e.g. django-prometheus, the
> nginx-prometheus-exporter sidecar) are **not** scraped by Alloy. Prometheus on
> Prospero scrapes those directly — see
> [`pplg/prometheus.yml.j2`](../ansible/pplg/prometheus.yml.j2).
## Current inventory
### Hosts using Docker socket discovery
| Host | Block | Notes |
|------|-------|-------|
| `puck` | `discovery.docker` + `loki.source.docker "containers"` | Reference implementation. Covers all Compose projects (athena, mnemosyne, daedalus, kairos, …) as `service`/`component`. |
### Hosts using per-service syslog listeners
| Host | Services (job labels) |
|------|-----------------------|
| `puck` | angelia, kairos, spelunker, jupyterlab *(transitional — see below)* |
| `miranda` | argos, neo4j-cypher, grafana_mcp, gitea-mcp, searxng |
| `oberon` | rabbitmq, smtp4dev |
| `rosalind` | gitea, hass, lobechat, jellyfin, searxng (+ apache log files) |
| `titania` | casdoor, haproxy |
| `ariel`, `umbriel` | neo4j |
### Transitional state on puck
`athena`, `mnemosyne`, and `daedalus` have **migrated off** their syslog
listeners to the Docker-socket block; their old `*_syslog_port` host_vars are
retained as reserved-but-unused and can be removed once each rollout is
verified. The remaining `puck` syslog listeners (angelia, kairos, spelunker,
jupyterlab) are candidates to migrate the same way.
## Querying in Grafana
```logql
# All Athena container logs (any component)
{service="athena"}
# Just the Athena MCP container
{service="athena", component="mcp"}
# Superuser-login forensic line behind the DjangoSuperuserLogin alert
{service="athena"} |= "event=superuser_login"
# A syslog-driver service (legacy label scheme)
{job="kairos"}
# Errors across everything on one host
{hostname="puck.incus"} |~ "(?i)error"
```
## Adding a new Dockerised service
**Preferred (Docker socket — no Alloy change needed):**
1. Ensure the service's Compose project uses the default `json-file` log driver
(do **not** set `logging: { driver: syslog }`) and the app logs to stdout.
2. Confirm the host's per-host Alloy config has the `discovery.docker` +
`loki.source.docker` blocks (currently `puck`). If not, add them once
(copy from [`puck/config.alloy.j2`](../ansible/alloy/puck/config.alloy.j2)).
3. Deploy the service. Verify in Grafana: `{service="<compose-project>"}`
returns entries, with `component=<compose-service>`.
**Legacy (syslog driver — only if the host has no Docker-socket block):**
1. Allocate a `514XX` syslog port in the host's `host_vars`.
2. Add a `loki.source.syslog` block to `ansible/alloy/<host>/config.alloy.j2`.
3. Add the `syslog` logging driver to the service's `docker-compose.yml.j2`.
4. **Deploy Alloy first**, then the service.
5. Verify: `{job="<label>", hostname="<host>"}` returns entries.
# Red Panda Seal of Approval 🐼

2419
docs/brave_search_api.md Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -9,8 +9,9 @@ This playbook deploys certbot with the Namecheap DNS plugin for DNS-01 validatio
| Installation | Python virtualenv in `/srv/certbot/.venv` |
| DNS Plugin | `certbot-dns-namecheap` |
| Validation | DNS-01 (supports wildcards) |
| Renewal | Systemd timer (twice daily) |
| Certificate Output | `/etc/haproxy/certs/{domain}.pem` |
| Renewal | Systemd timer (twice daily), runs as the `certbot` user |
| Certificate Output | Combined PEM at `haproxy_cert_path` (Titania: `/etc/haproxy/certs/ouranos.pem`) |
| HAProxy Reload | `systemctl reload haproxy` (native systemd, not Docker) |
| Metrics | Prometheus textfile collector |
## Deployments
@@ -69,12 +70,23 @@ services:
# ...
certbot_email: webmaster@helu.ca
certbot_cert_name: ouranos.helu.ca
certbot_domains:
- "*.ouranos.helu.ca"
- "ouranos.helu.ca"
certbot_certificates:
- cert_name: wildcard.ouranos.helu.ca
domains: ["*.ouranos.helu.ca", "ouranos.helu.ca"]
# Where the renewal hook writes the combined fullchain+privkey PEM for HAProxy
haproxy_cert_path: /etc/haproxy/certs/ouranos.pem
```
> The certbot lineage name is **`wildcard.ouranos.helu.ca`**, so the certbot
> config lives under `/srv/certbot/config/live/wildcard.ouranos.helu.ca/`. The
> combined PEM that HAProxy actually serves is a separate file at
> `haproxy_cert_path` (`ouranos.pem`) written by the renewal hook — do not
> confuse the two.
>
> The playbook also supports the single-cert form (`certbot_cert_name` +
> `certbot_domains`) for hosts with one certificate.
### 3. Deploy
```bash
@@ -91,9 +103,9 @@ ansible-playbook certbot/deploy.yml --limit titania.incus
| `/srv/certbot/credentials/namecheap.ini` | Namecheap API credentials (600 perms) |
| `/srv/certbot/hooks/renewal-hook.sh` | Post-renewal script |
| `/srv/certbot/hooks/cert-metrics.sh` | Prometheus metrics script |
| `/etc/haproxy/certs/ouranos.helu.ca.pem` | Combined cert for HAProxy (Titania) |
| `/etc/systemd/system/certbot-renew.service` | Renewal service unit |
| `/etc/systemd/system/certbot-renew.timer` | Twice-daily renewal timer |
| `/etc/haproxy/certs/ouranos.pem` | Combined cert for HAProxy (Titania), written by the renewal hook |
| `/etc/sudoers.d/certbot-haproxy-reload` | Scoped sudo rule letting certbot run `systemctl reload haproxy` |
| `/etc/systemd/system/certbot-renew.service` | Renewal service unit (runs as the `certbot` user) |
| `/etc/systemd/system/certbot-renew.timer` | Twice-daily renewal timer |
## Renewal Process
@@ -105,10 +117,36 @@ ansible-playbook certbot/deploy.yml --limit titania.incus
- Waits 120 seconds for propagation
- Validates and downloads new certificate
- Runs `renewal-hook.sh`
4. Renewal hook:
- Combines fullchain + privkey into HAProxy format
- Reloads HAProxy via `docker compose kill -s HUP haproxy`
- Updates Prometheus metrics
4. Renewal hook (`renewal-hook.sh`, run via certbot's `--deploy-hook`):
- Combines fullchain + privkey into the HAProxy PEM at `haproxy_cert_path`
- Reloads native HAProxy via `sudo -n systemctl reload haproxy`
- Always refreshes Prometheus metrics (even on failure — see below)
> **HAProxy on Titania runs natively under systemd, not in Docker.** The hook
> reloads it with `systemctl reload haproxy`. (Only Casdoor runs in Docker on
> Titania.)
### Permission model (why renewals can silently fail)
The renewal timer runs the hook as the unprivileged **`certbot`** user, so three
permissions must line up or the renewed cert never reaches HAProxy:
| Resource | Required state | Provided by |
|----------|----------------|-------------|
| `/etc/haproxy/certs` | `0770`, group `haproxy`; `certbot` is a member of `haproxy` | `haproxy/deploy.yml` (mode) + `certbot/deploy.yml` (group membership) |
| `systemctl reload haproxy` | allowed for `certbot` via sudo | `/etc/sudoers.d/certbot-haproxy-reload` |
| Prometheus textfile dir | group-writable by `certbot` | `certbot/deploy.yml` |
If any of these is wrong, the hook fails. **Certbot treats a deploy-hook failure
as a non-fatal WARNING and still reports "renewals succeeded"** — so a broken hook
will let the live cert renew while HAProxy keeps serving the *old* file until it
expires. To make this visible, the hook now:
- checks each step and exits non-zero with an explicit
`serving a STALE certificate` error (surfaced in the certbot/journal output), and
- refreshes the Prometheus cert metrics on *every* exit, so the
`SSLCertificateExpiringSoon` / `SSLCertificateExpired` alerts keep reflecting
reality even when installation fails.
## Prometheus Metrics
@@ -137,14 +175,29 @@ Example alert rule:
### View Certificate Status
```bash
# Check certificate expiry (Titania example)
openssl x509 -enddate -noout -in /etc/haproxy/certs/ouranos.helu.ca.pem
# Check expiry of the cert HAProxy actually serves (Titania)
sudo openssl x509 -enddate -noout -in /etc/haproxy/certs/ouranos.pem
# Confirm HAProxy is serving it on the wire
echo | openssl s_client -connect titania.incus:8443 \
-servername grafana.ouranos.helu.ca 2>/dev/null \
| openssl x509 -noout -enddate -issuer
# Check the underlying certbot lineage (may be newer than the served file
# if the deploy hook failed to install it)
sudo openssl x509 -enddate -noout \
-in /srv/certbot/config/live/wildcard.ouranos.helu.ca/fullchain.pem
# Check certbot certificates
sudo -u certbot /srv/certbot/.venv/bin/certbot certificates \
--config-dir /srv/certbot/config
```
> If the served file is older than the certbot lineage, the deploy hook is
> failing to install renewals. Check the hook output:
> `sudo grep -i hook /srv/certbot/logs/letsencrypt.log*` — look for
> `Permission denied`, `reload failed`, or `serving a STALE certificate`.
### Manual Renewal Test
```bash

View File

@@ -374,10 +374,10 @@ MinIO specifically expects certs at `~/.minio/certs/public.crt` and `~/.minio/ce
| Certbot location | On the host itself | OCI free host |
| Namecheap credentials | On the host | Only on OCI host |
| Cert delivery | Direct to HAProxy | Via OCI Vault → Ansible |
| Renewal hook | Docker HAProxy reload | OCI Vault upload |
| Renewal hook | Combine PEM + reload HAProxy | OCI Vault upload |
| Distribution | N/A (local only) | Ansible cron on controller |
| Environments served | Ouranos sandbox only | All environments |
| Service reload | `docker compose kill -s HUP` | `systemctl reload` per host_vars |
| Service reload | `systemctl reload haproxy` (native, via scoped sudo) | `systemctl reload` per host_vars |
Titania can remain self-contained (it's working) or migrate to this centralized model later.

255
docs/iolaus.md Normal file
View File

@@ -0,0 +1,255 @@
# iolaus
Personal agents for Daedalus — powered by [Pallas](https://git.helu.ca/r/pallas).
Iolaus is a pure agent project: Python agent definitions + YAML configuration.
The runtime (serving, registry, health checks, multimodal support) lives in Pallas.
## Architecture
```
Daedalus Backend — FastAPI
│ MCP over StreamableHTTP
Pallas MCP Bridge (pallas.server:main)
│ reads agents.yaml for topology
│ reads fastagent.config.yaml for LLM + model capabilities
├── Registry → /.well-known/mcp/server.json (agent discovery)
├── Shawn → kairos, neo4j_cypher, argos, research, time
├── Watson → argos, neo4j_cypher, time
├── Cristiano → nike, neo4j_cypher, time
├── Nate → periplus, argos, neo4j_cypher, time
├── David → orpheus, argos, neo4j_cypher, research, time
├── Research → argos, neo4j_cypher
├── Tech Research → context7, github, argos
└── Mikael → argos, time (news briefings; reads `news:` config)
```
## Project Structure
```
.
├── agents.yaml # Deployment topology — agents, ports, host, namespace
├── fastagent.config.yaml # LLM provider, MCP servers, model capabilities (committed)
├── fastagent.secrets.yaml # API keys and tokens (gitignored — never commit)
├── agents/ # Agent definitions (FastAgent @fast.agent decorators)
│ ├── shawn.py
│ ├── watson.py
│ ├── cristiano.py
│ ├── nate.py
│ ├── david.py
│ ├── research.py
│ └── tech_research.py
├── systemd/
│ └── iolaus.service
├── pyproject.toml
└── .env.example
```
## Agents
| Agent | Port | MCP URL | Purpose |
|-------|------|---------|---------|
| Shawn | 24001 | `http://puck.incus:24001/mcp` | Personal general assistant — calendar, contacts, email, and daily life |
| Watson | 24005 | `http://puck.incus:24005/mcp` | Relationship memory & emotional safety — reflection, values, habits, emotional experiences, dialogue notes |
| Cristiano | 24006 | `http://puck.incus:24006/mcp` | Football analyst — live data, match tracking, tactics |
| Nate | 24007 | `http://puck.incus:24007/mcp` | Travel and adventure companion — trip planning, navigation |
| David | 24008 | `http://puck.incus:24008/mcp` | Arts & culture — music, film, art, fashion, and Kawai piano |
| Infrastructure | 24050 | `http://puck.incus:24050/mcp` | Shell, git, and Grafana router |
| Research | 24051 | `http://puck.incus:24051/mcp` | Web search + knowledge graph chain |
| Tech Research | 24052 | `http://puck.incus:24052/mcp` | Technical investigation — library docs, code examples, API comparisons |
| Mikael | 24053 | `http://puck.incus:24053/mcp` | News briefings — topic-driven, source-verified, image-rich |
| Registry | 24000 | `http://puck.incus:24000/.well-known/mcp/server.json` | Agent discovery |
## Configuration
### `agents.yaml` — Deployment Topology
Single source of truth for agent names, ports, dependencies, host, and namespace.
Read by Pallas at startup.
```yaml
name: iolaus
version: "1.0.0"
host: puck.incus
namespace: ca.helu.iolaus
registry_port: 24000
agents:
shawn:
module: agents.shawn
port: 24001
title: Shawn
description: "Personal general assistant — calendar, contacts, email, and daily life management"
depends_on: [research]
# ...
```
To deploy a different agent group, swap `agents.yaml` — no code changes needed.
Override the config path with `PALLAS_AGENTS_CONFIG` env var.
### `fastagent.config.yaml` — LLM + Model Capabilities
Committed to the repo. Contains LLM provider settings and explicit model capability
declarations.
```yaml
default_model: generic.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf
model_capabilities:
vision: false
context_window: 40960
max_output_tokens: 8192
```
The `model_capabilities` section declares capabilities explicitly rather than
inferring from the model name. Exposed in the registry for Daedalus to use when
routing requests.
### `news:` — Mikael News Agent Configuration
Top-level block in `fastagent.config.yaml`, consumed by `agents/mikael.py` at
startup. Edits take effect on restart — no code changes needed to tweak
topics or sources.
```yaml
news:
topics:
- Canadian federal politics
- Generative AI and LLM research
# ... add/remove freely
preferred_sources: # seeded into queries, not exclusive
- reuters.com
- apnews.com
- cbc.ca
avoided_sources: # excluded via -site: AND post-filtered by host
- foxnews.com
- breitbart.com
- dailymail.co.uk
default_lookback_hours: 24
max_items_per_topic: 5
```
Mikael excludes `avoided_sources` from search results with `-site:` operators
*and* post-filters by hostname as a second line of defence. It will never
summarize or cite content from an avoided source, even if another outlet
syndicates the claim.
### `fastagent.secrets.yaml` — API Keys and Tokens
Gitignored — never commit. Place in the repo root alongside `fastagent.config.yaml`.
```yaml
openai:
api_key: "your-key-here"
mcp:
servers:
angelia:
headers:
Authorization: "Bearer your-token"
kairos:
headers:
Authorization: "Bearer your-token"
periplus:
headers:
Authorization: "Bearer your-token"
nike:
headers:
Authorization: "Bearer your-token"
# ...
```
## Quickstart
```bash
# 1. Install dependencies (Python 3.13 required)
source ~/env/iolaus/bin/activate
pip install -e .
# 2. Configure secrets
cp fastagent.secrets.yaml.example fastagent.secrets.yaml
# Edit: set api_key and service tokens
# 3. Start all agents
iolaus
# 4. Verify
curl http://localhost:24001/mcp
# 5. Start a single agent
iolaus --agent shawn
```
## Daedalus Integration
Daedalus connects to agents via the MCP Python SDK's `streamable_http_client`.
Registry endpoint: `http://puck.incus:24000/.well-known/mcp/server.json`
The registry includes model capabilities on each agent entry:
```json
{
"capabilities": {
"model": "Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf",
"vision": false,
"context_window": 40960,
"max_output_tokens": 8192
}
}
```
## Deployment (systemd)
```bash
# 1. Copy project to /srv/iolaus
sudo cp -r . /srv/iolaus
sudo chown -R iolaus:iolaus /srv/iolaus
# 2. Install into venv
cd /srv/iolaus
source ~/env/iolaus/bin/activate
pip install -e .
# 3. Configure secrets
cp fastagent.secrets.yaml.example fastagent.secrets.yaml
# Fill in API keys and tokens
# 4. Install and start systemd service
sudo cp systemd/iolaus.service /etc/systemd/system/
sudo systemctl daemon-reload
sudo systemctl enable --now iolaus
sudo systemctl status iolaus
```
## Downstream MCP Servers
| Server | Host | URL |
|--------|------|-----|
| argos | miranda.incus | `http://miranda.incus:25534/mcp` |
| neo4j_cypher | circe.helu.ca | `http://circe.helu.ca:22034/mcp` |
| kernos | caliban.incus | `http://caliban.incus:22021/mcp` |
| rommie | caliban.incus | `http://caliban.incus:22031/mcp` |
| gitea | miranda.incus | `http://miranda.incus:25535/mcp` |
| grafana | miranda.incus | `http://miranda.incus:25533/mcp` |
| korax | korax.helu.ca | `http://korax.helu.ca:22021/mcp` |
| orpheus | orpheus.helu.ca | `https://orpheus.helu.ca/mcp` |
| angelia | ouranos.helu.ca | `https://ouranos.helu.ca/mcp/` |
| kairos | ouranos.helu.ca | `https://kairos.ouranos.helu.ca/mcp/` |
| periplus | ouranos.helu.ca | `https://periplus.ouranos.helu.ca/mcp/` |
| nike | ouranos.helu.ca | `https://nike.ouranos.helu.ca/mcp/` |
| github | local (Docker stdio) | `ghcr.io/github/github-mcp-server` |
| context7 | local (stdio) | `npx -y @upstash/context7-mcp` |
| time | local (stdio) | `mcp-server-time` |
## Notes
- **Python 3.13** required (`fast-agent-mcp` pins `>=3.13`)
- **Runtime:** [Pallas](https://git.helu.ca/r/pallas) — `pallas-mcp @ git+ssh://git@git.helu.ca:22022/r/pallas.git`
- **Transport:** StreamableHTTP (`/mcp`) throughout — not SSE
- **LLM:** Local Qwen via fast-agent's Generic (OpenAI-compatible) provider at
`http://nyx.helu.ca:22079/v1`
- **Logging:** Console output — stdout → syslog → Alloy → Loki in production
- **Port scheme:** registry at 24000, personal agents 24001–24049, sub-agents 24050–24099

283
docs/kottos.md Normal file
View File

@@ -0,0 +1,283 @@
# kottos
Engineering agents for Daedalus — powered by [Pallas](https://git.helu.ca/r/pallas).
Kottos is a pure agent project: Python agent definitions + YAML configuration.
The runtime (serving, registry, health checks, multimodal support) lives in Pallas.
## Architecture
```
Daedalus Backend — FastAPI
│ MCP over StreamableHTTP
Pallas MCP Bridge (pallas.server:main)
│ reads agents.yaml for topology
│ reads fastagent.config.yaml for LLM + model capabilities
├── Registry → /.well-known/mcp/server.json (agent discovery)
├── Harper → kernos_harper, gitea, argos, neo4j_cypher, grafana,
│ rommie, angelia, time, research, tech_research
├── Scotty → kernos_scotty, argos, tech_research, neo4j_cypher, grafana, time
├── Research → argos, neo4j_cypher
└── Tech Research → context7, github, argos
```
## Project Structure
```
.
├── agents.yaml # Deployment topology — agents, ports, host, namespace
├── fastagent.config.yaml # LLM provider, MCP servers, model capabilities (committed)
├── fastagent.secrets.yaml # API keys and tokens (gitignored — never commit)
├── fastagent.secrets.yaml.example
├── agents/ # Agent definitions (FastAgent @fast.agent decorators)
│ ├── harper.py
│ ├── scotty.py
│ ├── research.py
│ └── tech_research.py
├── docs/
│ └── pallas_integration.md
├── pyproject.toml
└── LICENSE
```
## Agents
| Agent | Port | MCP URL | Purpose |
|-------|------|---------|---------|
| Harper | 24101 | `http://puck.incus:24101/mcp` | Scrappy engineer — rapid prototyping, hacking, and creative problem-solving |
| Scotty | 24102 | `http://puck.incus:24102/mcp` | Systems administration — infrastructure diagnostics and security hardening |
| Research | 24150 | `http://puck.incus:24150/mcp` | Web search + knowledge graph chain |
| Tech Research | 24151 | `http://puck.incus:24151/mcp` | Technical investigation — library docs, code examples, API comparisons |
| Registry | 24100 | `http://puck.incus:24100/.well-known/mcp/server.json` | Agent discovery |
## Configuration
### `agents.yaml` — Deployment Topology
Single source of truth for agent names, ports, dependencies, host, and namespace.
Read by Pallas at startup.
```yaml
name: kottos
version: "1.0.0"
host: puck.incus
namespace: ca.helu.kottos
registry_port: 24100
agents:
harper:
module: agents.harper
port: 24101
title: Harper
description: "Scrappy engineer — rapid prototyping, hacking, and creative problem-solving"
depends_on: [research, tech_research]
# ...
```
To deploy a different agent group, swap `agents.yaml` — no code changes needed.
Override the config path with `PALLAS_AGENTS_CONFIG` env var.
### `fastagent.config.yaml` — LLM + Model Capabilities
Committed to the repo. Contains LLM provider settings and explicit model capability
declarations.
In Ansible-managed deployments this file is replaced by the
`fastagent.config.yaml.j2` template which renders environment-specific values
for model, MCP URLs, etc.
```yaml
default_model: generic.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf
model_capabilities:
vision: false
context_window: 192000
max_output_tokens: 16384
```
The `model_capabilities` section declares capabilities explicitly rather than
inferring from the model name. Exposed in the registry for Daedalus to use when
routing requests.
### `fastagent.secrets.yaml` — API Keys and Tokens
Gitignored — never commit. Place in the repo root alongside `fastagent.config.yaml`.
In Ansible-managed deployments this file is replaced by the
`fastagent.secrets.yaml.j2` template which renders secrets from OCI Vault.
```yaml
openai:
api_key: "your-key-here"
mcp:
servers:
angelia:
headers:
Authorization: "Bearer your-token"
github:
env:
GITHUB_PERSONAL_ACCESS_TOKEN: "your-token"
# ...
```
## Quickstart
```bash
# 1. Install dependencies (Python 3.13 required)
source ~/env/kottos/bin/activate
pip install -e .
# 2. Configure secrets
cp fastagent.secrets.yaml.example fastagent.secrets.yaml
# Edit: set api_key and service tokens
# 3. Start all agents
kottos
# 4. Verify
curl http://localhost:24101/mcp
# 5. Start a single agent
kottos --agent harper
```
## Daedalus Integration
Daedalus connects to agents via the MCP Python SDK's `streamable_http_client`.
Registry endpoint: `http://puck.incus:24100/.well-known/mcp/server.json`
The registry includes model capabilities on each agent entry:
```json
{
"capabilities": {
"model": "Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf",
"vision": false,
"context_window": 192000,
"max_output_tokens": 16384
}
}
```
## Deployment
Kottos runs two ways:
1. **Locally on caliban**, hand-started for iteration (`kottos` from the repo root). This is the flow documented above in *Quickstart*.
2. **In Ouranos / Virgo / Taurus via Ansible**, as a `systemd`-managed `pallas` process on the puck.incus container. This is the pipeline that feeds the Puck Services dashboard in Grafana.
### Ansible role
Lives in `ouranos/ansible/kottos/`:
| File | Purpose |
|---|---|
| `deploy.yml` | Main playbook — user/group, venv, systemd unit, config templating, registry probe. |
| `stage.yml` | Clones `git.helu.ca/r/kottos` at `{{ kottos_rel }}` and creates the release tarball. |
| `kottos.service.j2` | systemd unit. `SyslogIdentifier=kottos`, `StandardOutput=journal`, `PALLAS_LOG_STDOUT=1` via the env file. |
| `.env.j2` | Runtime environment for `pallas` — logging config, `PALLAS_AGENTS_CONFIG`. |
| `agents.yaml.j2` | Deployment topology with host/ports pulled from inventory. |
| `fastagent.config.yaml.j2` | LLM provider + MCP server URLs, parametric per environment. |
| `fastagent.secrets.yaml.j2` | API keys and auth tokens, rendered from Ansible Vault. |
### Inventory
Host variables live in `inventory/host_vars/puck.incus.yml` under **Kottos Configuration**:
```yaml
kottos_user: kottos
kottos_group: kottos
kottos_directory: /srv/kottos
kottos_host: "puck.incus"
kottos_registry_port: 24100
kottos_harper_port: 24101
kottos_scotty_port: 24102
kottos_research_port: 24150
kottos_tech_research_port: 24151
pallas_log_level: INFO
# Local Qwen served via fast-agent's Generic (OpenAI-compatible) provider.
# The openai_base_url slot is reserved for cloud OpenAI endpoints (e.g.
# Bedrock Mantle Chat Completions).
kottos_default_model: "generic.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
kottos_generic_base_url: "http://nyx.helu.ca:22079/v1"
# ...plus one entry per downstream MCP URL so each environment overrides freely
```
Every host variable is parametric — Virgo's `puck.virgo.yml` (or wherever the Pallas host lives) can override any value without touching the templates.
### Vault
Four vault keys required — all documented in `inventory/group_vars/all/vault.yml.example`:
| Key | Used for |
|---|---|
| `vault_kottos_openai_api_key` | OpenAI-compatible LLM endpoint (nyx Qwen in Ouranos). |
| `vault_kottos_github_pat` | `GITHUB_PERSONAL_ACCESS_TOKEN` for the local GitHub MCP Docker container. |
| `vault_kottos_angelia_bearer` | Bearer token accepted by the Angelia MCP server. |
| `vault_kottos_mnemosyne_jwt` | Long-lived team JWT from Daedalus admin UI — Mnemosyne validates it on every `search_memory` call and scopes results to this team's workspaces. |
### Deploying
Wired into `site.yml`:
```bash
cd ansible
ansible-playbook kottos/stage.yml # clone repo + build tarball (local)
ansible-playbook kottos/deploy.yml # deploy + template + start
```
Or run the full site (`ansible-playbook site.yml`) — kottos's stage + deploy steps are the last block in the sequence.
### Logs
Journal identifier `kottos`, so on the host:
```bash
sudo journalctl -u kottos -f --output=cat | jq .
```
Alloy on puck's journal source relabels `__journal_syslog_identifier=kottos` to `{service="pallas", project="kottos"}`, then into Loki. Everything shows up in Grafana's *Puck Services — Logs & Health* dashboard under the **Pallas** row, with per-agent colouring driven by the `component` JSON field (`harper`, `scotty`, `research`, `tech_research`).
For per-agent follow-along:
```logql
{service="pallas", project="kottos", component="harper"} | json
```
For the opaque-MCP-transport-failure trace stream (see Pallas's bearer-forwarding incident history):
```logql
{service="pallas", project="kottos"} |= "pallas.forward.trace" | json
```
See [logging.md](logging.md) for the full label schema + level policy + add-a-new-service guide.
## Downstream MCP Servers
| Server | Host | URL |
|--------|------|-----|
| argos | miranda.incus | `http://miranda.incus:25534/mcp` |
| neo4j_cypher | circe.helu.ca | `http://circe.helu.ca:22034/mcp` |
| caliban | caliban.incus | `http://caliban.incus:22062/mcp` |
| rommie | caliban.incus | `http://caliban.incus:22061/mcp` |
| gitea | miranda.incus | `http://miranda.incus:25535/mcp` |
| grafana | miranda.incus | `http://miranda.incus:25533/mcp` |
| korax | korax.helu.ca | `http://korax.helu.ca:20261/mcp` |
| angelia | ouranos.helu.ca | `https://ouranos.helu.ca/mcp/` |
| github | local (Docker stdio) | `ghcr.io/github/github-mcp-server` |
| context7 | local (stdio) | `npx -y @upstash/context7-mcp` |
| time | local (stdio) | `mcp-server-time` |
## Notes
- **Python 3.13** required (`fast-agent-mcp` pins `>=3.13`)
- **Runtime:** [Pallas](https://git.helu.ca/r/pallas) — `pallas-mcp @ git+ssh://git@git.helu.ca:22022/r/pallas.git`
- **Transport:** StreamableHTTP (`/mcp`) throughout — not SSE
- **LLM:** Local Qwen via fast-agent's Generic (OpenAI-compatible) provider at
`http://nyx.helu.ca:22079/v1`
- **Logging:** Console output — stdout → journald → Alloy → Loki in production
- **Port scheme:** registry at 24100, agents 24101–24149, sub-agents 24150–24199

173
docs/logging.md Normal file
View File

@@ -0,0 +1,173 @@
# Unified Logging — Mnemosyne, Pallas, Daedalus
PPLG is the single destination for every service's logs. This document describes the label schema every service emits, the two transports Alloy uses to collect logs, and the level policy that keeps INFO output actionable.
The three in-scope services today are **Mnemosyne**, **Pallas** (running as Kottos/Mentor/Iolaus), and **Daedalus**. The same patterns generalise to any future service that deploys on a `docker`-enabled host or under `systemd+journald`.
## Label schema
Every Loki log stream carries these labels, and nothing else:
| Label | Example values | Source |
|---|---|---|
| `service` | `mnemosyne`, `pallas`, `daedalus`, `athena`, `kairos`, `angelia` | Docker compose project name (container logs) **or** explicit systemd relabel rule (journal logs) |
| `component` | `app`, `mcp`, `worker`, `nginx`, `harper`, `scotty`, `research`, `tech_research` | Docker compose service name **or** per-agent `ContextVar` (Pallas) |
| `project` | `kottos` (Pallas only) | `agents.yaml` `name:` field read by `pallas.log.set_project()` |
| `hostname` | `puck.incus`, `caliban.incus` | Alloy's `inventory_hostname` template var |
| `environment` | `ouranos`, `virgo`, `taurus` | `deployment_environment` from Ansible group_vars |
**Everything else is a JSON field in the log body**, not a label. That includes `level`, `logger`, `funcName`, `lineno`, `message`, `request_id`, `workspace_id`, `agent`, `tool`, `duration_ms`, and any `extra={...}` kwargs the application passed in. LogQL's `| json` pipeline parses these on-query — keeping them out of the label index is what keeps Loki fast.
## Level policy
Same rules for every service. Health-check `200 OK`s live in DEBUG, never in INFO.
| Level | Meaning |
|---|---|
| `ERROR` | Broken; requires human attention. |
| `WARNING` | Degraded but self-recovering — retries, skipped items, missing optional config. |
| `INFO` | Lifecycle events and failures. Start, ready, shutdown, preflight, LLM provider validation. 200 OKs on health endpoints are **not** INFO. |
| `DEBUG` | Per-request detail, successful health probes, verbose traces. Enable on demand when troubleshooting. |
Mnemosyne enforces this with `mnemosyne.log_filters.SuppressHealthAccessFilter` on Django/gunicorn access loggers; Pallas with `_HealthAccessFilter` on `uvicorn.access`; Daedalus with the equivalent filter in `daedalus.logging`.
## Two transports, one Alloy
Alloy on each host uses exactly two sources for application logs. Pick whichever matches the service's runtime model — **don't** invent a third.
### 1. Docker socket (for compose projects)
`discovery.docker` enumerates every running container, and `loki.source.docker` tails their stdout via the `json-file` driver. Compose project → `service` label, compose service → `component` label. One block covers every compose project on the host, current and future.
**Requirements on the service side:**
- Emit JSON lines to **stdout**, one per log record. Mnemosyne uses `python-json-logger`; Daedalus uses `structlog`; any Python service can do the same.
- Pin the logging driver to `json-file` with bounded rotation in `docker-compose.yaml`:
```yaml
x-logging: &default-logging
driver: json-file
options:
tag: "{{.Name}}"
max-size: "10m"
max-file: "5"
services:
app:
# ...
logging: *default-logging
```
`json-file` is Docker's default, but pinning it defensively guarantees Alloy sees the same driver on every host.
- On the Alloy host, the `alloy` user must be in the `docker` group to read `/var/run/docker.sock`. The `ouranos/ansible/alloy/` role handles this.
### 2. Systemd journal (for systemd-managed units)
`loki.source.journal` tails journald. A `loki.relabel "journal_<host>"` block translates `__journal_syslog_identifier` → `service` / `project` labels so Pallas-managed agents land alongside Docker-based services with the same schema.
**Requirements on the service side:**
- Emit JSON to **stdout** (journald captures it with `PRIORITY=6` INFO by default).
- The systemd unit must set a distinctive `SyslogIdentifier=` — the Alloy relabel block keys off this.
- Under Pallas, set `PALLAS_LOG_STDOUT=1` in the unit's `EnvironmentFile`. Also set `PALLAS_LOG_FILE=/dev/null` to disable the rotating file sink (journald is already durable).
Example, from `ouranos/ansible/kottos/kottos.service.j2`:
```ini
[Service]
...
EnvironmentFile=/srv/kottos/.env
ExecStart=/srv/kottos/.venv/bin/pallas
StandardOutput=journal
StandardError=journal
SyslogIdentifier=kottos
```
And the matching Alloy relabel rule on puck:
```alloy
loki.relabel "journal_puck" {
forward_to = []
rule {
source_labels = ["__journal_syslog_identifier"]
regex = "kottos"
target_label = "service"
replacement = "pallas"
}
rule {
source_labels = ["__journal_syslog_identifier"]
regex = "kottos"
target_label = "project"
replacement = "kottos"
}
// ...
}
```
## Per-service reference
### Mnemosyne (Docker compose on puck)
- Logging config: `mnemosyne/mnemosyne/mnemosyne/settings.py` → `LOGGING` dict using `pythonjsonlogger.json.JsonFormatter`.
- Component attribution: `MNEMOSYNE_COMPONENT` env var set per docker-compose service (`init`, `app`, `mcp`, `worker`). The settings module reads it into `static_fields.component`.
- Health-filter: `mnemosyne.log_filters.SuppressHealthAccessFilter` on the `access` handler.
- Metrics: `/metrics` on the nginx container (port 23181) — served by django-prometheus on the app container plus `mcp_server.metrics` (shared `prometheus_client` registry).
- Scrape job: `mnemosyne` (see `ouranos/ansible/pplg/prometheus.yml.j2`).
- Alerts: `mnemosyne_alerts` group in `ouranos/ansible/pplg/alert_rules.yml.j2`.
### Pallas — Kottos (systemd on puck via Ansible role `ouranos/ansible/kottos/`)
- Logging config: `pallas/pallas/log.py` → `setup_logging()` with `PALLAS_LOG_STDOUT=1`.
- Component attribution: `pallas.log.set_agent_component(name)` is called by `_start_agent()` inside each agent's asyncio task, setting a `contextvars.ContextVar` that the `_StaticFieldsFilter` reads per record. Each agent (harper, scotty, research, tech_research) carries its own value without leaking across tasks.
- Project attribution: `pallas.log.set_project(deploy_name)` is called once in `main()` from `agents.yaml`'s `name:`. For Kottos this renders as `project="kottos"` on every record.
- Deployed by: `ansible-playbook kottos/deploy.yml` (wired into `site.yml`).
- Metrics: none today — Pallas is observed through logs only. Future phase will add a `prometheus_client` endpoint on the registry port for `pallas_agent_requests_total{agent=…}`, `pallas_downstream_mcp_errors_total{server=…}`.
### Daedalus (Docker compose on puck)
- Logging config: `daedalus/backend/daedalus/logging.py` — `structlog` JSON processor chain, already production-ready.
- Component attribution: `structlog.contextvars.bind_contextvars(service="daedalus", component="api")` at app startup.
- Health-filter: `_SuppressHealthAccessFilter` on uvicorn's access logger.
- Metrics: `/metrics` on the api container (port 22181).
- Scrape job: `daedalus`.
- Alerts: `daedalus_alerts` group.
## Useful LogQL queries
Once the pipeline is live, the "troubleshooting is a nightmare" problem becomes three-click queries in Grafana Explore:
```logql
# All Mnemosyne errors in the last 15m
{service="mnemosyne"} | json | level="ERROR"
# Everything Harper did in the last hour
{service="pallas", project="kottos", component="harper"} | json
# The infamous pallas.forward.trace stream (MCP transport failures)
{service="pallas", project="kottos"} |= "pallas.forward.trace"
# Cross-service trace of a single request (requires X-Request-Id propagation
# — not yet implemented; Phase 1.5 nice-to-have)
{environment="ouranos"} | json | request_id="<paste-id>"
# Error spike in Daedalus by path (rate of ERROR-level records, a proxy for 5xx responses)
sum by (path) (rate({service="daedalus"} | json | level="ERROR" [5m]))
```
The **Puck Services — Logs & Health** dashboard in Grafana (`/etc/grafana/provisioning/dashboards/puck.yaml` → `/var/lib/grafana/dashboards/puck_services.json`) has these pre-wired as panels per service row.
## Adding a new service
If you're adding a service to puck (or any Ouranos/Virgo host with this stack):
1. **Emit JSON to stdout** with `service`/`component` as static fields. Copy Mnemosyne's settings pattern or Pallas's `_StaticFieldsFilter`.
2. **Pick a transport:**
- Docker compose → add the `x-logging: &default-logging` anchor + `logging: *default-logging` on each service. Done. No Alloy changes needed.
- systemd → set `SyslogIdentifier=<name>` on the unit and add two relabel rules (one setting `service`, one setting `project`) to the host's `loki.relabel "journal_<host>"` block.
3. **Expose `/metrics`** if the service is in Python — `prometheus_client` plus either `django-prometheus` or `prometheus_fastapi_instrumentator`.
4. **Add a scrape job** in `ouranos/ansible/pplg/prometheus.yml.j2` (parametrise the target — `{{ <service>_metrics_host }}:{{ <service>_metrics_port }}`) and wire the defaults into the host's `host_vars`.
5. **Add alerts** in `ouranos/ansible/pplg/alert_rules.yml.j2`. At minimum: `Down`, `HighErrorRate`. Use the metric names the service actually exposes — no dead rules.
6. **Optional**: add panels to the Puck Services dashboard JSON.
No new transport. No per-service Alloy block. No custom log format.

218
docs/mentor.md Normal file
View File

@@ -0,0 +1,218 @@
# mentor
Work agents for Daedalus — powered by [Pallas](https://git.helu.ca/r/pallas).
Mentor is a pure agent project: Python agent definitions + YAML configuration.
The runtime (serving, registry, health checks, multimodal support) lives in Pallas.
## Architecture
```
Daedalus Backend — FastAPI
│ MCP over StreamableHTTP
Pallas MCP Bridge (pallas.server:main)
│ reads agents.yaml for topology
│ reads fastagent.config.yaml for LLM + model capabilities
├── Registry → /.well-known/mcp/server.json (agent discovery)
├── Jarvis → kernos, rommie, argos, neo4j_cypher, athena, time
├── Jeffrey → neo4j_cypher, athena, research, time
├── Ann → research, argos, neo4j_cypher, athena, angelia, time
├── Alan → research, argos, athena, neo4j_cypher, time
├── AWS SA → aws_knowledge, aws_docs, aws_pricing, argos, context7
├── Research → argos, neo4j_cypher
└── Tech Research → context7, github, argos
```
## Project Structure
```
.
├── agents.yaml # Deployment topology — agents, ports, host, namespace
├── fastagent.config.yaml # LLM provider, MCP servers, model capabilities (committed)
├── fastagent.secrets.yaml # API keys and tokens (gitignored — never commit)
├── agents/ # Agent definitions (FastAgent @fast.agent decorators)
│ ├── jarvis.py
│ ├── jeffrey.py
│ ├── ann.py
│ ├── alan.py
│ ├── aws_sa.py
│ ├── research.py
│ └── tech_research.py
├── systemd/
│ └── mentor.service
├── pyproject.toml
└── .env.example
```
## Configuration
### `agents.yaml` — Deployment Topology
Single source of truth for agent names, ports, dependencies, host, and namespace.
Read by Pallas at startup.
```yaml
name: mentor
version: "1.0.0"
host: puck.incus
namespace: ca.helu.mentor
registry_port: 24200
agents:
jarvis:
module: agents.jarvis
port: 24201
title: Jarvis
description: "Work execution assistant — task management, meetings, daily operations"
depends_on: [research]
# ...
```
To deploy a different agent group, swap `agents.yaml` — no code changes needed.
Override the config path with `PALLAS_AGENTS_CONFIG` env var.
### `fastagent.config.yaml` — LLM + Model Capabilities
Committed to the repo. Contains LLM provider settings and explicit model capability
declarations.
```yaml
default_model: openai.global.anthropic.claude-opus-4-6-v1
model_capabilities:
vision: false
context_window: 200000
max_output_tokens: 32000
```
The `model_capabilities` section declares capabilities explicitly rather than
inferring from the model name. Exposed in the registry for Daedalus to use when
routing requests (e.g., vision-capable agents).
### `fastagent.secrets.yaml` — API Keys and Tokens
Gitignored — never commit. Place in the repo root alongside `fastagent.config.yaml`.
```yaml
openai:
api_key: "your-key-here"
mcp:
servers:
angelia:
headers:
Authorization: "Bearer your-token"
# ...
```
## Quickstart
### Prerequisites
- **Python 3.13+** (`fast-agent-mcp` pins `>=3.13`)
- **Node.js / npm** — for the `context7` stdio MCP server (`npx -y @upstash/context7-mcp`)
- **Docker** — for the `github` stdio MCP server
- **`uv` / `uvx`** at `/usr/local/bin/uvx` — required by the `aws_docs` and
`aws_pricing` stdio MCP servers. fast-agent's stdio client sanitizes `PATH`
before spawning subprocesses (via `mcp.client.stdio.get_default_environment()`),
so `fastagent.config.yaml` references `uvx` by absolute path. Install
system-wide — this matches the Ansible production deploy
(`virgo/ansible/mentor/deploy.yml`) so dev and prod configs are identical:
```bash
sudo sh -c 'curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh'
```
Do **not** use the default per-user install (`~/.local/bin/uvx`) — the config
will not find it at runtime under systemd or under the sanitized `PATH` that fast-agent's stdio client passes to subprocesses.
```bash
# 1. Install dependencies (Python 3.13 required)
source ~/env/mentor/bin/activate
pip install -e .
# 2. Configure secrets
cp fastagent.secrets.yaml.example fastagent.secrets.yaml
# Edit: set api_key and service tokens
# 3. Start all agents
mentor
# 4. Verify
curl http://localhost:24201/mcp
# 5. Start a single agent
mentor --agent jarvis
```
## Daedalus Integration
Daedalus connects to agents via the MCP Python SDK's `streamable_http_client`.
Registry endpoint: `http://<host>:<registry_port>/.well-known/mcp/server.json`
The registry includes model capabilities on each agent entry:
```json
{
"capabilities": {
"model": "global.anthropic.claude-opus-4-6-v1",
"vision": false,
"context_window": 200000,
"max_output_tokens": 32000
}
}
```
## Deployment (systemd)
```bash
# 1. Copy project to /srv/mentor
sudo cp -r . /srv/mentor
sudo chown -R mentor:mentor /srv/mentor
# 2. Install into venv
cd /srv/mentor
source ~/env/mentor/bin/activate
pip install -e .
# 3. Configure secrets
cp fastagent.secrets.yaml.example fastagent.secrets.yaml
# Fill in API keys and tokens
# 4. Install and start systemd service
sudo cp systemd/mentor.service /etc/systemd/system/
sudo systemctl daemon-reload
sudo systemctl enable --now mentor
sudo systemctl status mentor
```
## Downstream MCP Servers
| Server | Host | URL |
|--------|------|-----|
| argos | miranda.incus | `http://miranda.incus:25534/mcp` |
| neo4j_cypher | circe.helu.ca | `http://circe.helu.ca:22034/mcp` |
| kernos | caliban.incus | `http://caliban.incus:22021/mcp` |
| rommie | caliban.incus | `http://caliban.incus:22031/mcp` |
| gitea | miranda.incus | `http://miranda.incus:25535/mcp` |
| grafana | miranda.incus | `http://miranda.incus:25533/mcp` |
| angelia | ouranos.helu.ca | `https://ouranos.helu.ca/mcp/` |
| athena | ouranos.helu.ca | `https://athena.ouranos.helu.ca/mcp/` |
| kairos | ouranos.helu.ca | `https://kairos.ouranos.helu.ca/mcp/` |
| github | local (Docker stdio) | `ghcr.io/github/github-mcp-server` |
| context7 | local (stdio) | `npx -y @upstash/context7-mcp` |
| time | local (stdio) | `mcp-server-time` |
| aws_knowledge | knowledge-mcp.global.api.aws | `https://knowledge-mcp.global.api.aws` |
| aws_docs | local (stdio) | `uvx awslabs.aws-documentation-mcp-server@latest` |
| aws_pricing | local (stdio) | `uvx awslabs.aws-pricing-mcp-server@latest` |
## Notes
- **Python 3.13** required (`fast-agent-mcp` pins `>=3.13`)
- **Runtime:** [Pallas](https://git.helu.ca/r/pallas) — `pallas-mcp @ git+ssh://git@git.helu.ca:22022/r/pallas.git`
- **Transport:** StreamableHTTP (`/mcp`) throughout — not SSE
- **Logging:** Console output — stdout → syslog → Alloy → Loki in production
- **Port scheme:** registry at `registry_port`, agents at configured ports in `agents.yaml`

View File

@@ -4,10 +4,19 @@
Neo4j is a high-performance graph database providing native graph storage and processing. It enables efficient traversal of complex relationships and is used for knowledge graphs, recommendation engines, and connected data analysis. Deployed with the **APOC plugin** enabled for extended stored procedures and functions.
**Host:** ariel.incus
**Role:** graph_database
**Container Port:** 25554 (HTTP Browser), 7687 (Bolt)
**External Access:** Direct Bolt connection via `ariel.incus:7687`
Two dedicated Neo4j instances run in the Ouranos lab, one per tenant, because
Neo4j Community Edition is single-database and tenants cannot safely share
label space, vector indexes, or schema migrations:
| Host | Tenant | HTTP Browser | Bolt |
|------|--------|--------------|------|
| `ariel.incus` | Shared / general graph work (Neo4j MCP, exploration) | port 25554 | port 7687 |
| `umbriel.incus` | Mnemosyne (dedicated — `Library`/`Collection`/`Item`/`Chunk`/`Concept`) | port 25555 | port 7687 |
Both hosts run the same Ansible playbook (`neo4j/deploy.yml`) from the same
`docker-compose.yml.j2` template, differing only by port and vault password.
They run independent Docker Compose stacks with their own named volumes
(`neo4j_data`, `neo4j_logs`, `neo4j_plugins`) — no shared state.
## Architecture
@@ -22,32 +31,50 @@ Neo4j is a high-performance graph database providing native graph storage and pr
└────────────▶│ Neo4j Browser│
│ HTTP :25554 │
└──────────────┘
┌──────────────┐ ┌──────────────┐
│ Mnemosyne │─────▶│ Neo4j │
│ (puck) │ Bolt │ (Umbriel) │
└──────────────┘ └──────────────┘
┌──────────────┐
│ Neo4j Browser│
│ HTTP :25555 │
└──────────────┘
```
- **Neo4j Browser**: Web-based query interface on port 25554
- **Bolt Protocol**: Binary protocol on port 7687 for high-performance connections
- **Neo4j Browser (Ariel)**: Web-based query interface on port 25554
- **Neo4j Browser (Umbriel)**: Web-based query interface on port 25555
- **Bolt Protocol**: Binary protocol on port 7687 for high-performance connections (same port on both hosts — each container has its own network namespace)
- **APOC Plugin**: Extended procedures for import/export, graph algorithms, and utilities
- **Neo4j MCP Servers**: Connect via Bolt from Miranda for AI agent access
- **Neo4j MCP Servers**: Connect via Bolt from Miranda for AI agent access (Ariel only)
- **Mnemosyne**: Connects via Bolt to Umbriel; does not touch Ariel
## Terraform Resources
### Host Definition
### Host Definitions
The service runs on `ariel`, defined in `terraform/containers.tf`:
Both hosts are defined in `terraform/containers.tf`:
| Attribute | Value |
|-----------|-------|
| Image | noble |
| Role | graph_database |
| Security Nesting | true |
| AppArmor | unconfined |
| Description | Neo4j Host - Ethereal graph connections |
| Attribute | ariel | umbriel |
|-----------|-------|---------|
| Image | noble | noble |
| Role | graph_database | graph_database |
| Security Nesting | true | true |
| AppArmor | unconfined | unconfined |
| Description | Neo4j Host - Ethereal graph connections | Neo4j Host (Mnemosyne) - Dusky sprite keeping the memory graph |
### Proxy Devices
| Device Name | Listen | Connect |
|-------------|--------|---------|
| neo4j_ports | tcp:0.0.0.0:25554 | tcp:127.0.0.1:25554 |
| Host | Device Name | Listen | Connect |
|------|-------------|--------|---------|
| ariel | neo4j_ports | tcp:0.0.0.0:25554 | tcp:127.0.0.1:25554 |
| umbriel | neo4j_ports | tcp:0.0.0.0:25555 | tcp:127.0.0.1:25555 |
> Bolt (7687) is not in the Incus proxy device list for either host — it is
> reached directly over the internal `10.10.0.0/24` network by DNS name
> (`ariel.incus:7687`, `umbriel.incus:7687`).
### Dependencies
@@ -69,9 +96,10 @@ ansible-playbook neo4j/deploy.yml
| File | Purpose |
|------|---------|
| `neo4j/deploy.yml` | Main deployment playbook |
| `neo4j/deploy.yml` | Main deployment playbook (runs on both hosts via service detection) |
| `neo4j/docker-compose.yml.j2` | Docker Compose template |
| `alloy/ariel/config.alloy.j2` | Alloy log collection config |
| `alloy/ariel/config.alloy.j2` | Alloy log collection config — Ariel |
| `alloy/umbriel/config.alloy.j2` | Alloy log collection config — Umbriel |
### Deployment Steps
@@ -83,7 +111,28 @@ ansible-playbook neo4j/deploy.yml
## Configuration
### Host Variables (`host_vars/ariel.incus.yml`)
### Host Variables
Both hosts define the same variable set, differing only in port, syslog port,
and vault reference.
`host_vars/ariel.incus.yml`:
| Variable | Value |
|----------|-------|
| `neo4j_auth_password` | `{{ vault_neo4j_auth_password }}` |
| `neo4j_http_port` | `25554` |
| `neo4j_syslog_port` | `22011` |
`host_vars/umbriel.incus.yml`:
| Variable | Value |
|----------|-------|
| `neo4j_auth_password` | `{{ vault_mnemosyne_neo4j_auth_password }}` |
| `neo4j_http_port` | `25555` |
| `neo4j_syslog_port` | `22012` |
Shared variables on both hosts:
| Variable | Description | Default |
|----------|-------------|---------|
@@ -92,17 +141,15 @@ ansible-playbook neo4j/deploy.yml
| `neo4j_group` | System group | `neo4j` |
| `neo4j_directory` | Installation directory | `/srv/neo4j` |
| `neo4j_auth_user` | Database admin username | `neo4j` |
| `neo4j_auth_password` | Database admin password | `{{ vault_neo4j_auth_password }}` |
| `neo4j_http_port` | HTTP browser port | `25554` |
| `neo4j_bolt_port` | Bolt protocol port | `7687` |
| `neo4j_syslog_port` | Local syslog port for Alloy | `22011` |
| `neo4j_apoc_unrestricted` | APOC procedures allowed | `apoc.*` |
### Vault Variables (`group_vars/all/vault.yml`)
| Variable | Description |
|----------|-------------|
| `vault_neo4j_auth_password` | Neo4j admin password |
| `vault_neo4j_auth_password` | Neo4j admin password (Ariel) |
| `vault_mnemosyne_neo4j_auth_password` | Neo4j admin password (Umbriel — dedicated Mnemosyne instance) |
### APOC Plugin Configuration
@@ -128,19 +175,20 @@ The APOC (Awesome Procedures on Cypher) plugin is enabled with the following set
### Alloy Configuration
**File:** `ansible/alloy/ariel/config.alloy.j2`
**Files:** `ansible/alloy/ariel/config.alloy.j2`, `ansible/alloy/umbriel/config.alloy.j2`
Alloy on Ariel collects:
Alloy on each host collects:
- System logs (`/var/log/syslog`, `/var/log/auth.log`)
- Systemd journal
- Neo4j Docker container logs via syslog
- Neo4j Docker container logs via syslog (Ariel: tcp:127.0.0.1:22011; Umbriel: tcp:127.0.0.1:22012)
### Loki Logs
| Log Source | Labels |
|------------|--------|
| Neo4j container | `{job="neo4j", hostname="ariel.incus"}` |
| System logs | `{job="syslog", hostname="ariel.incus"}` |
| Neo4j container (Ariel) | `{job="neo4j", hostname="ariel.incus"}` |
| Neo4j container (Umbriel) | `{job="neo4j", hostname="umbriel.incus"}` |
| System logs | `{job="syslog", hostname="ariel.incus"}` / `{job="syslog", hostname="umbriel.incus"}` |
### Prometheus Metrics
@@ -153,7 +201,8 @@ Host-level metrics collected via Alloy's Unix exporter:
### Log Collection Flow
```
Neo4j Container → Syslog (tcp:127.0.0.1:22011) → Alloy → Loki (Prospero)
Neo4j Container (Ariel) → Syslog (tcp:127.0.0.1:22011) → Alloy → Loki (Prospero)
Neo4j Container (Umbriel) → Syslog (tcp:127.0.0.1:22012) → Alloy → Loki (Prospero)
```
## Operations

View File

@@ -58,7 +58,7 @@
<div class="col-lg-8">
<h1 class="display-4 fw-bold"><i class="bi bi-diagram-3-fill"></i> Ouranos Lab</h1>
<p class="lead">Red Panda Approved™ Infrastructure as Code</p>
<p class="mb-0">10 Incus containers named after moons of Uranus, provisioned with Terraform and configured with Ansible. Accessible at <a href="https://ouranos.helu.ca" class="text-white fw-bold">ouranos.helu.ca</a></p>
<p class="mb-0">11 Incus containers named after moons of Uranus, provisioned with Terraform and configured with Ansible. Accessible at <a href="https://ouranos.helu.ca" class="text-white fw-bold">ouranos.helu.ca</a></p>
</div>
<div class="col-lg-4 text-center mt-3 mt-lg-0">
<div class="badge bg-success fs-6 p-3">
@@ -87,7 +87,7 @@
<div class="card-body">
<p class="card-text">Provisions the Uranian host containers with:</p>
<ul class="mb-0">
<li>10 specialised Incus containers (LXC)</li>
<li>11 specialised Incus containers (LXC)</li>
<li>DNS-resolved networking (<code>.incus</code> domain)</li>
<li>Security policies and nested Docker support</li>
<li>Port proxy devices and resource dependencies</li>
@@ -106,7 +106,7 @@
<p class="card-text">Deploys and configures all services:</p>
<ul class="mb-0">
<li>Docker engine on nested-capable hosts</li>
<li>Databases: PostgreSQL (Portia), Neo4j (Ariel)</li>
<li>Databases: PostgreSQL (Portia), Neo4j (Ariel — shared; Umbriel — dedicated Mnemosyne instance)</li>
<li>Observability: Prometheus, Loki, Grafana (Prospero)</li>
<li>Application runtimes and LLM proxies</li>
<li>HAProxy TLS termination and Casdoor SSO (Titania)</li>
@@ -198,6 +198,12 @@
<td>HAProxy, Casdoor SSO, certbot</td>
<td class="text-center"><i class="bi bi-check-circle-fill text-success"></i></td>
</tr>
<tr>
<td><strong>umbriel</strong></td>
<td><span class="badge bg-warning text-dark">graph_database</span></td>
<td>Neo4j 5.26.0 (dedicated Mnemosyne instance)</td>
<td class="text-center"><i class="bi bi-check-circle-fill text-success"></i></td>
</tr>
</tbody>
</table>
</div>
@@ -250,8 +256,26 @@
<p class="text-muted fst-italic small">Air spirit — ethereal, interconnected nature mirroring graph relationships.</p>
<ul class="mb-0">
<li>Neo4j 5.26.0 (Docker)</li>
<li>HTTP API: port 25554</li>
<li>Bolt: port 7687</li>
<li>HTTP Browser: port 25554</li>
<li>Bolt: port 7687 (reached as <code>ariel.incus:7687</code>)</li>
<li>Shared graph work — Neo4j MCP, exploration</li>
</ul>
</div>
</div>
</div>
<div class="col-lg-6">
<div class="card h-100 border-warning">
<div class="card-header bg-warning text-dark">
<h5 class="mb-0"><i class="bi bi-diagram-2 me-2"></i>umbriel — Graph Database (Mnemosyne)</h5>
</div>
<div class="card-body">
<p class="text-muted fst-italic small">Dusky melancholy sprite from Pope's <em>Rape of the Lock</em> — keeper of the Cave of Spleen, naturally paired with Mnemosyne the Titan of memory.</p>
<ul class="mb-0">
<li>Neo4j 5.26.0 (Docker)</li>
<li>HTTP Browser: port 25555</li>
<li>Bolt: port 7687 (reached as <code>umbriel.incus:7687</code>)</li>
<li>Dedicated to <strong>Mnemosyne</strong> — owns <code>Library</code>/<code>Collection</code>/<code>Item</code>/<code>Chunk</code>/<code>Concept</code> labels, vector index, and schema migrations</li>
</ul>
</div>
</div>
@@ -563,7 +587,7 @@ ansible-vault encrypt new_secrets.yml</code></pre>
<tr><td><code>pplg/deploy.yml</code></td><td>Prospero</td><td>Full observability stack + internal HAProxy + OAuth2-Proxy</td></tr>
<tr><td><code>postgresql/deploy.yml</code></td><td>Portia</td><td>PostgreSQL with all databases</td></tr>
<tr><td><code>postgresql_ssl/deploy.yml</code></td><td>Titania</td><td>Dedicated PostgreSQL for Casdoor</td></tr>
<tr><td><code>neo4j/deploy.yml</code></td><td>Ariel</td><td>Neo4j graph database</td></tr>
<tr><td><code>neo4j/deploy.yml</code></td><td>Ariel, Umbriel</td><td>Neo4j graph database (Umbriel is the dedicated Mnemosyne instance)</td></tr>
<tr><td><code>searxng/deploy.yml</code></td><td>Oberon</td><td>SearXNG privacy search</td></tr>
<tr><td><code>haproxy/deploy.yml</code></td><td>Titania</td><td>HAProxy TLS termination and routing</td></tr>
<tr><td><code>casdoor/deploy.yml</code></td><td>Titania</td><td>Casdoor SSO</td></tr>
@@ -713,6 +737,7 @@ flowchart LR
<tr><td>All LLM apps</td><td>Arke (Sycorax)</td><td><code>http://sycorax.incus:25540</code></td></tr>
<tr><td>Open WebUI, Arke, Gitea, Nextcloud, LobeChat</td><td>PostgreSQL (Portia)</td><td><code>portia.incus:5432</code></td></tr>
<tr><td>Neo4j MCP</td><td>Neo4j (Ariel)</td><td><code>ariel.incus:7687</code> (Bolt)</td></tr>
<tr><td>Mnemosyne</td><td>Neo4j (Umbriel)</td><td><code>umbriel.incus:7687</code> (Bolt) — dedicated tenant</td></tr>
<tr><td>MCP Switchboard</td><td>Docker API (Miranda)</td><td><code>tcp://miranda.incus:2375</code></td></tr>
<tr><td>MCP Switchboard, Kairos, Spelunker</td><td>RabbitMQ (Oberon)</td><td><code>oberon.incus:5672</code></td></tr>
<tr><td>All apps (SMTP)</td><td>smtp4dev (Oberon)</td><td><code>oberon.incus:22025</code></td></tr>

View File

@@ -13,7 +13,61 @@ Infrastructure-as-Code project managing the **Ouranos Lab** — a development sa
> **DNS Domain**: Incus resolves containers via the `.incus` domain suffix (e.g., `oberon.incus`, `portia.incus`). IPv4 addresses are dynamically assigned — always use DNS names, never hardcode IPs.
---
## Project Numbers
- External Apps
- Well known: PostgreSQL, SSH, web, Prometheus
- 220: External Apps (legacy)
- 290: External App 1
- 299: External App 9
- Django Projects:
- 221: Zelus
- 222: Angelia
- 224: Athena
- 225: Kairos
- 226: Icarlos
- 227: MCP Switchboard
- 228: Spelunker
- 229: Peitho
- 230: Mnemosyne
- FastAgent Projects:
- 240: Pallas Iolaus
- 241: Pallas Kottos
- 242: Pallas Mentor
- FastAPI Projects:
- 200: Daedalus
- 201: Arke
- 202: Kernos
- 203: Rommie
- 204: Orpheus
- 205: Periplus
- 206: Nike
- 207: Stentor
- 208: Argos
- 209: Hecate
- 210: Rhema
- 211: Synesis
## Port Numbering
Well-known ports running as a service may be used: Postgresql 5432, Prometheus Metrics 9100.
However inside a docker project, the number plan needs to be followed to avoid port conflicts and confusion:
XXXYZ
XXX Project Number or 290-299 for external project (host specific)
Y Service: 0 reserved, 1-4 flexible, 5 database, 6 MCP, 7 API, 8 Web App, 9 Prometheus metrics
Z Instance: The running instance of this app on the same host, starting at 1. May also be used to handle exceptions.
255 Incus port forwarding: Ports in this range are forwarded from the Incus host to Incus containers (defined in Terraform). HTTP/HTTPS traffic is not forwarded this way; it is routed through HAProxy on Titania.
| Range | Host | Purpose |
|-------|------|---------|
| 25510–25519 | caliban | 25512→22 SSH, 25515→5432 Postgres, 25516→8006 web, 25517→8007 web, 25518→8008 web, 25519→3389 RDP |
| 25530–25539 | miranda | MCP containers |
| 25540–25544 | sycorax | Arke LLM proxy |
| 25554 | ariel | Neo4j |
| 25555 | umbriel | Neo4j (Mnemosyne) |
| 25560–25569 | miranda | MCPO ports |
| 25570–25589 | puck | 25570–25588 app ports, 25589→3389 RDP |
| 25590–25599 | oberon | App ports |
514ZZ is the syslog port. Docker containers send their syslog to an Alloy syslog collector port. ZZ is the application instance; instances only need to be unique on the same host and increment from 01.
## Uranian Host Architecture
@@ -31,6 +85,7 @@ All containers are named after moons of Uranus and resolved via the `.incus` DNS
| **rosalind** | collaboration | Gitea, LobeChat, Nextcloud, AnythingLLM | ✔ |
| **sycorax** | language_models | Arke LLM Proxy | ✔ |
| **titania** | proxy_sso | HAProxy TLS termination + Casdoor SSO | ✔ |
| **umbriel** | graph_database | Neo4j (Mnemosyne) — dedicated memory graph | ✔ |
### puck — Project Application Runtime
@@ -39,12 +94,6 @@ This is the host that runs Python projects in the Ouranos sandbox.
It has an RDP server and is generally where application development happens.
Each project has a number that is used to determine port numbers.
- Docker engine
- JupyterLab (port 22071 via OAuth2-Proxy)
- Gitea Runner (CI/CD agent)
- Django Projects: Zelus (221), Angelia (222), Athena (224), Kairos (225), Icarlos (226), MCP Switchboard (227), Spelunker (228), Peitho (229), Mnemosyne (230)
- FastAgent Projects: Pallas (240)
- FastAPI Projects: Daedalus (200), Arke (201) Kernos (202), Rommie (203), Orpheus (204), Periplus (205), Nike (206), Stentor (207)
### caliban — Agent Automation
@@ -52,46 +101,52 @@ Autonomous computer agent learning through environmental interaction.
- Docker engine
- Agent S MCP Server (MATE desktop, AT-SPI automation)
- Kernos MCP Shell Server (port 22062)
- Rommie MCP Server (port 22061) — agent-to-agent GUI automation via Agent S
- FreeCAD Robust MCP Server (port 22063) — CAD automation via FreeCAD XML-RPC
- Kernos MCP Shell Server
- Rommie MCP Server — agent-to-agent GUI automation via Agent S
- FreeCAD Robust MCP Server — CAD automation via FreeCAD XML-RPC
- GPU passthrough
- RDP access (port 25521)
- RDP access
### oberon — Container Orchestration & Dockerized Shared Services
King of the Fairies orchestrating containers and managing MCP infrastructure.
- Docker engine
- MCP Switchboard (port 22781) — Django app routing MCP tool calls
- RabbitMQ message queue
- smtp4dev SMTP test server (port 22025)
- smtp4dev SMTP test server
### portia — Relational Database
Intelligent and resourceful — the reliability of relational databases.
- PostgreSQL 17 (port 5432)
- Databases: `arke`, `anythingllm`, `gitea`, `hass`, `lobechat`, `mcp_switchboard`, `nextcloud`, `openwebui`, `periplus`, `spelunker`
- Databases: `arke`, `anythingllm`, `gitea`, `hass`, `lobechat`, `mcp_switchboard`, `mnemosyne`, `nextcloud`, `openwebui`, `periplus`, `spelunker`
### ariel — Graph Database
Air spirit — ethereal, interconnected nature mirroring graph relationships.
- Neo4j (Docker)
- Neo4j 5.26.0 (Docker)
- HTTP API: port 25554
- Bolt: port 7687
### umbriel — Graph Database (Mnemosyne)
Dusky melancholy sprite from Pope's *Rape of the Lock* — keeper of the Cave of
Spleen, naturally paired with Mnemosyne the Titan of memory. Dedicated Neo4j
instance so Mnemosyne's `Library`/`Collection`/`Item`/`Chunk`/`Concept` labels,
vector indexes, and schema migrations can't collide with another tenant's
graph on Ariel.
- Neo4j (Docker)
### miranda — MCP Docker Host
Curious bridge between worlds — hosting MCP server containers.
- Docker engine (API exposed on port 2375 for MCP Switchboard)
- MCPO OpenAI-compatible MCP proxy 22071
- Argos MCP Server — web search via SearXNG (port 22062)
- Grafana MCP Server (port 22063)
- Neo4j MCP Server (port 22064)
- Gitea MCP Server (port 22065)
- Docker engine
- MCPO OpenAI-compatible MCP proxy
- Argos MCP Server — web search via SearXNG
- Grafana MCP Server
- Neo4j MCP Server
- Gitea MCP Server
### prospero — Observability Stack
@@ -108,11 +163,10 @@ Master magician observing all events.
Witty and resourceful moon for PHP, Go, and Node.js runtimes.
- SearXNG privacy search (port 22083, behind OAuth2-Proxy)
- Gitea self-hosted Git (port 22082, SSH on 22022)
- LobeChat AI chat interface (port 22081)
- Nextcloud file sharing and collaboration (port 22083)
- AnythingLLM document AI workspace (port 22084)
- SearXNG privacy search
- Gitea self-hosted Git
- Nextcloud file sharing and collaboration
- Jellyfin media server (port 22086, NVIDIA transcoding, Casdoor SSO)
- Nextcloud data on dedicated Incus storage volume
- Open WebUI LLM interface (port 22088, PostgreSQL backend on Portia)
- Home Assistant (port 8123)
@@ -121,7 +175,7 @@ Witty and resourceful moon for PHP, Go, and Node.js runtimes.
Original magical power wielding language magic.
- Arke LLM API Proxy (port 25540)
- Arke LLM API Proxy
- Multi-provider support (OpenAI, Anthropic, etc.)
- Session management with Memcached
- Database backend on Portia
@@ -130,7 +184,7 @@ Original magical power wielding language magic.
Queen of the Fairies managing access control and authentication.
- HAProxy 3.x with TLS termination (port 443)
- HAProxy 3.x with TLS termination
- Let's Encrypt wildcard certificate via certbot DNS-01 (Namecheap)
- HTTP to HTTPS redirect (port 80)
- Gitea SSH proxy (port 22022)
@@ -139,21 +193,6 @@ Queen of the Fairies managing access control and authentication.
---
## Port Numbering
Well-known ports running as a service may be used: Postgresql 5432, Prometheus Metrics 9100.
However inside a docker project, the number plan needs to be followed to avoid port conflicts and confusion:
XXXYZ
XXX Project Number or 220 for external project
Y Service: 0 reserved, 1-4 flexible, 5 database, 6 MCP, 7 API, 8 Web App, 9 Prometheus metrics
Z Instance: The running instance of this app on the same host, starting at 1. May also be used to handle exceptions.
255 Incus port forwarding: Ports in ths range are forwarded from the Incus host to Incus containers (defined in Terraform)
514ZZ is the syslog port. Docker containers send their syslog to an Alloy syslog collector port. ZZ is the application instance, they just need to be different on the same host and increment from 01.
---
## Application Conventions
@@ -242,34 +281,9 @@ Titania provides TLS termination and reverse proxy for all services.
- **HTTP**: port 80 (redirects to HTTPS)
- **Certificate**: Let's Encrypt wildcard via certbot DNS-01
### Route Table
### Subdomains
| Subdomain | Backend | Service |
|-----------|---------|---------|
| `ouranos.helu.ca` (root) | puck.incus:22281 | Angelia (Django) |
| `alertmanager.ouranos.helu.ca` | prospero.incus:443 (SSL) | AlertManager |
| `angelia.ouranos.helu.ca` | puck.incus:22281 | Angelia (Django) |
| `anythingllm.ouranos.helu.ca` | rosalind.incus:22084 | AnythingLLM |
| `arke.ouranos.helu.ca` | sycorax.incus:25540 | Arke LLM Proxy |
| `athena.ouranos.helu.ca` | puck.incus:22481 | Athena (Django) |
| `gitea.ouranos.helu.ca` | rosalind.incus:22082 | Gitea |
| `grafana.ouranos.helu.ca` | prospero.incus:443 (SSL) | Grafana |
| `hass.ouranos.helu.ca` | oberon.incus:8123 | Home Assistant |
| `id.ouranos.helu.ca` | titania.incus:22081 | Casdoor SSO |
| `icarlos.ouranos.helu.ca` | puck.incus:22681 | Icarlos (Django) |
| `jupyterlab.ouranos.helu.ca` | puck.incus:22071 | JupyterLab (OAuth2-Proxy) |
| `kairos.ouranos.helu.ca` | puck.incus:22581 | Kairos (Django) |
| `lobechat.ouranos.helu.ca` | rosalind.incus:22081 | LobeChat |
| `loki.ouranos.helu.ca` | prospero.incus:443 (SSL) | Loki |
| `mcp-switchboard.ouranos.helu.ca` | oberon.incus:22781 | MCP Switchboard |
| `nextcloud.ouranos.helu.ca` | rosalind.incus:22083 | Nextcloud |
| `openwebui.ouranos.helu.ca` | oberon.incus:22088 | Open WebUI |
| `peitho.ouranos.helu.ca` | puck.incus:22981 | Peitho (Django) |
| `pgadmin.ouranos.helu.ca` | prospero.incus:443 (SSL) | PgAdmin 4 |
| `prometheus.ouranos.helu.ca` | prospero.incus:443 (SSL) | Prometheus |
| `searxng.ouranos.helu.ca` | oberon.incus:22073 | SearXNG (OAuth2-Proxy) |
| `smtp4dev.ouranos.helu.ca` | oberon.incus:22085 | smtp4dev |
| `spelunker.ouranos.helu.ca` | puck.incus:22881 | Spelunker (Django) |
Refer to the Ansible Titania host inventory (`inventory/host_vars/titania.incus.yml`) for current backend routing configuration.
---
@@ -296,6 +310,35 @@ ansible-playbook site.yml
ansible-playbook sandbox_down.yml
```
### Python Virtual Environment Setup
The Ansible automation requires a Python virtual environment with the `ansible` package installed. Create and activate the environment from the `~` directory:
```bash
# Create virtual environment
cd ~
python3 -m venv env/ouranos
# Activate environment
source ~/env/ouranos/bin/activate
# Install Ansible
pip install ansible
pip install ansible-core
ansible-galaxy collection install community.postgresql
```
### Ansible Playbook Syntax Check
Before running playbooks, use the `apsc.sh` utility (in PATH) to quickly validate YAML syntax:
```bash
# From the ansible directory
apsc.sh
# This will check all YAML files in the current directory for syntax errors
```
### Terraform Workflow
1. **Define** — Containers, networks, and resources in `*.tf` files
@@ -343,6 +386,7 @@ terraform import 'incus_instance.uranian_hosts["prospero"]' ouranos/prospero,ima
terraform import 'incus_instance.uranian_hosts["rosalind"]' ouranos/rosalind,image=75cde3e755b0e657c05f67e03a42683217b233b0339448be747845747df58644
terraform import 'incus_instance.uranian_hosts["sycorax"]' ouranos/sycorax,image=75cde3e755b0e657c05f67e03a42683217b233b0339448be747845747df58644
terraform import 'incus_instance.uranian_hosts["titania"]' ouranos/titania,image=75cde3e755b0e657c05f67e03a42683217b233b0339448be747845747df58644
terraform import 'incus_instance.uranian_hosts["umbriel"]' ouranos/umbriel,image=75cde3e755b0e657c05f67e03a42683217b233b0339448be747845747df58644
# Containers using questing image
terraform import 'incus_instance.uranian_hosts["caliban"]' ouranos/caliban,image=e78dd4a406b7fa3592ed0a6048862260b3d2e50c76e32a6169930245c0a13fdf
@@ -405,13 +449,45 @@ ansible-vault encrypt new_secrets.yml
Terraform provisions Incus S3 buckets for services requiring object storage:
| Service | Host | Purpose |
|---------|------|---------|
| **Casdoor** | Titania | User avatars and SSO resource storage |
| **LobeChat** | Rosalind | File uploads and attachments |
| Name | Description |
|---------------------|----------------------------------|
| `casdoor` | Casdoor file storage bucket |
| `daedalus` | Daedalus file storage bucket |
| `lobechat` | Lobechat file storage bucket |
| `mnemosyne-content` | Mnemosyne content storage bucket |
| `spelunker` | Spelunker file storage bucket |
> S3 credentials (access key, secret key, endpoint) are stored as sensitive Terraform outputs and managed in Ansible Vault with the `vault_*_s3_*` prefix.
### Retrieving S3 Bucket Credentials
The bucket credentials are declared as **sensitive** Terraform outputs, so a plain
`terraform output` will mask them. Use the `-json` flag to reveal the values
(`-raw` only works for single string, number, or boolean outputs, not for these objects):
```bash
cd terraform
# List all outputs (sensitive values shown as <sensitive>)
terraform output
# Show a specific bucket's credentials as JSON
terraform output -json casdoor_s3_credentials
terraform output -json daedalus_s3_credentials
terraform output -json lobechat_s3_credentials
terraform output -json mnemosyne_s3_credentials
terraform output -json spelunker_s3_credentials
# Extract a single field (e.g. access_key) with jq
terraform output -json casdoor_s3_credentials | jq -r .access_key
terraform output -json casdoor_s3_credentials | jq -r .secret_key
terraform output -json casdoor_s3_credentials | jq -r .endpoint
```
Each `*_s3_credentials` output contains `bucket`, `access_key`, `secret_key`, and
`endpoint`. Copy these into `inventory/group_vars/all/vault.yml` as
`vault_<service>_s3_access_key`, `vault_<service>_s3_secret_key`, etc.
---
## Ansible Automation
@@ -430,7 +506,7 @@ Playbooks run in dependency order:
| `pplg/deploy.yml` | Prospero | Full observability stack + HAProxy + OAuth2-Proxy |
| `postgresql/deploy.yml` | Portia | PostgreSQL with all databases |
| `postgresql_ssl/deploy.yml` | Titania | Dedicated PostgreSQL for Casdoor |
| `neo4j/deploy.yml` | Ariel | Neo4j graph database |
| `neo4j/deploy.yml` | Ariel, Umbriel | Neo4j graph database (Umbriel is the dedicated Mnemosyne instance) |
| `searxng/deploy.yml` | Oberon | SearXNG privacy search |
| `haproxy/deploy.yml` | Titania | HAProxy TLS termination and routing |
| `casdoor/deploy.yml` | Titania | Casdoor SSO |
@@ -454,6 +530,7 @@ Services with standalone deploy playbooks (not in `site.yml`):
| `gitea_mcp/deploy.yml` | Miranda | Gitea MCP Server |
| `gitea_runner/deploy.yml` | Puck | Gitea CI/CD runner |
| `grafana_mcp/deploy.yml` | Miranda | Grafana MCP Server |
| `jellyfin/deploy.yml` | Rosalind | Jellyfin media server |
| `jupyterlab/deploy.yml` | Puck | JupyterLab + OAuth2-Proxy |
| `kernos/deploy.yml` | Caliban | Kernos MCP shell server |
| `lobechat/deploy.yml` | Rosalind | LobeChat AI chat |
@@ -490,6 +567,7 @@ collect metrics & logs storage & visualisation notifications
| All LLM apps | Arke (Sycorax) | `http://sycorax.incus:25540` |
| Open WebUI, Arke, Gitea, Nextcloud, LobeChat | PostgreSQL (Portia) | `portia.incus:5432` |
| Neo4j MCP | Neo4j (Ariel) | `ariel.incus:7687` (Bolt) |
| Mnemosyne | Neo4j (Umbriel) | `umbriel.incus:7687` (Bolt) — dedicated tenant |
| MCP Switchboard | Docker API (Miranda) | `tcp://miranda.incus:2375` |
| MCP Switchboard | RabbitMQ (Oberon) | `oberon.incus:5672` |
| Kairos, Spelunker | RabbitMQ (Oberon) | `oberon.incus:5672` |

View File

@@ -184,7 +184,8 @@ vault_grafana_oauth_client_secret: "YourGrafanaOAuthSecret"
#### 7. PgAdmin Setup
Just do it manually:
cmd: /usr/pgadmin4/venv/bin/python3 /usr/pgadmin4/web/setup.py setup-db
sudo -u pgadmin /usr/pgadmin4/venv/bin/python3 /usr/pgadmin4/web/setup.py setup-db
**Requirements:**
- **Purpose**: Initial local admin account (fallback when OAuth is unavailable)
@@ -483,17 +484,35 @@ vault_casdoor_prometheus_access_key: "your-casdoor-access-key"
vault_casdoor_prometheus_access_secret: "your-casdoor-access-secret"
```
#### Certificate fetch fails
#### TLS cert expired / not renewing on `*.ouranos.helu.ca`
**Cause**: Titania not running or certbot hasn't provisioned the cert yet.
TLS for all PPLG subdomains is terminated by **Titania's native HAProxy** using
the Let's Encrypt wildcard cert managed by certbot on Titania (see
[certbot DNS-01 with Namecheap](cerbot.md)). PPLG itself holds no cert.
**Fix**: Ensure Titania is up and certbot has run:
**Most likely cause**: certbot renewed the lineage but the deploy hook failed to
install the new cert into HAProxy's served PEM (`/etc/haproxy/certs/ouranos.pem`),
so HAProxy keeps serving the old file until it expires. Certbot reports such hook
failures only as a WARNING, so the renewal looks successful.
**Diagnose** (on Titania):
```bash
ansible-playbook sandbox_up.yml
ansible-playbook certbot/deploy.yml
# Does the served file match the certbot lineage?
sudo openssl x509 -enddate -noout -in /etc/haproxy/certs/ouranos.pem
sudo openssl x509 -enddate -noout \
-in /srv/certbot/config/live/wildcard.ouranos.helu.ca/fullchain.pem
# Look for a failing hook
sudo grep -iE 'hook|Permission denied|reload failed|STALE' /srv/certbot/logs/letsencrypt.log*
```
The playbook falls back to a self-signed certificate if Titania is unavailable.
**Fix**: re-run the playbooks (in this order) and force a renewal to reinstall:
```bash
ansible-playbook haproxy/deploy.yml --limit titania.incus
ansible-playbook certbot/deploy.yml --limit titania.incus
```
See the certbot doc's [permission model](cerbot.md#permission-model-why-renewals-can-silently-fail)
for the `certbot`-user permissions the hook depends on.
#### OAuth2 redirect loops

View File

@@ -44,7 +44,7 @@ The playbook imports `agent_s/deploy.yml` first to ensure the MATE desktop and A
4. Installs Rommie into the venv in editable mode (`pip install -e`)
5. Deploys `~/rommie/.env` from the template
6. Deploys and enables the `rommie.service` systemd unit
7. Health-checks `http://localhost:<rommie_port>/mcp` (retries 5×, 3 s apart)
7. Health-checks `http://localhost:<rommie_port>/mcp` (retries 5×, 3 s apart, accepts 200/406)
## MCP Tools
@@ -64,7 +64,7 @@ External Agent (e.g., Claude Desktop / MCP Switchboard)
│ https://rommie.ouranos.helu.ca/mcp
Titania HAProxy (TLS termination, wildcard cert)
│ http://caliban.incus:22031/mcp
│ http://caliban.incus:20361/mcp
Rommie MCP Server
(serialized task execution, multi-client reads)
@@ -80,15 +80,15 @@ External Agent (e.g., Claude Desktop / MCP Switchboard)
| Variable | Default | Description |
|----------|---------|-------------|
| `rommie_port` | `22031` | HTTP listen port |
| `rommie_port` | `20361` | HTTP listen port |
| `rommie_host` | `0.0.0.0` | Bind address |
| `rommie_display` | `:10` | X11 display for Agent S (XRDP assigns `:10` by default) |
| `rommie_allowed_hosts` | `caliban.incus` | Allowed Host header values |
| `rommie_model` | `Qwen3-VL-30B-A3B-Instruct-UD-Q5_K_XL.gguf` | Primary vision-language model |
| `rommie_model_url` | `http://nyx.helu.ca:22078` | Inference endpoint for the primary model |
| `rommie_model` | `Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf` | Primary vision-language model |
| `rommie_model_url` | `http://nyx.helu.ca:22072` | Inference endpoint for the primary model |
| `rommie_provider` | `openai` | API provider for the primary model |
| `rommie_ground_provider` | `huggingface` | API provider for the grounding model |
| `rommie_ground_url` | `http://pan.helu.ca:22078` | Inference endpoint for the grounding model |
| `rommie_ground_url` | `http://pan.helu.ca:22076` | Inference endpoint for the grounding model |
| `rommie_ground_model` | `UI-TARS-7B-DPO-Q6_K_L.gguf` | Grounding model (UI element localisation) |
| `rommie_grounding_width` | `1024` | Screenshot width passed to the grounding model |
| `rommie_grounding_height` | `1024` | Screenshot height passed to the grounding model |
@@ -136,7 +136,7 @@ The unit runs as `principal_user` (`robert`) and loads environment from `~/rommi
### Health check fails
The playbook probes `http://localhost:22031/mcp` after starting the service. If it times out:
The playbook probes `http://localhost:20361/mcp` after starting the service. If it times out:
1. Check the service started: `systemctl status rommie`
2. Confirm the `DISPLAY` variable resolves — XRDP must have created the `:10` display before Rommie starts

284
docs/searxng.md Normal file
View File

@@ -0,0 +1,284 @@
# SearXNG
## Overview
SearXNG is a privacy-respecting metasearch engine that aggregates results from
multiple upstream search providers and re-ranks them. The Ouranos deployment runs
as a single Docker container behind an authenticating OAuth2-Proxy sidecar (see
[`searxng-auth.md`](./searxng-auth.md) for the auth design).
**Host:** `rosalind.incus`
**Container port:** 22089 (host) → 8080 (container)
**Public URL:** `https://searxng.ouranos.helu.ca/` (via HAProxy → OAuth2-Proxy → SearXNG)
**Internal URL:** `http://rosalind.incus:22089/` (used by LobeChat, Argos, etc.)
## Ansible Deployment
### Layout
```
ansible/searxng/
├── deploy.yml # Main deployment playbook
├── deploy_oauth2.yml # OAuth2-Proxy sidecar playbook
├── docker-compose.yml.j2 # Docker Compose template
├── searxng-settings.yml.j2 # SearXNG settings.yml template
├── oauth2-proxy-searxng.cfg.j2 # OAuth2-Proxy config (see searxng-auth.md)
└── oauth2-proxy-searxng.service.j2 # Systemd unit for the sidecar
```
### Run
```bash
cd ansible
ansible-playbook searxng/deploy.yml --limit rosalind.incus
ansible-playbook searxng/deploy_oauth2.yml --limit rosalind.incus
```
`deploy.yml`:
1. Skips hosts that don't list `searxng` in their `services` list.
2. Creates the `searxng` system user and `/srv/searxng` directory.
3. Templates `docker-compose.yml` and `searxng-settings.yml` into `/srv/searxng/`.
4. Brings up the container with `community.docker.docker_compose_v2` (`pull: always`).
The container mounts `searxng-settings.yml` read-only at
`/etc/searxng/settings.yml`. There is no persistent volume — the cache lives in
the container's `/tmp` and is rebuilt on restart.
### Variables
#### Host Variables (`inventory/host_vars/rosalind.incus.yml`)
| Variable | Value | Purpose |
|--------------------------|----------------------------------|----------------------------------|
| `searxng_port` | `22089` | Host-side container port |
| `searxng_base_url` | `http://rosalind.incus:22089/` | Used by SearXNG to build URLs |
| `searxng_instance_name` | `Ouranos Search` | Shown in the UI header |
| `searxng_directory` | `/srv/searxng` | Compose project dir on the host |
| `searxng_user`/`group` | `searxng` | Owns templated config files |
| `searxng_syslog_port` | `51403` | Alloy syslog receiver port |
#### Vault Variables (`group_vars/all/vault.yml`)
| Variable | Purpose |
|--------------------------------|------------------------------------------------------------|
| `vault_searxng_secret_key` | `server.secret_key` — also used as cache DB password |
| `vault_searxng_brave_api_key` | Brave Search API subscription token (see below) |
| `vault_searxng_oauth_*` | OAuth2-Proxy sidecar — see `searxng-auth.md` |
> ⚠️ **Changing `vault_searxng_secret_key` truncates the cache.** SearXNG hashes
> cache keys with the secret key; on mismatch it drops every cache table on next
> startup. Harmless, but be aware that engines like `wikidata` and
> `radio_browser` will need to re-fetch their on-disk indexes.
## Search Engine Configuration
The engine list is templated in `searxng-settings.yml.j2` and merges with the
upstream defaults via `use_default_settings: true`. The merge is keyed by engine
`name` and is shallow — **only fields you explicitly set override the
defaults**, everything else (including hidden ones like `inactive`) is inherited.
### Enabled engines
| Engine | Notes |
|--------------|----------------------------------------------------|
| `duckduckgo` | General web |
| `startpage` | General web |
| `mojeek` | General web |
| `braveapi` | Brave Search via official REST API (see below) |
### Disabled engines
| Engine | Reason |
|--------------------------------|------------------------------------------------------------|
| `google` | Aggressive bot detection / unstable scraping results |
| `bing news` | Frequent parsing errors |
| `brave` (HTML scraper) | Replaced by `braveapi` — keeping both duplicates results |
| `brave.images` / `.videos` / `.news` | Scraping endpoints return 451 / access-denied |
| `duckduckgo images` | Suspended / access-denied responses |
| `pexels`, `vimeo` | Same — suspended / access-denied |
> **Why disable Google and Bing News?** Google's HTML scraper is
> blocked aggressively and produces low-quality / inconsistent results. Bing's
> news scraper hits parser failures often enough to be more noise than signal.
> The remaining four engines (Brave API, DuckDuckGo, Startpage, Mojeek) cover
> general web search with stable results and no API rate-limit surprises.
### Brave Search API (`braveapi`)
`braveapi` is the official REST API engine — distinct from the `brave` engine,
which scrapes the public Brave Search HTML. The API engine is more reliable, has
proper rate limiting, and supports paging and time-range filters.
#### Configuration
```yaml
- name: braveapi
engine: braveapi
api_key: "{{ searxng_brave_api_key }}"
results_per_page: 20
inactive: false
disabled: false
```
#### `inactive: false` is required
The upstream SearXNG `settings.yml` ships `braveapi` with `inactive: true` and
an empty API key. Because `use_default_settings` does a shallow merge, an
override that only sets `disabled: false` leaves the inherited `inactive: true`
in place — and `inactive` engines are filtered out before `load_engine()` runs.
The result is a silent disable: no error appears in the logs, and the engine
never shows up in `/config`.
`disabled` and `inactive` are different gates:
- **`disabled`** — engine still loads; user can toggle it on/off via Preferences.
- **`inactive`** — engine is filtered out before loading; the UI never sees it.
`inactive: false` is mandatory. `disabled: false` may be set explicitly or
omitted, since `disabled` defaults to `false`.
#### Endpoint and result handling
The engine implementation (`searx/engines/braveapi.py`) hits a single endpoint:
```
https://api.search.brave.com/res/v1/web/search
```
with the `X-Subscription-Token` header. Although the Brave API can return
multiple result sections (`web`, `news`, `videos`, `discussions`, `infobox`,
`locations`, etc.), the SearXNG engine **only consumes `data["web"]["results"]`**.
Other sections in the response are silently discarded.
This means `braveapi` cannot be split into `braveapi.images` / `braveapi.news`
/ `braveapi.videos` engines the way the HTML-scraper `brave` engine is. To
surface those result types from Brave you'd need to patch the upstream engine
module. Because the `brave.*` scrapers are disabled (see the table above), other
category-specific engines fill that role for now.
#### Categories
`braveapi` declares `categories = ["general", "web"]` at module level. You don't
need to override this in the YAML.
### Verifying the engine is live
After `ansible-playbook searxng/deploy.yml` and a container restart:
```bash
# 1. Engine is loaded and registered
curl -s 'http://rosalind.incus:22089/config' \
| jq '.engines[] | select(.name=="braveapi")'
# 2. Direct query — bypasses any UI/category filtering
curl -s 'http://rosalind.incus:22089/search?q=python&format=json&engines=braveapi' \
| jq '.results | length, .unresponsive_engines'
# 3. Container logs — look for braveapi-specific errors
docker logs searxng 2>&1 | grep -i braveapi
```
## Authentication
SearXNG itself does not authenticate users. All public access goes through an
OAuth2-Proxy sidecar that talks to Casdoor for OIDC. Internal callers
(LobeChat, Argos, etc.) hit `http://rosalind.incus:22089/` directly and bypass
auth.
See [`searxng-auth.md`](./searxng-auth.md) for the full design and Casdoor
application setup.
## Monitoring
### Logs
The container is configured to ship its stdout/stderr to Alloy's syslog
receiver:
```yaml
logging:
driver: syslog
options:
syslog-address: "tcp://127.0.0.1:51403"
syslog-format: "{{syslog_format}}"
tag: "searxng"
```
Alloy on `rosalind.incus` forwards these to Loki. Query in Grafana with:
```
{job="searxng", host="rosalind.incus"}
```
### Health check
```bash
curl -fsS http://rosalind.incus:22089/healthz
```
## Operations
### Restart
```bash
ssh rosalind.incus
cd /srv/searxng
docker compose restart
```
### Force pull a newer image
```bash
ssh rosalind.incus
cd /srv/searxng
docker compose pull
docker compose up -d
```
Or just re-run the playbook — `pull: always` is set on the deploy task.
### Inspect rendered settings inside the container
```bash
ssh rosalind.incus
docker exec searxng cat /etc/searxng/settings.yml | grep -A6 -B1 braveapi
```
## Troubleshooting
### "Brave doesn't work"
1. Confirm the engine is registered: `/config` JSON should include a `braveapi`
entry. If absent, `inactive: false` is missing or the template didn't deploy.
2. Confirm the API key is non-empty inside the container — see "Inspect rendered
settings" above.
3. Hit the engine directly with `&engines=braveapi`. If `unresponsive_engines`
contains it with a reason, that's your real error (auth, rate limit, network).
### `radio_browser` / `wikidata` init errors at startup
These are unrelated to your engine config:
- **`radio_browser`** — known cache init-order bug in recent
`searxng/searxng:latest` images. The SQLite `properties` table isn't created
before `radio_browser.init()` calls `CACHE.get(...)`. The engine simply stays
unregistered; other engines work normally. Pinning to an older image tag
works around it.
- **`wikidata`** — transient: `query.wikidata.org` returned a truncated SPARQL
response during the startup language-fetch. Restart the container; if it
persists, Wikidata is rate-limiting the source IP.
### Cache appears stale after rotating `vault_searxng_secret_key`
Expected. The secret key is hashed and used as the cache password; on mismatch
SearXNG truncates every cache table at startup. No data loss — search still
works, the engines just rebuild their indexes lazily.
## References
- Upstream docs: <https://docs.searxng.org/>
- Brave Search API engine: <https://docs.searxng.org/dev/engines/online/brave.html>
- Brave Search API reference: [`brave_search_api.md`](./brave_search_api.md)
- SearXNG authentication design: [`searxng-auth.md`](./searxng-auth.md)
- [Ansible Practices](./ansible.md)

View File

@@ -60,6 +60,23 @@ EOT
}
}]
}
# umbriel: Incus instance definition for the Neo4j graph database host (Mnemosyne).
umbriel = {
description = "Neo4j Host (Mnemosyne) - Dusky sprite keeping the memory graph"
role = "graph_database"
image = "noble"
config = {
# Nesting is enabled and the AppArmor profile is set to unconfined for this instance.
# NOTE(review): presumably required to run containers inside the instance; confirm it is still needed.
"security.nesting" = true
"raw.lxc" = "lxc.apparmor.profile=unconfined"
}
devices = [{
# Proxy device: listens on port 25555 on all host interfaces and connects to
# the same port on the instance's loopback address.
# NOTE(review): the device is named "neo4j_ports" but only one port is forwarded;
# confirm which Neo4j listener is bound to 25555 inside the instance.
name = "neo4j_ports"
type = "proxy"
properties = {
listen = "tcp:0.0.0.0:25555"
connect = "tcp:127.0.0.1:25555"
}
}]
}
miranda = {
description = "Dedicated Docker Host for MCP Servers - Curious bridge between worlds"
role = "mcp_docker_host"
@@ -141,43 +158,68 @@ EOT
"security.nesting" = true
"raw.lxc" = "lxc.apparmor.profile=unconfined"
}
devices = [{
name = "caliban"
devices = [
{
name = "caliban_rdp"
type = "proxy"
properties = {
listen = "tcp:0.0.0.0:25519"
connect = "tcp:127.0.0.1:3389"
}
},
{
name = "caliban_web3"
type = "proxy"
properties = {
listen = "tcp:0.0.0.0:25518"
connect = "tcp:127.0.0.1:8008"
}
},
{
name = "caliban_web2"
type = "proxy"
properties = {
listen = "tcp:0.0.0.0:25517"
connect = "tcp:127.0.0.1:8007"
}
},
{
name = "caliban_web1"
type = "proxy"
properties = {
listen = "tcp:0.0.0.0:25516"
connect = "tcp:127.0.0.1:8006"
}
},
{
name = "caliban_postgres"
type = "proxy"
properties = {
listen = "tcp:0.0.0.0:25515"
connect = "tcp:127.0.0.1:5432"
}
},
{
name = "caliban_ssh"
type = "proxy"
properties = {
listen = "tcp:0.0.0.0:25512"
connect = "tcp:127.0.0.1:22"
}
},
{
name = "gpu"
type = "gpu"
properties = {}
}]
}
]
}
prospero = {
description = "Master magician observing events - PPLG observability stack with internal HAProxy"
role = "observability"
image = "noble"
config = {}
devices = [
{
name = "https_internal"
type = "proxy"
properties = {
listen = "tcp:0.0.0.0:25510"
connect = "tcp:127.0.0.1:443"
}
},
{
name = "http_redirect"
type = "proxy"
properties = {
listen = "tcp:0.0.0.0:25511"
connect = "tcp:127.0.0.1:80"
}
}
]
devices = []
}
titania = {
description = "Proxy & SSO Services - Queen of the fairies managing access and authentication"

View File

@@ -164,3 +164,33 @@ output "mnemosyne_s3_credentials" {
}
sensitive = true
}
# S3 bucket for Peitho file storage (document versions + converted Office files).
# The pool and project are taken from module variables so the bucket lands in the
# same storage pool and Incus project as the rest of this configuration.
resource "incus_storage_bucket" "peitho" {
name = "peitho"
pool = var.storage_pool
project = var.project_name
description = "Peitho document storage bucket"
# Explicit ordering: the project resource must exist before the bucket is created,
# because `project` above is a plain variable and carries no implicit dependency.
depends_on = [incus_project.ouranos]
}
# Access key for Peitho S3 bucket.
# `pool` and `storage_bucket` reference the bucket resource directly, which gives
# Terraform an implicit dependency on incus_storage_bucket.peitho.
resource "incus_storage_bucket_key" "peitho_key" {
name = "peitho-access"
pool = incus_storage_bucket.peitho.pool
storage_bucket = incus_storage_bucket.peitho.name
project = var.project_name
# NOTE(review): "admin" is presumably the full read/write role; confirm that a
# narrower role would not be sufficient for Peitho.
role = "admin"
}
# Exposes the Peitho bucket name, access/secret key pair and endpoint URL as a
# single object so they can be copied into the vault after `terraform apply`.
output "peitho_s3_credentials" {
description = "Peitho S3 bucket credentials - store in vault as vault_peitho_s3_*"
value = {
bucket = incus_storage_bucket.peitho.name
access_key = incus_storage_bucket_key.peitho_key.access_key
secret_key = incus_storage_bucket_key.peitho_key.secret_key
# NOTE(review): assumes the bucket's `location` attribute is a bare host[:port]
# without a scheme; if it already includes one, this yields a malformed URL. Confirm.
endpoint = "https://${incus_storage_bucket.peitho.location}"
}
# Marked sensitive so the keys are redacted from normal plan/apply output.
sensitive = true
}

View File

@@ -4,6 +4,7 @@ terraform {
required_providers {
incus = {
source = "lxc/incus"
version = "~> 1.0"
}
}
}