feat(alloy): add journal relabeling and kottos integration on puck

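Introduce structured journal relabel rules on puck to tag Pallas-managed units with {service, project, component} labels matching the Mnemosyne and Daedalus schema. Add the kottos release variable and vault secrets example entries for the new Pallas FastAgent runtime, together with the Kottos stage/deploy playbooks and their templates. Remove the defunct mnemosyne and daedalus syslog listeners now that both services ship JSON logs via the new docker-socket discovery pipeline. Also add Mnemosyne alert rules and Grafana dashboard provisioning for the puck services.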
2026-05-11 13:54:14 -04:00
parent e92ab80bbf
commit 8c95173705
19 changed files with 1336 additions and 27 deletions
--- a/ansible/alloy/puck/config.alloy.j2
+++ b/ansible/alloy/puck/config.alloy.j2
@@ -18,10 +18,60 @@ loki.source.file "system_logs" {
  forward_to = [loki.write.default.receiver]
 }
 // Journal relabel rules — tag Pallas-managed units (kottos now, mentor /
 // iolaus later) with the same {service, project, component} schema used
 // by Mnemosyne and Daedalus.  Relabel rules do NOT short-circuit: every
 // rule runs, top-to-bottom, against the label set left by the rules
 // before it.  The generic "systemd" fallback therefore stays last and
 // guards on `service` still being empty, so it never overwrites a more
 // specific match.  If a new Pallas host/project ever lands here, copy the
 // kottos rules below and adjust SyslogIdentifier + project.
 loki.relabel "journal_puck" {
  // No receivers: this component exists only to export its rule set.
  forward_to = []
  // Expose the systemd unit as an auxiliary label for debugging.
  rule {
    source_labels = ["__journal__systemd_unit"]
    target_label  = "unit"
  }
  // Kottos — Pallas FastAgent runtime for the engineering agent project.
  // SyslogIdentifier=kottos is set in ouranos/ansible/kottos/kottos.service.j2.
  rule {
    source_labels = ["__journal_syslog_identifier"]
    regex         = "kottos"
    target_label  = "service"
    replacement   = "pallas"
  }
  rule {
    source_labels = ["__journal_syslog_identifier"]
    regex         = "kottos"
    target_label  = "project"
    replacement   = "kottos"
  }
  // Alloy itself — useful to separate from the "systemd" bucket when the
  // shipping pipeline misbehaves.
  rule {
    source_labels = ["__journal__systemd_unit"]
    regex         = "alloy\\.service"
    target_label  = "service"
    replacement   = "alloy"
  }
  // Default fallback — anything no rule above has claimed becomes
  // service="systemd".  The regex only matches an empty `service` value,
  // so the pallas / alloy assignments above are preserved.
  rule {
    source_labels = ["service"]
    regex         = "^$"
    target_label  = "service"
    replacement   = "systemd"
  }
  // Keep job="systemd" on unit-backed entries for backwards compatibility
  // with existing dashboards that filter on ``job="systemd"``.
  rule {
    source_labels = ["__journal__systemd_unit"]
    regex         = ".+"
    target_label  = "job"
    replacement   = "systemd"
  }
 }
 loki.source.journal "systemd_logs" {
  forward_to    = [loki.write.default.receiver]
  relabel_rules = loki.relabel.journal_puck.rules
  labels = {
    job = "systemd",
    hostname    = "{{inventory_hostname}}",
    environment = "{{deployment_environment}}",
  }
@@ -69,19 +119,11 @@ loki.source.syslog "kairos_logs" {
  forward_to = [loki.write.default.receiver]
 }
-loki.source.syslog "menosyne_logs" {
+// Mnemosyne used to ship via syslog on {{mnemosyne_syslog_port}}; it now
-  listener {
+// logs line-delimited JSON to container stdout and is picked up by the
-    address  = "127.0.0.1:{{mnemosyne_syslog_port}}"
+// docker-socket block below. The host_var is retained as a reserved port
-    protocol = "tcp"
+// number but no listener binds to it — remove the var from the inventory
-    syslog_format = "{{ syslog_format }}"
+// when the rollout is verified.
    labels = {
      job = "menosyne",
      hostname = "{{inventory_hostname}}",
      environment = "{{deployment_environment}}",
    }
  }
  forward_to = [loki.write.default.receiver]
 }
 loki.source.syslog "spelunker_logs" {
  listener {
@@ -111,19 +153,66 @@ loki.source.syslog "jupyterlab_logs" {
  forward_to = [loki.write.default.receiver]
 }
-loki.source.syslog "daedalus_logs" {
+// Daedalus also used to ship via syslog on {{daedalus_syslog_port}}; it
-  listener {
+// already emits structlog JSON to stdout, so the docker-socket block
-    address  = "127.0.0.1:{{daedalus_syslog_port}}"
+// below now handles it. Host_var kept for the same transitional reason
-    protocol = "tcp"
+// as mnemosyne above.
-    syslog_format = "{{ syslog_format }}"
+
 // ----------------------------------------------------------------------------
 // Docker socket — any compose project on this host lands in Loki with
 // `service` = compose project (e.g. "mnemosyne", "daedalus", "kairos") and
 // `component` = compose service (e.g. "app", "mcp", "worker", "nginx").
 // This replaces per-service syslog listeners — one block covers every
 // compose project, current and future.
 //
 // Requires: the Alloy process to have read access to /var/run/docker.sock
 // (Ansible role should add the alloy user to the `docker` group). No Docker
 // daemon changes required — we scrape the json-file driver, which is Docker's
 // default and is pinned in each compose project's x-logging anchor.
 // ----------------------------------------------------------------------------
 discovery.docker "containers" {
  host             = "unix:///var/run/docker.sock"
  refresh_interval = "30s"
 }
 // Maps Docker discovery metadata onto the {service, component, container}
 // labels.  Rules are applied in order, so the final fallback rule can read
 // the `service` and `container` labels written by the rules above it.
 discovery.relabel "containers" {
  targets = discovery.docker.containers.targets
  // Compose project → service label
  rule {
    source_labels = ["__meta_docker_container_label_com_docker_compose_project"]
    target_label  = "service"
  }
  // Compose service → component label
  rule {
    source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
    target_label  = "component"
  }
  // Container name (for one-off / non-compose containers).  The regex
  // captures everything after the leading "/" of the discovered name.
  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "container"
  }
  // Fall back to the container name as `service` when compose labels are
  // absent (e.g. a `docker run ...` container outside any compose project).
  // The two source labels are joined as "<service>@<container>"; because
  // relabel regexes are fully anchored, "@(.+)" can only match when the
  // `service` part is empty, and the capture is then the container name.
  rule {
    source_labels = ["service", "container"]
    separator     = "@"
    regex         = "@(.+)"
    target_label  = "service"
  }
 }
 loki.source.docker "containers" {
  host       = "unix:///var/run/docker.sock"
  targets    = discovery.relabel.containers.output
  forward_to = [loki.write.default.receiver]
  labels = {
      job = "daedalus",
    hostname    = "{{inventory_hostname}}",
    environment = "{{deployment_environment}}",
  }
 }
  forward_to = [loki.write.default.receiver]
 }
 loki.write "default" {
  endpoint {
--- a/ansible/inventory/group_vars/all/vars.yml
+++ b/ansible/inventory/group_vars/all/vars.yml
@@ -34,6 +34,7 @@ spelunker_rel: main
 mcp_switchboard_rel: main
 kernos_rel: main
 rommie_rel: main
 kottos_rel: main
 # PyPI release version (no 'v' prefix) - https://pypi.org/project/open-webui/
 freecad_mcp_version: 0.6.1
 openwebui_rel: 0.8.3
--- a/ansible/inventory/group_vars/all/vault.yml.example
+++ b/ansible/inventory/group_vars/all/vault.yml.example
@@ -99,3 +99,25 @@ vault_ntth_token_1_app_secret: changeme
 vault_ntth_token_2_app_secret: changeme
 vault_ntth_token_3_app_secret: changeme
 vault_ntth_token_4_app_secret: changeme
 # Kottos (Pallas FastAgent runtime on puck)
 #   vault_kottos_openai_api_key      — API key for the OpenAI-compatible LLM
 #                                      endpoint (nyx Qwen in Ouranos, varies
 #                                      per environment).  Set to any string
 #                                      if the endpoint doesn't validate.
 #   vault_kottos_github_pat          — GitHub personal access token passed
 #                                      into the github MCP Docker container
 #                                      via GITHUB_PERSONAL_ACCESS_TOKEN env.
 #   vault_kottos_angelia_bearer      — Bearer token Kottos sends to the
 #                                      Angelia MCP server in its
 #                                      Authorization header.
 #   vault_kottos_mnemosyne_jwt       — Long-lived team JWT minted in the
 #                                      Daedalus admin UI → Settings →
 #                                      Pallas Instances → kottos row →
 #                                      "Reveal" or "Rotate".  Mnemosyne
 #                                      validates this on every search_memory
 #                                      call and scopes results to the
 #                                      workspaces attached to this team.
 vault_kottos_openai_api_key: changeme
 vault_kottos_github_pat: changeme
 vault_kottos_angelia_bearer: changeme
 vault_kottos_mnemosyne_jwt: changeme
--- a/ansible/inventory/host_vars/puck.incus.yml
+++ b/ansible/inventory/host_vars/puck.incus.yml
@@ -7,6 +7,7 @@ services:
  - docker
  - gitea_runner
  - athena
  - kottos
 # Gitea Runner
 gitea_runner_name: "puck-runner"
@@ -14,14 +15,86 @@ gitea_runner_name: "puck-runner"
 # Alloy
 alloy_log_level: "warn"
 angelia_syslog_port: 51422
 # mnemosyne_syslog_port retained for inventory-compatibility while the
 # Alloy Docker-socket discovery block rolls out; no listener binds to it
 # any more.  Delete once the docker-socket pipeline is proven in prod.
 mnemosyne_syslog_port: 51431
 athena_syslog_port: 51424
 kairos_syslog_port: 51425
 icarlos_syslog_port: 51426
 spelunker_syslog_port: 51428
 jupyterlab_syslog_port: 51411
 # daedalus_syslog_port retained for the same reason as mnemosyne above.
 daedalus_syslog_port: 51430
 # =============================================================================
 # PPLG scrape targets on puck
 # =============================================================================
 # Consumed by ``ansible/pplg/prometheus.yml.j2`` on Prospero.  Defining them
 # here keeps the scrape config fully parametric so the same playbook runs
 # unchanged against Ouranos / Virgo / Taurus — each environment sets its
 # own puck-equivalent host in its host_vars.
 # Daedalus (FastAPI on puck, behind nginx)
 daedalus_metrics_host: "puck.incus"
 daedalus_metrics_port: 22181
 # Mnemosyne — /metrics is served by nginx (mnemosyne-web:23181) and
 # proxied to the Django app container, which owns the single
 # prometheus_client process registry that both django-prometheus
 # (HTTP / Celery) and the MCP server's tool-call counters write to.
 mnemosyne_metrics_host: "puck.incus"
 mnemosyne_metrics_port: 23181
 # =============================================================================
 # Kottos Configuration (Pallas FastAgent runtime)
 # =============================================================================
 # Engineering agents (Harper, Scotty, Research, Tech Research) running as a
 # single systemd-managed ``pallas`` process.  Logs land in journald via
 # SyslogIdentifier=kottos, then Alloy's journal relabel block tags them as
 # {service="pallas", project="kottos"} for Loki.
 kottos_user: kottos
 kottos_group: kottos
 kottos_directory: /srv/kottos
 kottos_host: "puck.incus"
 kottos_namespace: "ca.helu.kottos"
 # Ports — registry at 24100, agents 24101–24149, sub-agents 24150–24199
 kottos_registry_port: 24100
 kottos_harper_port: 24101
 kottos_scotty_port: 24102
 kottos_research_port: 24150
 kottos_tech_research_port: 24151
 # Log level — INFO surfaces lifecycle + failures, DEBUG adds per-request
 # detail and successful health probe lines.  Ouranos Lab convention:
 # health-check 200 OKs live in DEBUG, never in INFO.
 pallas_log_level: INFO
 # fast-agent's own logger — keep at INFO in prod, bump to DEBUG alongside
 # pallas_log_level when chasing MCP transport issues.
 kottos_fastagent_log_level: info
 # LLM provider — the same OpenAI-compatible Qwen endpoint Kottos uses today.
 kottos_default_model: "openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
 kottos_openai_base_url: "http://nyx.helu.ca:22079/v1"
 kottos_model_vision: true
 kottos_model_context_window: 192000
 kottos_model_max_output_tokens: 16384
 kottos_timezone: "America/Toronto"
 # Downstream MCP server URLs — each parametric so Virgo / Taurus override
 # them in their own host_vars without touching the templates.
 kottos_argos_url: "http://miranda.incus:25534/mcp"
 kottos_neo4j_cypher_url: "http://circe.helu.ca:22034/mcp"
 kottos_kernos_scotty_url: "http://caliban.incus:22062/mcp"
 kottos_rommie_url: "http://caliban.incus:20361/mcp"
 kottos_gitea_url: "http://miranda.incus:25535/mcp"
 kottos_grafana_url: "http://miranda.incus:25533/mcp"
 kottos_kernos_harper_url: "http://korax.helu.ca:20261/mcp"
 kottos_angelia_url: "https://ouranos.helu.ca/mcp/"
 kottos_mnemosyne_url: "https://mnemosyne.ouranos.helu.ca/mcp/"
 # =============================================================================
 # Athena Configuration
 # =============================================================================
--- a/ansible/kottos/.env.j2
+++ b/ansible/kottos/.env.j2
@@ -0,0 +1,24 @@
 # Kottos runtime environment — rendered by Ansible from inventory host_vars.
 # ------------------------------------------------------------------------
 # Loaded by systemd (EnvironmentFile=) and inherited by the pallas process.
 # ``.env`` vars NOT set here come from pallas.server's defaults — tweak by
 # adding the variable to host_vars and this template, not by editing the
 # rendered file on the host.
 # ── Logging ─────────────────────────────────────────────────────────────────
 # Stdout JSON is the preferred sink for systemd+journald+Alloy deployments.
 # Rotating file sink is disabled by pointing PALLAS_LOG_FILE at /dev/null so
 # we don't write every record twice.
 PALLAS_LOG_STDOUT=1
 PALLAS_LOG_FILE=/dev/null
 PALLAS_LOG_LEVEL={{ pallas_log_level | default('INFO') }}
 # ── Config location ─────────────────────────────────────────────────────────
 # PALLAS_AGENTS_CONFIG can be overridden to point at a non-default topology
 # (e.g. staging scenarios).  Default: agents.yaml next to the working dir.
 PALLAS_AGENTS_CONFIG={{ kottos_directory }}/agents.yaml
 # ── LLM provider / MCP server secrets ───────────────────────────────────────
 # Secrets are rendered into fastagent.secrets.yaml rather than env vars so
 # fast-agent's existing YAML-merge logic applies.  This block stays empty
 # intentionally — the template exists for future per-host tunables.
--- a/ansible/kottos/agents.yaml.j2
+++ b/ansible/kottos/agents.yaml.j2
@@ -0,0 +1,43 @@
 # Kottos — Deployment Configuration (rendered by Ansible)
 # ------------------------------------------------------------------
 # Single source of truth for agent topology, ports, and registry
 # metadata.  Read by Pallas at startup.  The kottos/agents.yaml
 # committed in the kottos repo is the local-dev equivalent; Ansible
 # overwrites it with this rendered version.
 #
 # Host + namespace + registry port come from inventory host_vars so
 # Ouranos / Virgo / Taurus each get their own shape without template
 # edits.
 name: kottos
 version: "1.0.0"
 host: {{ kottos_agents_host | default(kottos_host) | default(inventory_hostname) }}
 namespace: {{ kottos_namespace | default('ca.helu.kottos') }}
 registry_port: {{ kottos_registry_port | default(24100) }}
 agents:
  harper:
    module: agents.harper
    port: {{ kottos_harper_port | default(24101) }}
    title: Harper
    description: "Scrappy engineer — rapid prototyping, hacking, and creative problem-solving"
    depends_on: [research, tech_research]
  scotty:
    module: agents.scotty
    port: {{ kottos_scotty_port | default(24102) }}
    title: Scotty
    description: "Systems administration expert — infrastructure diagnostics, security hardening, and keeping everything running"
    depends_on: [tech_research]
  research:
    module: agents.research
    port: {{ kottos_research_port | default(24150) }}
    title: Research Agent
    description: "Web search via Argos and knowledge graph via Neo4j"
  tech_research:
    module: agents.tech_research
    port: {{ kottos_tech_research_port | default(24151) }}
    title: Tech Research
    description: "Technical investigation — library comparisons, API docs, framework patterns, code examples"
--- a/ansible/kottos/deploy.yml
+++ b/ansible/kottos/deploy.yml
@@ -0,0 +1,192 @@
 ---
 - name: Deploy Kottos (Pallas FastAgent runtime)
  hosts: ubuntu
  vars:
    ansible_common_remote_group: "{{ kottos_group | default([]) }}"
    allow_world_readable_tmpfiles: true
  tasks:
    - name: Check if host has kottos service
      ansible.builtin.set_fact:
        has_kottos_service: "{{ 'kottos' in services | default([]) }}"
    - name: Skip hosts without kottos service
      ansible.builtin.meta: end_host
      when: not has_kottos_service
    - name: Create Kottos group
      become: true
      ansible.builtin.group:
        name: "{{ kottos_group }}"
        state: present
    - name: Create kottos user
      become: true
      ansible.builtin.user:
        name: "{{ kottos_user }}"
        group: "{{ kottos_group }}"
        home: "/home/{{ kottos_user }}"
        shell: /bin/bash
        system: false
        create_home: true
    - name: Add keeper_user to kottos group (optional — enables passwordless tailing)
      become: true
      ansible.builtin.user:
        name: "{{ keeper_user }}"
        groups: "{{ kottos_group }}"
        append: true
      when: keeper_user is defined
    - name: Reset connection to pick up new group membership
      ansible.builtin.meta: reset_connection
    - name: Create Kottos install directory
      become: true
      ansible.builtin.file:
        path: "{{ kottos_directory }}"
        owner: "{{ kottos_user }}"
        group: "{{ kottos_group }}"
        state: directory
        mode: '0750'
    - name: Ensure base packages for Python + Docker MCP workflows
      become: true
      ansible.builtin.apt:
        name:
          - tar
          - python3
          - python3-venv
          - python3-dev
          - git
        state: present
        update_cache: true
    - name: Transfer and unarchive Kottos release
      become: true
      ansible.builtin.unarchive:
        src: "~/rel/kottos_{{ kottos_rel }}.tar"
        dest: "{{ kottos_directory }}"
        owner: "{{ kottos_user }}"
        group: "{{ kottos_group }}"
        mode: '0550'
      notify: restart kottos
    # Creates the .venv directory if it is missing and recursively hands it
    # to the service account, so the venv / pip tasks that run as
    # kottos_user can write into it.  Runs unconditionally on every deploy.
    - name: Ensure .venv directory ownership is correct
      become: true
      ansible.builtin.file:
        path: "{{ kottos_directory }}/.venv"
        owner: "{{ kottos_user }}"
        group: "{{ kottos_group }}"
        state: directory
        recurse: true
    - name: Create virtual environment for Kottos
      become: true
      become_user: "{{ kottos_user }}"
      ansible.builtin.command:
        cmd: "python3 -m venv {{ kottos_directory }}/.venv/"
        creates: "{{ kottos_directory }}/.venv/bin/activate"
    - name: Install wheel in the virtualenv
      become: true
      become_user: "{{ kottos_user }}"
      ansible.builtin.pip:
        name:
          - wheel
        state: latest
        virtualenv: "{{ kottos_directory }}/.venv"
    - name: Install Kottos (pyproject.toml — pulls in pallas-mcp and fast-agent-mcp)
      become: true
      become_user: "{{ kottos_user }}"
      ansible.builtin.pip:
        chdir: "{{ kottos_directory }}/kottos"
        name: .
        virtualenv: "{{ kottos_directory }}/.venv"
        virtualenv_command: python3 -m venv
      notify: restart kottos
    - name: Template agents.yaml
      become: true
      ansible.builtin.template:
        src: agents.yaml.j2
        dest: "{{ kottos_directory }}/agents.yaml"
        owner: "{{ kottos_user }}"
        group: "{{ kottos_group }}"
        mode: '0640'
      notify: restart kottos
    - name: Template fastagent.config.yaml
      become: true
      ansible.builtin.template:
        src: fastagent.config.yaml.j2
        dest: "{{ kottos_directory }}/fastagent.config.yaml"
        owner: "{{ kottos_user }}"
        group: "{{ kottos_group }}"
        mode: '0640'
      notify: restart kottos
    - name: Template fastagent.secrets.yaml (vault-rendered)
      become: true
      ansible.builtin.template:
        src: fastagent.secrets.yaml.j2
        dest: "{{ kottos_directory }}/fastagent.secrets.yaml"
        owner: "{{ kottos_user }}"
        group: "{{ kottos_group }}"
        mode: '0600'
      notify: restart kottos
      no_log: true
    - name: Template runtime .env (PALLAS_LOG_STDOUT etc.)
      become: true
      ansible.builtin.template:
        src: .env.j2
        dest: "{{ kottos_directory }}/.env"
        owner: "{{ kottos_user }}"
        group: "{{ kottos_group }}"
        mode: '0640'
      notify: restart kottos
    - name: Template systemd unit
      become: true
      ansible.builtin.template:
        src: kottos.service.j2
        dest: /etc/systemd/system/kottos.service
        owner: root
        group: root
        mode: '0644'
      notify: restart kottos
    - name: Enable and start kottos service
      become: true
      ansible.builtin.systemd:
        name: kottos
        enabled: true
        state: started
        daemon_reload: true
    - name: Flush handlers before validation probes
      ansible.builtin.meta: flush_handlers
    # ── Validation ──────────────────────────────────────────────────────────
    # Registry is the only endpoint that responds with a deterministic JSON
    # payload without requiring an MCP session, so we probe it.  Agent ports
    # are exercised by Daedalus's health-poll loop once registered.
    - name: Validate Kottos registry responds
      ansible.builtin.uri:
        url: "http://localhost:{{ kottos_registry_port | default(24100) }}/.well-known/mcp/server.json"
        status_code: 200
        return_content: true
      register: registry_check
      retries: 10
      delay: 3
      until: registry_check.status == 200
  handlers:
    - name: restart kottos
      become: true
      ansible.builtin.systemd:
        name: kottos
        state: restarted
--- a/ansible/kottos/fastagent.config.yaml.j2
+++ b/ansible/kottos/fastagent.config.yaml.j2
@@ -0,0 +1,114 @@
 # Kottos — fast-agent configuration (rendered by Ansible)
 # ------------------------------------------------------------------
 # Committed-to-kottos copy is the local-dev equivalent; Ansible overwrites
 # it with this rendered file on deploy.  MCP server URLs are parametrised
 # so the same template renders correctly for Ouranos (.incus) and Virgo
 # (.virgo / .taurus) — each environment's host_vars supplies the base URLs.
 default_model: {{ kottos_default_model | default('openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf') }}
 # ── Model Capabilities ──────────────────────────────────────────────────────
 # Declares capabilities for models not in fast-agent's ModelDatabase.
 # vision: true adds image/jpeg, image/png, image/webp to the tokenizer list.
 model_capabilities:
  vision: {{ kottos_model_vision | default(true) | string | lower }}
  context_window: {{ kottos_model_context_window | default(192000) }}
  max_output_tokens: {{ kottos_model_max_output_tokens | default(16384) }}
 # ── LLM Providers ───────────────────────────────────────────────────────────
 openai:
  base_url: {{ kottos_openai_base_url | default('http://nyx.helu.ca:22079/v1') }}
 mcp:
  servers:
    # ── Web search via SearXNG (argos) ───────────────────────────────────────
    argos:
      transport: http
      url: "{{ kottos_argos_url | default('http://miranda.incus:25534/mcp') }}"
    # ── Knowledge graph — Neo4j ──────────────────────────────────────────────
    neo4j_cypher:
      transport: http
      url: "{{ kottos_neo4j_cypher_url | default('http://circe.helu.ca:22034/mcp') }}"
    # ── Shell + file operations — Kernos (Caliban) ───────────────────────────
    kernos_scotty:
      transport: http
      url: "{{ kottos_kernos_scotty_url | default('http://caliban.incus:22062/mcp') }}"
      load_on_start: false
    # ── Agent S computer automation — Rommie on Caliban ──────────────────────
    rommie:
      transport: http
      url: "{{ kottos_rommie_url | default('http://caliban.incus:20361/mcp') }}"
      load_on_start: false
    # ── Git repository management — Gitea MCP ────────────────────────────────
    gitea:
      transport: http
      url: "{{ kottos_gitea_url | default('http://miranda.incus:25535/mcp') }}"
    # ── Grafana observability ───────────────────────────────────────────────
    grafana:
      transport: http
      url: "{{ kottos_grafana_url | default('http://miranda.incus:25533/mcp') }}"
    # ── Shell + file operations — Kernos (Korax) ─────────────────────────────
    kernos_harper:
      transport: http
      url: "{{ kottos_kernos_harper_url | default('http://korax.helu.ca:20261/mcp') }}"
      load_on_start: false
    # ── Angelia messaging ───────────────────────────────────────────────────
    # Auth header provided by fastagent.secrets.yaml (vault-rendered).
    angelia:
      transport: http
      url: "{{ kottos_angelia_url | default('https://ouranos.helu.ca/mcp/') }}"
    # ── GitHub MCP Server (local Docker, stdio) ──────────────────────────────
    # GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml
    github:
      command: "docker"
      args:
        - "run"
        - "-i"
        - "--rm"
        - "-e"
        - "GITHUB_PERSONAL_ACCESS_TOKEN"
        - "ghcr.io/github/github-mcp-server"
    # ── Library/framework documentation — Context7 (local stdio) ─────────────
    context7:
      command: "npx"
      args: ["-y", "@upstash/context7-mcp"]
    # ── Current time and timezone (local stdio) ──────────────────────────────
    time:
      command: "mcp-server-time"
      args: ["--local-timezone={{ kottos_timezone | default('America/Toronto') }}"]
    # ── Mnemosyne knowledge search — workspace-scoped ────────────────────────
    # Auth is a long-lived team JWT supplied by fastagent.secrets.yaml
    # (forward_inbound_auth=false — Mnemosyne validates the team JWT).
    mnemosyne:
      transport: http
      url: "{{ kottos_mnemosyne_url | default('https://mnemosyne.ouranos.helu.ca/mcp/') }}"
    # ── Kottos internal sub-agents ───────────────────────────────────────────
    # These stay on localhost regardless of environment — Pallas serves the
    # sub-agents on the same host as the top-level agents.
    research:
      transport: http
      url: "http://localhost:{{ kottos_research_port | default(24150) }}/mcp"
    tech_research:
      transport: http
      url: "http://localhost:{{ kottos_tech_research_port | default(24151) }}/mcp"
 logger:
  type: none
  level: {{ kottos_fastagent_log_level | default('info') }}
  progress_display: false
  show_chat: false
  show_tools: false
  truncate_tools: true
--- a/ansible/kottos/fastagent.secrets.yaml.j2
+++ b/ansible/kottos/fastagent.secrets.yaml.j2
@@ -0,0 +1,27 @@
 # Kottos — fast-agent secrets (rendered by Ansible from the vault)
 # ------------------------------------------------------------------
 # Never commit the rendered file.  Each value here pulls from a vault
 # variable — if a vault variable is missing, Ansible will fail the
 # template step with a clear error before the file is written.
 #
 # Same structure as fastagent.config.yaml; values merge with secrets
 # taking precedence (fast-agent deep-merges the two).
 openai:
  api_key: "{{ vault_kottos_openai_api_key }}"
 mcp:
  servers:
    github:
      env:
        GITHUB_PERSONAL_ACCESS_TOKEN: "{{ vault_kottos_github_pat }}"
    angelia:
      headers:
        Authorization: "Bearer {{ vault_kottos_angelia_bearer }}"
    # Long-lived team JWT minted in Daedalus admin UI.
    # See kottos/README.md § "Mnemosyne memory" for the rotation procedure.
    mnemosyne:
      headers:
        Authorization: "Bearer {{ vault_kottos_mnemosyne_jwt }}"
--- a/ansible/kottos/kottos.service.j2
+++ b/ansible/kottos/kottos.service.j2
@@ -0,0 +1,33 @@
 [Unit]
 Description=Kottos — Pallas FastAgent runtime ({{ kottos_host | default(inventory_hostname) }})
 # Wants= only pulls network-online.target into the transaction; After= is
 # what delays startup until it is reached.  Both must name the same target
 # so Pallas starts once the network is up and upstream MCP servers are
 # reachable.
 After=network-online.target
 Wants=network-online.target
 [Service]
 Type=simple
 User={{ kottos_user }}
 Group={{ kottos_group }}
 WorkingDirectory={{ kottos_directory }}
 EnvironmentFile={{ kottos_directory }}/.env
 ExecStart={{ kottos_directory }}/.venv/bin/pallas
 Restart=always
 RestartSec=5
 # Journal is the durable sink (Alloy picks up via loki.source.journal and
 # relabels SyslogIdentifier=kottos into {service="pallas", project="kottos"}
 # for Loki).  Stdout from pallas is already JSON thanks to
 # PALLAS_LOG_STDOUT=1 set in the .env file.
 StandardOutput=journal
 StandardError=journal
 SyslogIdentifier=kottos
 # Pallas needs to reach localhost sibling agents + upstream MCP servers
 # and read its own .venv / agents.yaml / config files.  No hardening flags
 # that would block those paths.
 NoNewPrivileges=false
 ProtectSystem=false
 ProtectHome=false
 PrivateTmp=false
 [Install]
 WantedBy=multi-user.target
--- a/ansible/kottos/stage.yml
+++ b/ansible/kottos/stage.yml
@@ -0,0 +1,48 @@
 - name: Stage Kottos release tarball
  hosts: localhost
  gather_facts: false
  vars:
    archive_path: "{{rel_dir}}/kottos_{{kottos_rel}}.tar"
    kottos_repo_url: "ssh://git@git.helu.ca:22022/r/kottos.git"
    kottos_repo_dir: "{{repo_dir}}/kottos"
  tasks:
    - name: Ensure release directory exists
      file:
        path: "{{rel_dir}}"
        state: directory
        mode: '755'
    - name: Ensure repo directory exists
      file:
        path: "{{repo_dir}}"
        state: directory
        mode: '755'
    # A single git task covers both the first clone and later refreshes:
    # the module clones when dest is absent and otherwise fetches and
    # checks out kottos_rel.  force discards local modifications in the
    # staging checkout.  Failures (bad ref, unreachable remote) now stop
    # the play instead of being ignored and archiving a stale tree.
    - name: Clone or update Kottos repository at the requested release
      ansible.builtin.git:
        repo: "{{kottos_repo_url}}"
        dest: "{{kottos_repo_dir}}"
        version: "{{kottos_rel}}"
        accept_hostkey: true
        update: true
        force: true
    - name: Create release archive
      ansible.builtin.archive:
        path: "{{kottos_repo_dir}}"
        dest: "{{archive_path}}"
        format: tar
        exclude_path:
          - "{{kottos_repo_dir}}/.git"
          - "{{kottos_repo_dir}}/.venv"
          - "{{kottos_repo_dir}}/__pycache__"
          - "{{kottos_repo_dir}}/fastagent.secrets.yaml"
--- a/ansible/pplg/alert_rules.yml.j2
+++ b/ansible/pplg/alert_rules.yml.j2
@@ -312,6 +312,78 @@ groups:
          summary: "Daedalus S3 error rate above 1%"
          description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
  # ============================================================================
  # Mnemosyne Application Alerts
  # ============================================================================
  # One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint.
  # The Django app container hosts the single prometheus_client registry that
  # both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool
  # call counters) write to, so "MCP is broken" signals show up as
  # ``mcp_tool_invocations_total{status="error"}`` on the same job rather
  # than a separate up{} series.
  - name: mnemosyne_alerts
    rules:
      - alert: MnemosyneDown
        expr: up{job="mnemosyne"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Mnemosyne is down"
          description: "The Mnemosyne /metrics endpoint has been unreachable for more than 2 minutes.  Both the Django app and the MCP server (same container family) are presumed unavailable."
      - alert: MnemosyneHighErrorRate
        expr: |
          sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m]))
            / sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Mnemosyne HTTP 5xx error rate above 5%"
          description: "Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes."
      - alert: MnemosyneSlowResponses
        expr: |
          histogram_quantile(0.95,
            sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m]))
          ) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Mnemosyne p95 response time above 5s"
          description: "Mnemosyne p95 response latency is {{ $value | printf \"%.2f\" }}s over the last 5 minutes."
      # MCP tool-call error surface — owned by mcp_server.metrics on the
      # same /metrics endpoint.  This complements MnemosyneDown by catching
      # "app is up but the MCP layer is sick" — e.g. auth token lookups are
      # failing, or Neo4j vector search is 500-ing.
      - alert: MnemosyneMCPToolErrors
        expr: |
          sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m]))
            / sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Mnemosyne MCP tool error rate above 10%"
          description: "MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service=\"mnemosyne\", component=\"mcp\"})."
      # Celery queue depth — high pending count usually means the embedding
      # worker is stuck or throttled by the embedding provider.  Requires
      # ``celery-prometheus-exporter`` or similar to emit ``celery_queue_length``;
      # if that is not deployed yet, this rule simply never fires.
      - alert: MnemosyneCeleryBacklog
        expr: |
          sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Mnemosyne Celery backlog on {{ $labels.queue }}"
          description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
 # Red Panda Seal of Approval 🐼
 # "If the metrics aren't red, go back to bed"
 {% endraw %}
--- a/ansible/pplg/dashboards_provider.yml.j2
+++ b/ansible/pplg/dashboards_provider.yml.j2
@@ -0,0 +1,23 @@
 # Grafana dashboard file provider
 # Deployed to: /etc/grafana/provisioning/dashboards/puck.yaml
 #
 # Grafana polls the ``path`` every ``updateIntervalSeconds`` and re-imports
 # any JSON file it finds.  Each dashboard JSON lives in that directory and
 # is owned by Ansible — operators should not edit dashboards through the
 # Grafana UI (changes won't survive a deploy; export the final JSON and
 # land it in this role).
 apiVersion: 1
 providers:
  # Single file-based provider; every dashboard it loads is placed in the
  # "Puck Services" folder.
  - name: 'puck'
    orgId: 1
    folder: 'Puck Services'
    folderUid: puck-services
    type: file
    disableDeletion: false
    editable: true
    # Ansible owns the dashboard JSON (see header) — UI saves are disabled so
    # changes must be exported and landed in this role.
    allowUiUpdates: false
    # Poll interval for re-importing JSON files found under ``path``.
    updateIntervalSeconds: 30
    options:
      # Directory scanned for dashboard JSON; pplg/deploy.yml writes
      # puck_services.json here.
      path: /var/lib/grafana/dashboards
      foldersFromFilesStructure: false
--- a/ansible/pplg/deploy.yml
+++ b/ansible/pplg/deploy.yml
@@ -208,6 +208,32 @@
      group: grafana
      mode: '750'
  - name: Ensure Grafana dashboard provisioning directory exists
    ansible.builtin.file:
      path: /etc/grafana/provisioning/dashboards
      state: directory
      owner: grafana
      group: grafana
      mode: '750'
  - name: Template Grafana dashboard provider (file source → /var/lib/grafana/dashboards)
    ansible.builtin.template:
      src: "dashboards_provider.yml.j2"
      dest: "/etc/grafana/provisioning/dashboards/puck.yaml"
      owner: grafana
      group: grafana
      mode: '640'
    notify: restart grafana
  # The dashboard JSON carries Grafana legend placeholders ("{{level}}",
  # "{{component}}") and has no Ansible variables of its own.  Rendering it
  # through Jinja2 would try to resolve those placeholders as Ansible
  # variables and fail the play (or silently substitute an unrelated
  # inventory value), so the file is shipped verbatim with ``copy``.
  # If the dashboard ever needs real templating, switch back to ``template``
  # and wrap the Grafana placeholders in {% raw %} ... {% endraw %}.
  - name: Install Puck Services dashboard (Mnemosyne + Pallas + Daedalus)
    ansible.builtin.copy:
      src: "puck_services_dashboard.json.j2"
      dest: "/var/lib/grafana/dashboards/puck_services.json"
      owner: grafana
      group: grafana
      mode: '640'
    notify: restart grafana
  - name: Template Grafana main configuration
    ansible.builtin.template:
      src: "grafana.ini.j2"
--- a/ansible/pplg/prometheus.yml.j2
+++ b/ansible/pplg/prometheus.yml.j2
@@ -47,7 +47,18 @@ scrape_configs:
  - job_name: 'daedalus'
    static_configs:
-      - targets: ['puck.incus:22181']
+      - targets: ['{{ daedalus_metrics_host }}:{{ daedalus_metrics_port }}']
    metrics_path: '/metrics'
    scrape_interval: 15s
  # Mnemosyne — single /metrics endpoint on the app container serves both
  # django-prometheus HTTP/Celery metrics and the MCP server's tool-call
  # counters (the mcp_server.metrics module registers into the same
  # prometheus_client process registry on the Django side).  The mcp
  # container itself does not expose /metrics; run 'em on the WSGI side.
  - job_name: 'mnemosyne'
    static_configs:
      - targets: ['{{ mnemosyne_metrics_host }}:{{ mnemosyne_metrics_port }}']
    metrics_path: '/metrics'
    scrape_interval: 15s
--- a/ansible/pplg/puck_services_dashboard.json.j2
+++ b/ansible/pplg/puck_services_dashboard.json.j2
@@ -0,0 +1,242 @@
 {
  "title": "Puck Services — Logs & Health",
  "uid": "puck-services-logs",
  "tags": ["puck", "logs", "mnemosyne", "pallas", "daedalus"],
  "timezone": "browser",
  "schemaVersion": 39,
  "version": 1,
  "editable": true,
  "fiscalYearStartMonth": 0,
  "weekStart": "",
  "refresh": "30s",
  "time": {"from": "now-1h", "to": "now"},
  "templating": {
    "list": [
      {
        "name": "loki",
        "type": "datasource",
        "query": "loki",
        "current": {"selected": false, "text": "Loki", "value": "Loki"},
        "hide": 0,
        "label": "Loki datasource"
      },
      {
        "name": "prom",
        "type": "datasource",
        "query": "prometheus",
        "current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
        "hide": 0,
        "label": "Prometheus datasource"
      }
    ]
  },
  "panels": [
    {
      "id": 1,
      "type": "row",
      "title": "Mnemosyne",
      "collapsed": false,
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
    },
    {
      "id": 2,
      "type": "timeseries",
      "title": "Mnemosyne — log rate by level",
      "datasource": {"type": "loki", "uid": "${loki}"},
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 1},
      "targets": [
        {
          "refId": "A",
          "expr": "sum by (level) (rate({service=\"mnemosyne\"} | json [5m]))",
          "legendFormat": "{{level}}"
        }
      ],
      "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
    },
    {
      "id": 3,
      "type": "logs",
      "title": "Mnemosyne — errors (last 25)",
      "datasource": {"type": "loki", "uid": "${loki}"},
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 1},
      "targets": [
        {
          "refId": "A",
          "expr": "{service=\"mnemosyne\"} | json | level=\"ERROR\"",
          "maxLines": 25
        }
      ],
      "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
    },
    {
      "id": 4,
      "type": "stat",
      "title": "Mnemosyne — HTTP 5xx rate",
      "datasource": {"type": "prometheus", "uid": "${prom}"},
      "gridPos": {"h": 4, "w": 8, "x": 0, "y": 9},
      "targets": [
        {
          "refId": "A",
          "expr": "sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\",status=~\"5..\"}[5m])) / clamp_min(sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\"}[5m])), 0.0001)"
        }
      ],
      "options": {
        "reduceOptions": {"calcs": ["lastNotNull"]},
        "colorMode": "value",
        "textMode": "auto"
      },
      "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
    },
    {
      "id": 5,
      "type": "stat",
      "title": "Mnemosyne — p95 latency",
      "datasource": {"type": "prometheus", "uid": "${prom}"},
      "gridPos": {"h": 4, "w": 8, "x": 8, "y": 9},
      "targets": [
        {
          "refId": "A",
          "expr": "histogram_quantile(0.95, sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job=\"mnemosyne\"}[5m])))"
        }
      ],
      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
      "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 1}, {"color": "red", "value": 5}]}}}
    },
    {
      "id": 6,
      "type": "stat",
      "title": "Mnemosyne — MCP tool error rate",
      "datasource": {"type": "prometheus", "uid": "${prom}"},
      "gridPos": {"h": 4, "w": 8, "x": 16, "y": 9},
      "targets": [
        {
          "refId": "A",
          "expr": "sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\",status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\"}[5m])), 0.0001)"
        }
      ],
      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
      "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
    },
    {
      "id": 10,
      "type": "row",
      "title": "Pallas (Kottos agents)",
      "collapsed": false,
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 13}
    },
    {
      "id": 11,
      "type": "timeseries",
      "title": "Pallas — log rate by agent (component)",
      "datasource": {"type": "loki", "uid": "${loki}"},
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
      "targets": [
        {
          "refId": "A",
          "expr": "sum by (component) (rate({service=\"pallas\", project=\"kottos\"} | json [5m]))",
          "legendFormat": "{{component}}"
        }
      ],
      "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
    },
    {
      "id": 12,
      "type": "logs",
      "title": "Pallas — forward trace errors (opaque MCP transport failures)",
      "datasource": {"type": "loki", "uid": "${loki}"},
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
      "targets": [
        {
          "refId": "A",
          "expr": "{service=\"pallas\", project=\"kottos\"} |= \"pallas.forward.trace\" | json | level=~\"ERROR|WARNING\"",
          "maxLines": 25
        }
      ],
      "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
    },
    {
      "id": 13,
      "type": "logs",
      "title": "Pallas — last 25 ERROR lines (any agent)",
      "datasource": {"type": "loki", "uid": "${loki}"},
      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 22},
      "targets": [
        {
          "refId": "A",
          "expr": "{service=\"pallas\", project=\"kottos\"} | json | level=\"ERROR\"",
          "maxLines": 25
        }
      ],
      "options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
    },
    {
      "id": 20,
      "type": "row",
      "title": "Daedalus",
      "collapsed": false,
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 30}
    },
    {
      "id": 21,
      "type": "timeseries",
      "title": "Daedalus — log rate by level",
      "datasource": {"type": "loki", "uid": "${loki}"},
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 31},
      "targets": [
        {
          "refId": "A",
          "expr": "sum by (level) (rate({service=\"daedalus\"} | json [5m]))",
          "legendFormat": "{{level}}"
        }
      ],
      "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
    },
    {
      "id": 22,
      "type": "stat",
      "title": "Daedalus — HTTP 5xx rate",
      "datasource": {"type": "prometheus", "uid": "${prom}"},
      "gridPos": {"h": 4, "w": 6, "x": 12, "y": 31},
      "targets": [
        {
          "refId": "A",
          "expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)"
        }
      ],
      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
      "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
    },
    {
      "id": 23,
      "type": "stat",
      "title": "Daedalus — MCP p95 latency",
      "datasource": {"type": "prometheus", "uid": "${prom}"},
      "gridPos": {"h": 4, "w": 6, "x": 18, "y": 31},
      "targets": [
        {
          "refId": "A",
          "expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_mcp_request_duration_seconds_bucket[5m])))"
        }
      ],
      "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
      "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 5}, {"color": "red", "value": 30}]}}}
    },
    {
      "id": 24,
      "type": "logs",
      "title": "Daedalus — errors (last 25)",
      "datasource": {"type": "loki", "uid": "${loki}"},
      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 39},
      "targets": [
        {
          "refId": "A",
          "expr": "{service=\"daedalus\"} | json | level=\"ERROR\"",
          "maxLines": 25
        }
      ],
      "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
    }
  ]
 }
--- a/ansible/site.yml
+++ b/ansible/site.yml
@@ -44,3 +44,9 @@
 - name: Deploy Agent S
  import_playbook: agent_s/deploy.yml
 - name: Stage Kottos (Pallas FastAgent runtime)
  import_playbook: kottos/stage.yml
 - name: Deploy Kottos
  import_playbook: kottos/deploy.yml
--- a/docs/kottos.md
+++ b/docs/kottos.md
@@ -163,6 +163,96 @@ The registry includes model capabilities on each agent entry:
 }
 ```
 ## Deployment
 Kottos runs two ways:
 1. **Locally on caliban**, hand-started for iteration (`kottos` from the repo root). This is the flow documented above in *Quickstart*.
 2. **In Ouranos / Virgo / Taurus via Ansible**, as a `systemd`-managed `pallas` process on the puck.incus container. This is the pipeline that feeds the Puck Services dashboard in Grafana.
 ### Ansible role
 Lives in `ouranos/ansible/kottos/`:
 | File | Purpose |
 |---|---|
 | `deploy.yml` | Main playbook — user/group, venv, systemd unit, config templating, registry probe. |
 | `stage.yml` | Clones `git.helu.ca/r/kottos` at `{{ kottos_rel }}` and creates the release tarball. |
 | `kottos.service.j2` | systemd unit. `SyslogIdentifier=kottos`, `StandardOutput=journal`, `PALLAS_LOG_STDOUT=1` via the env file. |
 | `.env.j2` | Runtime environment for `pallas` — logging config, `PALLAS_AGENTS_CONFIG`. |
 | `agents.yaml.j2` | Deployment topology with host/ports pulled from inventory. |
 | `fastagent.config.yaml.j2` | LLM provider + MCP server URLs, parametric per environment. |
 | `fastagent.secrets.yaml.j2` | API keys and auth tokens, rendered from Ansible Vault. |
 ### Inventory
 Host variables live in `inventory/host_vars/puck.incus.yml` under **Kottos Configuration**:
 ```yaml
 kottos_user: kottos
 kottos_group: kottos
 kottos_directory: /srv/kottos
 kottos_host: "puck.incus"
 kottos_registry_port: 24100
 kottos_harper_port: 24101
 kottos_scotty_port: 24102
 kottos_research_port: 24150
 kottos_tech_research_port: 24151
 pallas_log_level: INFO
 kottos_default_model: "openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
 kottos_openai_base_url: "http://nyx.helu.ca:22079/v1"
 # ...plus one entry per downstream MCP URL so each environment overrides freely
 ```
 Every host variable is parametric — Virgo's `puck.virgo.yml` (or wherever the Pallas host lives) can override any value without touching the templates.
 ### Vault
 Four vault keys are required — all documented in `inventory/group_vars/all/vault.yml.example`:
 | Key | Used for |
 |---|---|
 | `vault_kottos_openai_api_key` | OpenAI-compatible LLM endpoint (nyx Qwen in Ouranos). |
 | `vault_kottos_github_pat` | `GITHUB_PERSONAL_ACCESS_TOKEN` for the local GitHub MCP Docker container. |
 | `vault_kottos_angelia_bearer` | Bearer token accepted by the Angelia MCP server. |
 | `vault_kottos_mnemosyne_jwt` | Long-lived team JWT from Daedalus admin UI — Mnemosyne validates it on every `search_memory` call and scopes results to this team's workspaces. |
 ### Deploying
 Wired into `site.yml`:
 ```bash
 cd ansible
 ansible-playbook kottos/stage.yml     # clone repo + build tarball (local)
 ansible-playbook kottos/deploy.yml    # deploy + template + start
 ```
 Or run the full site (`ansible-playbook site.yml`) — kottos's stage + deploy steps are the last block in the sequence.
 ### Logs
 Journal identifier `kottos`, so on the host:
 ```bash
 sudo journalctl -u kottos -f --output=cat | jq .
 ```
 Alloy on puck's journal source relabels `__journal_syslog_identifier=kottos` to `{service="pallas", project="kottos"}`, then into Loki. Everything shows up in Grafana's *Puck Services — Logs & Health* dashboard under the **Pallas** row, with per-agent colouring driven by the `component` JSON field (`harper`, `scotty`, `research`, `tech_research`).
 For per-agent follow-along:
 ```logql
 {service="pallas", project="kottos"} | json | component="harper"
 ```
 For the opaque-MCP-transport-failure trace stream (see Pallas's bearer-forwarding incident history):
 ```logql
 {service="pallas", project="kottos"} |= "pallas.forward.trace" | json
 ```
 See [logging.md](logging.md) for the full label schema + level policy + add-a-new-service guide.
 ## Downstream MCP Servers
 | Server | Host | URL |
--- a/docs/logging.md
+++ b/docs/logging.md
@@ -0,0 +1,173 @@
 # Unified Logging — Mnemosyne, Pallas, Daedalus
 PPLG is the single destination for every service's logs. This document describes the label schema every service emits, the two transports Alloy uses to collect logs, and the level policy that keeps INFO output actionable.
 The three in-scope services today are **Mnemosyne**, **Pallas** (running as Kottos/Mentor/Iolaus), and **Daedalus**. The same patterns generalise to any future service that deploys on a `docker`-enabled host or under `systemd+journald`.
 ## Label schema
 Every Loki log stream carries these labels, and nothing else:
 | Label | Example values | Source |
 |---|---|---|
 | `service` | `mnemosyne`, `pallas`, `daedalus`, `athena`, `kairos`, `angelia` | Docker compose project name (container logs) **or** explicit systemd relabel rule (journal logs) |
 | `component` | `app`, `mcp`, `worker`, `nginx`, `harper`, `scotty`, `research`, `tech_research` | Docker compose service name **or** per-agent `ContextVar` (Pallas) |
 | `project` | `kottos` (Pallas only) | `agents.yaml` `name:` field read by `pallas.log.set_project()` |
 | `hostname` | `puck.incus`, `caliban.incus` | Alloy's `inventory_hostname` template var |
 | `environment` | `ouranos`, `virgo`, `taurus` | `deployment_environment` from Ansible group_vars |
 **Everything else is a JSON field in the log body**, not a label. That includes `level`, `logger`, `funcName`, `lineno`, `message`, `request_id`, `workspace_id`, `agent`, `tool`, `duration_ms`, and any `extra={...}` kwargs the application passed in. LogQL's `| json` pipeline parses these on-query — keeping them out of the label index is what keeps Loki fast.
 ## Level policy
 Same rules for every service. Health-check `200 OK`s live in DEBUG, never in INFO.
 | Level | Meaning |
 |---|---|
 | `ERROR` | Broken; requires human attention. |
 | `WARNING` | Degraded but self-recovering — retries, skipped items, missing optional config. |
 | `INFO` | Lifecycle events and failures. Start, ready, shutdown, preflight, LLM provider validation. 200 OKs on health endpoints are **not** INFO. |
 | `DEBUG` | Per-request detail, successful health probes, verbose traces. Enable on demand when troubleshooting. |
 Mnemosyne enforces this with `mnemosyne.log_filters.SuppressHealthAccessFilter` on Django/gunicorn access loggers; Pallas with `_HealthAccessFilter` on `uvicorn.access`; Daedalus with the equivalent filter in `daedalus.logging`.
 ## Two transports, one Alloy
 Alloy on each host uses exactly two sources for application logs. Pick whichever matches the service's runtime model — **don't** invent a third.
 ### 1. Docker socket (for compose projects)
 `discovery.docker` enumerates every running container, and `loki.source.docker` tails their stdout via the `json-file` driver. Compose project → `service` label, compose service → `component` label. One block covers every compose project on the host, current and future.
 **Requirements on the service side:**
 - Emit JSON lines to **stdout**, one per log record. Mnemosyne uses `python-json-logger`; Daedalus uses `structlog`; any Python service can do the same.
 - Pin the logging driver to `json-file` with bounded rotation in `docker-compose.yaml`:
  ```yaml
  x-logging: &default-logging
    driver: json-file
    options:
      tag: "{{.Name}}"
      max-size: "10m"
      max-file: "5"
  services:
    app:
      # ...
      logging: *default-logging
  ```
  `json-file` is Docker's default, but pinning it defensively guarantees Alloy sees the same driver on every host.
 - On the Alloy host, the `alloy` user must be in the `docker` group to read `/var/run/docker.sock`. The `ouranos/ansible/alloy/` role handles this.
 ### 2. Systemd journal (for systemd-managed units)
 `loki.source.journal` tails journald. A `loki.relabel "journal_<host>"` block translates `__journal_syslog_identifier` → `service` / `project` labels so Pallas-managed agents land alongside Docker-based services with the same schema.
 **Requirements on the service side:**
 - Emit JSON to **stdout** (journald captures it with `PRIORITY=6` INFO by default).
 - The systemd unit must set a distinctive `SyslogIdentifier=` — the Alloy relabel block keys off this.
 - Under Pallas, set `PALLAS_LOG_STDOUT=1` in the unit's `EnvironmentFile`. Also set `PALLAS_LOG_FILE=/dev/null` to disable the rotating file sink (journald is already durable).
 Example, from `ouranos/ansible/kottos/kottos.service.j2`:
 ```ini
 [Service]
 ...
 EnvironmentFile=/srv/kottos/.env
 ExecStart=/srv/kottos/.venv/bin/pallas
 StandardOutput=journal
 StandardError=journal
 SyslogIdentifier=kottos
 ```
 And the matching Alloy relabel rule on puck:
 ```alloy
 loki.relabel "journal_puck" {
  forward_to = []
  rule {
    source_labels = ["__journal_syslog_identifier"]
    regex         = "kottos"
    target_label  = "service"
    replacement   = "pallas"
  }
  rule {
    source_labels = ["__journal_syslog_identifier"]
    regex         = "kottos"
    target_label  = "project"
    replacement   = "kottos"
  }
  // ...
 }
 ```
 ## Per-service reference
 ### Mnemosyne (Docker compose on puck)
 - Logging config: `mnemosyne/mnemosyne/mnemosyne/settings.py` → `LOGGING` dict using `pythonjsonlogger.json.JsonFormatter`.
 - Component attribution: `MNEMOSYNE_COMPONENT` env var set per docker-compose service (`init`, `app`, `mcp`, `worker`). The settings module reads it into `static_fields.component`.
 - Health-filter: `mnemosyne.log_filters.SuppressHealthAccessFilter` on the `access` handler.
 - Metrics: `/metrics` on the nginx container (port 23181) — served by django-prometheus on the app container plus `mcp_server.metrics` (shared `prometheus_client` registry).
 - Scrape job: `mnemosyne` (see `ouranos/ansible/pplg/prometheus.yml.j2`).
 - Alerts: `mnemosyne_alerts` group in `ouranos/ansible/pplg/alert_rules.yml.j2`.
 ### Pallas — Kottos (systemd on puck via Ansible role `ouranos/ansible/kottos/`)
 - Logging config: `pallas/pallas/log.py` → `setup_logging()` with `PALLAS_LOG_STDOUT=1`.
 - Component attribution: `pallas.log.set_agent_component(name)` is called by `_start_agent()` inside each agent's asyncio task, setting a `contextvars.ContextVar` that the `_StaticFieldsFilter` reads per record. Each agent (harper, scotty, research, tech_research) carries its own value without leaking across tasks.
 - Project attribution: `pallas.log.set_project(deploy_name)` is called once in `main()` from `agents.yaml`'s `name:`. For Kottos this renders as `project="kottos"` on every record.
 - Deployed by: `ansible-playbook kottos/deploy.yml` (wired into `site.yml`).
 - Metrics: none today — Pallas is observed through logs only. Future phase will add a `prometheus_client` endpoint on the registry port for `pallas_agent_requests_total{agent=…}`, `pallas_downstream_mcp_errors_total{server=…}`.
 ### Daedalus (Docker compose on puck)
 - Logging config: `daedalus/backend/daedalus/logging.py` — `structlog` JSON processor chain, already production-ready.
 - Component attribution: `structlog.contextvars.bind_contextvars(service="daedalus", component="api")` at app startup.
 - Health-filter: `_SuppressHealthAccessFilter` on uvicorn's access logger.
 - Metrics: `/metrics` on the api container (port 22181).
 - Scrape job: `daedalus`.
 - Alerts: `daedalus_alerts` group.
 ## Useful LogQL queries
 Once the pipeline is live, the "troubleshooting is a nightmare" problem becomes three-click queries in Grafana Explore:
 ```logql
 # All Mnemosyne errors in the last 15m
 {service="mnemosyne"} | json | level="ERROR"
 # Everything Harper did in the last hour
 {service="pallas", project="kottos"} | json | component="harper"
 # The infamous pallas.forward.trace stream (MCP transport failures)
 {service="pallas", project="kottos"} |= "pallas.forward.trace"
 # Cross-service trace of a single request (requires X-Request-Id propagation
 # — not yet implemented; Phase 1.5 nice-to-have)
 {environment="ouranos"} | json | request_id="<paste-id>"
 # ERROR-level log rate in Daedalus by path (a log-side proxy for 5xx spikes)
 sum by (path) (rate({service="daedalus"} | json | level="ERROR" [5m]))
 ```
 The **Puck Services — Logs & Health** dashboard in Grafana (`/etc/grafana/provisioning/dashboards/puck.yaml` → `/var/lib/grafana/dashboards/puck_services.json`) has these pre-wired as panels per service row.
 ## Adding a new service
 If you're adding a service to puck (or any Ouranos/Virgo host with this stack):
 1. **Emit JSON to stdout** with `service`/`component` as static fields. Copy Mnemosyne's settings pattern or Pallas's `_StaticFieldsFilter`.
 2. **Pick a transport:**
   - Docker compose → add the `x-logging: &default-logging` anchor + `logging: *default-logging` on each service. Done. No Alloy changes needed.
   - systemd → set `SyslogIdentifier=<name>` on the unit and add a two-rule relabel block to the host's `loki.relabel "journal_<host>"` block.
 3. **Expose `/metrics`** if the service is in Python — `prometheus_client` plus either `django-prometheus` or `prometheus_fastapi_instrumentator`.
 4. **Add a scrape job** in `ouranos/ansible/pplg/prometheus.yml.j2` (parametrise the target — `{{ <service>_metrics_host }}:{{ <service>_metrics_port }}`) and wire the defaults into the host's `host_vars`.
 5. **Add alerts** in `ouranos/ansible/pplg/alert_rules.yml.j2`. At minimum: `Down`, `HighErrorRate`. Use the metric names the service actually exposes — no dead rules.
 6. **Optional**: add panels to the Puck Services dashboard JSON.
 No new transport. No per-service Alloy block. No custom log format.