diff --git a/ansible/alloy/puck/config.alloy.j2 b/ansible/alloy/puck/config.alloy.j2 index ae941f1..ab77558 100644 --- a/ansible/alloy/puck/config.alloy.j2 +++ b/ansible/alloy/puck/config.alloy.j2 @@ -18,11 +18,61 @@ loki.source.file "system_logs" { forward_to = [loki.write.default.receiver] } +// Journal relabel rules — tag Pallas-managed units (kottos now, mentor / +// iolaus later) with the same {service, project, component} schema used +// by Mnemosyne and Daedalus. Rules are applied top-to-bottom and every +// matching rule runs — there is no first-match stop — so a later rule can +// overwrite an earlier target_label. For a new Pallas host/project, copy one of +// the blocks below and adjust SyslogIdentifier + project. +loki.relabel "journal_puck" { + forward_to = [] + + // Expose the systemd unit as an auxiliary label for debugging. + rule { + source_labels = ["__journal__systemd_unit"] + target_label = "unit" + } + + // Kottos — Pallas FastAgent runtime for the engineering agent project. + // SyslogIdentifier=kottos is set in ouranos/ansible/kottos/kottos.service.j2. + rule { + source_labels = ["__journal_syslog_identifier"] + regex = "kottos" + target_label = "service" + replacement = "pallas" + } + rule { + source_labels = ["__journal_syslog_identifier"] + regex = "kottos" + target_label = "project" + replacement = "kottos" + } + + // Alloy itself — useful to separate from the "systemd" bucket when the + // shipping pipeline misbehaves. + rule { + source_labels = ["__journal__systemd_unit"] + regex = "alloy\\.service" + target_label = "service" + replacement = "alloy" + } + + // Backwards compatibility — every entry with a non-empty systemd unit + // gets job="systemd", so dashboards filtering on ``job="systemd"`` keep + // working. NOTE(review): no rule sets service="systemd"; add one if that fallback is wanted.
+ rule { + source_labels = ["__journal__systemd_unit"] + regex = ".+" + target_label = "job" + replacement = "systemd" + } +} + loki.source.journal "systemd_logs" { - forward_to = [loki.write.default.receiver] + forward_to = [loki.write.default.receiver] + relabel_rules = loki.relabel.journal_puck.rules labels = { - job = "systemd", - hostname = "{{inventory_hostname}}", + hostname = "{{inventory_hostname}}", environment = "{{deployment_environment}}", } } @@ -69,19 +119,11 @@ loki.source.syslog "kairos_logs" { forward_to = [loki.write.default.receiver] } -loki.source.syslog "menosyne_logs" { - listener { - address = "127.0.0.1:{{mnemosyne_syslog_port}}" - protocol = "tcp" - syslog_format = "{{ syslog_format }}" - labels = { - job = "menosyne", - hostname = "{{inventory_hostname}}", - environment = "{{deployment_environment}}", - } - } - forward_to = [loki.write.default.receiver] -} +// Mnemosyne used to ship via syslog on mnemosyne_syslog_port; it now +// logs line-delimited JSON to container stdout and is picked up by the +// docker-socket block below. The host_var is retained as a reserved port +// number but no listener binds to it — remove the var from the inventory +// when the rollout is verified. loki.source.syslog "spelunker_logs" { listener { @@ -111,18 +153,65 @@ loki.source.syslog "jupyterlab_logs" { forward_to = [loki.write.default.receiver] } -loki.source.syslog "daedalus_logs" { - listener { - address = "127.0.0.1:{{daedalus_syslog_port}}" - protocol = "tcp" - syslog_format = "{{ syslog_format }}" - labels = { - job = "daedalus", - hostname = "{{inventory_hostname}}", - environment = "{{deployment_environment}}", - } +// Daedalus also used to ship via syslog on daedalus_syslog_port; it +// already emits structlog JSON to stdout, so the docker-socket block +// below now handles it. Host_var kept for the same transitional reason +// as mnemosyne above.
+ +// ---------------------------------------------------------------------------- +// Docker socket — any compose project on this host lands in Loki with +// `service` = compose project (e.g. "mnemosyne", "daedalus", "kairos") and +// `component` = compose service (e.g. "app", "mcp", "worker", "nginx"). +// This replaces per-service syslog listeners — one block covers every +// compose project, current and future. +// +// Requires: the Alloy process to have read access to /var/run/docker.sock +// (Ansible role should add the alloy user to the `docker` group). No Docker +// daemon changes required — we scrape the json-file driver, which is Docker's +// default and is pinned in each compose project's x-logging anchor. +// ---------------------------------------------------------------------------- +discovery.docker "containers" { + host = "unix:///var/run/docker.sock" + refresh_interval = "30s" +} + +discovery.relabel "containers" { + targets = discovery.docker.containers.targets + + // Compose project → service label + rule { + source_labels = ["__meta_docker_container_label_com_docker_compose_project"] + target_label = "service" } + // Compose service → component label + rule { + source_labels = ["__meta_docker_container_label_com_docker_compose_service"] + target_label = "component" + } + // Container name (for one-off / non-compose containers) + rule { + source_labels = ["__meta_docker_container_name"] + regex = "/(.*)" + target_label = "container" + } + // Fall back to the container name as `service` when compose labels are + // absent (e.g. 
a `docker run ...` container outside any compose project) + rule { + source_labels = ["service", "container"] + separator = "@" + regex = "@(.+)" + target_label = "service" + } +} + +loki.source.docker "containers" { + host = "unix:///var/run/docker.sock" + targets = discovery.relabel.containers.output forward_to = [loki.write.default.receiver] + labels = { + hostname = "{{inventory_hostname}}", + environment = "{{deployment_environment}}", + } } loki.write "default" { diff --git a/ansible/inventory/group_vars/all/vars.yml b/ansible/inventory/group_vars/all/vars.yml index 2298466..f56bdbb 100644 --- a/ansible/inventory/group_vars/all/vars.yml +++ b/ansible/inventory/group_vars/all/vars.yml @@ -34,6 +34,7 @@ spelunker_rel: main mcp_switchboard_rel: main kernos_rel: main rommie_rel: main +kottos_rel: main # PyPI release version (no 'v' prefix) - https://pypi.org/project/open-webui/ freecad_mcp_version: 0.6.1 openwebui_rel: 0.8.3 diff --git a/ansible/inventory/group_vars/all/vault.yml.example b/ansible/inventory/group_vars/all/vault.yml.example index da53293..1e3afa8 100644 --- a/ansible/inventory/group_vars/all/vault.yml.example +++ b/ansible/inventory/group_vars/all/vault.yml.example @@ -99,3 +99,25 @@ vault_ntth_token_1_app_secret: changeme vault_ntth_token_2_app_secret: changeme vault_ntth_token_3_app_secret: changeme vault_ntth_token_4_app_secret: changeme + +# Kottos (Pallas FastAgent runtime on puck) +# vault_kottos_openai_api_key — API key for the OpenAI-compatible LLM +# endpoint (nyx Qwen in Ouranos, varies +# per environment). Set to any string +# if the endpoint doesn't validate. +# vault_kottos_github_pat — GitHub personal access token passed +# into the github MCP Docker container +# via GITHUB_PERSONAL_ACCESS_TOKEN env. +# vault_kottos_angelia_bearer — Bearer token for the Angelia MCP +# server (accepts the outgoing auth). 
+# vault_kottos_mnemosyne_jwt — Long-lived team JWT minted in the +# Daedalus admin UI → Settings → +# Pallas Instances → kottos row → +# "Reveal" or "Rotate". Mnemosyne +# validates this on every search_memory +# call and scopes results to the +# workspaces attached to this team. +vault_kottos_openai_api_key: changeme +vault_kottos_github_pat: changeme +vault_kottos_angelia_bearer: changeme +vault_kottos_mnemosyne_jwt: changeme diff --git a/ansible/inventory/host_vars/puck.incus.yml b/ansible/inventory/host_vars/puck.incus.yml index 9cbbcce..ddee54e 100644 --- a/ansible/inventory/host_vars/puck.incus.yml +++ b/ansible/inventory/host_vars/puck.incus.yml @@ -7,6 +7,7 @@ services: - docker - gitea_runner - athena + - kottos # Gitea Runner gitea_runner_name: "puck-runner" @@ -14,14 +15,86 @@ gitea_runner_name: "puck-runner" # Alloy alloy_log_level: "warn" angelia_syslog_port: 51422 +# mnemosyne_syslog_port retained for inventory-compatibility while the +# Alloy Docker-socket discovery block rolls out; no listener binds to it +# any more. Delete once the docker-socket pipeline is proven in prod. mnemosyne_syslog_port: 51431 athena_syslog_port: 51424 kairos_syslog_port: 51425 icarlos_syslog_port: 51426 spelunker_syslog_port: 51428 jupyterlab_syslog_port: 51411 +# daedalus_syslog_port retained for the same reason as mnemosyne above. daedalus_syslog_port: 51430 +# ============================================================================= +# PPLG scrape targets on puck +# ============================================================================= +# Consumed by ``ansible/pplg/prometheus.yml.j2`` on Prospero. Defining them +# here keeps the scrape config fully parametric so the same playbook runs +# unchanged against Ouranos / Virgo / Taurus — each environment sets its +# own puck-equivalent host in its host_vars. 
+ +# Daedalus (FastAPI on puck, behind nginx) +daedalus_metrics_host: "puck.incus" +daedalus_metrics_port: 22181 + +# Mnemosyne — /metrics is served by nginx (mnemosyne-web:23181) and +# proxied to the Django app container, which owns the single +# prometheus_client process registry that both django-prometheus +# (HTTP / Celery) and the MCP server's tool-call counters write to. +mnemosyne_metrics_host: "puck.incus" +mnemosyne_metrics_port: 23181 + +# ============================================================================= +# Kottos Configuration (Pallas FastAgent runtime) +# ============================================================================= +# Engineering agents (Harper, Scotty, Research, Tech Research) running as a +# single systemd-managed ``pallas`` process. Logs land in journald via +# SyslogIdentifier=kottos, then Alloy's journal relabel block tags them as +# {service="pallas", project="kottos"} for Loki. +kottos_user: kottos +kottos_group: kottos +kottos_directory: /srv/kottos +kottos_host: "puck.incus" +kottos_namespace: "ca.helu.kottos" + +# Ports — registry at 24100, agents 24101–24149, sub-agents 24150–24199 +kottos_registry_port: 24100 +kottos_harper_port: 24101 +kottos_scotty_port: 24102 +kottos_research_port: 24150 +kottos_tech_research_port: 24151 + +# Log level — INFO surfaces lifecycle + failures, DEBUG adds per-request +# detail and successful health probe lines. Ouranos Lab convention: +# health-check 200 OKs live in DEBUG, never in INFO. +pallas_log_level: INFO + +# fast-agent's own logger — keep at INFO in prod, bump to DEBUG alongside +# pallas_log_level when chasing MCP transport issues. +kottos_fastagent_log_level: info + +# LLM provider — the same OpenAI-compatible Qwen endpoint Kottos uses today. 
+kottos_default_model: "openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf" +kottos_openai_base_url: "http://nyx.helu.ca:22079/v1" +kottos_model_vision: true +kottos_model_context_window: 192000 +kottos_model_max_output_tokens: 16384 +kottos_timezone: "America/Toronto" + +# Downstream MCP server URLs — each parametric so Virgo / Taurus override +# them in their own host_vars without touching the templates. +kottos_argos_url: "http://miranda.incus:25534/mcp" +kottos_neo4j_cypher_url: "http://circe.helu.ca:22034/mcp" +kottos_kernos_scotty_url: "http://caliban.incus:22062/mcp" +kottos_rommie_url: "http://caliban.incus:20361/mcp" +kottos_gitea_url: "http://miranda.incus:25535/mcp" +kottos_grafana_url: "http://miranda.incus:25533/mcp" +kottos_kernos_harper_url: "http://korax.helu.ca:20261/mcp" +kottos_angelia_url: "https://ouranos.helu.ca/mcp/" +kottos_mnemosyne_url: "https://mnemosyne.ouranos.helu.ca/mcp/" + # ============================================================================= # Athena Configuration # ============================================================================= diff --git a/ansible/kottos/.env.j2 b/ansible/kottos/.env.j2 new file mode 100644 index 0000000..04e64c4 --- /dev/null +++ b/ansible/kottos/.env.j2 @@ -0,0 +1,24 @@ +# Kottos runtime environment — rendered by Ansible from inventory host_vars. +# ------------------------------------------------------------------------ +# Loaded by systemd (EnvironmentFile=) and inherited by the pallas process. +# ``.env`` vars NOT set here come from pallas.server's defaults — tweak by +# adding the variable to host_vars and this template, not by editing the +# rendered file on the host. + +# ── Logging ───────────────────────────────────────────────────────────────── +# Stdout JSON is the preferred sink for systemd+journald+Alloy deployments. +# Rotating file sink is disabled by pointing PALLAS_LOG_FILE at /dev/null so +# we don't write every record twice. 
+PALLAS_LOG_STDOUT=1 +PALLAS_LOG_FILE=/dev/null +PALLAS_LOG_LEVEL={{ pallas_log_level | default('INFO') }} + +# ── Config location ───────────────────────────────────────────────────────── +# PALLAS_AGENTS_CONFIG can be overridden to point at a non-default topology +# (e.g. staging scenarios). Default: agents.yaml next to the working dir. +PALLAS_AGENTS_CONFIG={{ kottos_directory }}/agents.yaml + +# ── LLM provider / MCP server secrets ─────────────────────────────────────── +# Secrets are rendered into fastagent.secrets.yaml rather than env vars so +# fast-agent's existing YAML-merge logic applies. This block stays empty +# intentionally — the template exists for future per-host tunables. diff --git a/ansible/kottos/agents.yaml.j2 b/ansible/kottos/agents.yaml.j2 new file mode 100644 index 0000000..af068ab --- /dev/null +++ b/ansible/kottos/agents.yaml.j2 @@ -0,0 +1,43 @@ +# Kottos — Deployment Configuration (rendered by Ansible) +# ------------------------------------------------------------------ +# Single source of truth for agent topology, ports, and registry +# metadata. Read by Pallas at startup. The kottos/agents.yaml +# committed in the kottos repo is the local-dev equivalent; Ansible +# overwrites it with this rendered version. +# +# Host + namespace + registry port come from inventory host_vars so +# Ouranos / Virgo / Taurus each get their own shape without template +# edits. 
+ +name: kottos +version: "1.0.0" +host: {{ kottos_agents_host | default(kottos_host) | default(inventory_hostname) }} +namespace: {{ kottos_namespace | default('ca.helu.kottos') }} +registry_port: {{ kottos_registry_port | default(24100) }} + +agents: + harper: + module: agents.harper + port: {{ kottos_harper_port | default(24101) }} + title: Harper + description: "Scrappy engineer — rapid prototyping, hacking, and creative problem-solving" + depends_on: [research, tech_research] + + scotty: + module: agents.scotty + port: {{ kottos_scotty_port | default(24102) }} + title: Scotty + description: "Systems administration expert — infrastructure diagnostics, security hardening, and keeping everything running" + depends_on: [tech_research] + + research: + module: agents.research + port: {{ kottos_research_port | default(24150) }} + title: Research Agent + description: "Web search via Argos and knowledge graph via Neo4j" + + tech_research: + module: agents.tech_research + port: {{ kottos_tech_research_port | default(24151) }} + title: Tech Research + description: "Technical investigation — library comparisons, API docs, framework patterns, code examples" diff --git a/ansible/kottos/deploy.yml b/ansible/kottos/deploy.yml new file mode 100644 index 0000000..ee6146a --- /dev/null +++ b/ansible/kottos/deploy.yml @@ -0,0 +1,192 @@ +--- +- name: Deploy Kottos (Pallas FastAgent runtime) + hosts: ubuntu + vars: + ansible_common_remote_group: "{{ kottos_group | default([]) }}" + allow_world_readable_tmpfiles: true + + tasks: + - name: Check if host has kottos service + ansible.builtin.set_fact: + has_kottos_service: "{{ 'kottos' in services | default([]) }}" + + - name: Skip hosts without kottos service + ansible.builtin.meta: end_host + when: not has_kottos_service + + - name: Create Kottos group + become: true + ansible.builtin.group: + name: "{{ kottos_group }}" + state: present + + - name: Create kottos user + become: true + ansible.builtin.user: + name: "{{ kottos_user }}" 
+ group: "{{ kottos_group }}" + home: "/home/{{ kottos_user }}" + shell: /bin/bash + system: false + create_home: true + + - name: Add keeper_user to kottos group (optional — enables passwordless tailing) + become: true + ansible.builtin.user: + name: "{{ keeper_user }}" + groups: "{{ kottos_group }}" + append: true + when: keeper_user is defined + + - name: Reset connection to pick up new group membership + ansible.builtin.meta: reset_connection + + - name: Create Kottos install directory + become: true + ansible.builtin.file: + path: "{{ kottos_directory }}" + owner: "{{ kottos_user }}" + group: "{{ kottos_group }}" + state: directory + mode: '0750' + + - name: Ensure base packages for Python + Docker MCP workflows + become: true + ansible.builtin.apt: + name: + - tar + - python3 + - python3-venv + - python3-dev + - git + state: present + update_cache: true + + - name: Transfer and unarchive Kottos release + become: true + ansible.builtin.unarchive: + src: "~/rel/kottos_{{ kottos_rel }}.tar" + dest: "{{ kottos_directory }}" + owner: "{{ kottos_user }}" + group: "{{ kottos_group }}" + mode: '0550' + notify: restart kottos + + - name: Ensure .venv directory ownership is correct + become: true + ansible.builtin.file: + path: "{{ kottos_directory }}/.venv" + owner: "{{ kottos_user }}" + group: "{{ kottos_group }}" + state: directory + recurse: true + # Runs on every deploy — recurse re-applies owner/group to everything under .venv. + + - name: Create virtual environment for Kottos + become: true + become_user: "{{ kottos_user }}" + ansible.builtin.command: + cmd: "python3 -m venv {{ kottos_directory }}/.venv/" + creates: "{{ kottos_directory }}/.venv/bin/activate" + + - name: Install wheel in the virtualenv + become: true + become_user: "{{ kottos_user }}" +
ansible.builtin.pip: + chdir: "{{ kottos_directory }}/kottos" + name: . + virtualenv: "{{ kottos_directory }}/.venv" + virtualenv_command: python3 -m venv + notify: restart kottos + + - name: Template agents.yaml + become: true + ansible.builtin.template: + src: agents.yaml.j2 + dest: "{{ kottos_directory }}/agents.yaml" + owner: "{{ kottos_user }}" + group: "{{ kottos_group }}" + mode: '0640' + notify: restart kottos + + - name: Template fastagent.config.yaml + become: true + ansible.builtin.template: + src: fastagent.config.yaml.j2 + dest: "{{ kottos_directory }}/fastagent.config.yaml" + owner: "{{ kottos_user }}" + group: "{{ kottos_group }}" + mode: '0640' + notify: restart kottos + + - name: Template fastagent.secrets.yaml (vault-rendered) + become: true + ansible.builtin.template: + src: fastagent.secrets.yaml.j2 + dest: "{{ kottos_directory }}/fastagent.secrets.yaml" + owner: "{{ kottos_user }}" + group: "{{ kottos_group }}" + mode: '0600' + notify: restart kottos + no_log: true + + - name: Template runtime .env (PALLAS_LOG_STDOUT etc.) + become: true + ansible.builtin.template: + src: .env.j2 + dest: "{{ kottos_directory }}/.env" + owner: "{{ kottos_user }}" + group: "{{ kottos_group }}" + mode: '0640' + notify: restart kottos + + - name: Template systemd unit + become: true + ansible.builtin.template: + src: kottos.service.j2 + dest: /etc/systemd/system/kottos.service + owner: root + group: root + mode: '0644' + notify: restart kottos + + - name: Enable and start kottos service + become: true + ansible.builtin.systemd: + name: kottos + enabled: true + state: started + daemon_reload: true + + - name: Flush handlers before validation probes + ansible.builtin.meta: flush_handlers + + # ── Validation ────────────────────────────────────────────────────────── + # Registry is the only endpoint that responds with a deterministic JSON + # payload without requiring an MCP session, so we probe it. 
Agent ports + # are exercised by Daedalus's health-poll loop once registered. + - name: Validate Kottos registry responds + ansible.builtin.uri: + url: "http://localhost:{{ kottos_registry_port | default(24100) }}/.well-known/mcp/server.json" + status_code: 200 + return_content: true + register: registry_check + retries: 10 + delay: 3 + until: registry_check.status == 200 + + handlers: + - name: restart kottos + become: true + ansible.builtin.systemd: + name: kottos + state: restarted diff --git a/ansible/kottos/fastagent.config.yaml.j2 b/ansible/kottos/fastagent.config.yaml.j2 new file mode 100644 index 0000000..9fed8d0 --- /dev/null +++ b/ansible/kottos/fastagent.config.yaml.j2 @@ -0,0 +1,114 @@ +# Kottos — fast-agent configuration (rendered by Ansible) +# ------------------------------------------------------------------ +# Committed-to-kottos copy is the local-dev equivalent; Ansible overwrites +# it with this rendered file on deploy. MCP server URLs are parametrised +# so the same template renders correctly for Ouranos (.incus) and Virgo +# (.virgo / .taurus) — each environment's host_vars supplies the base URLs. + +default_model: {{ kottos_default_model | default('openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf') }} + +# ── Model Capabilities ────────────────────────────────────────────────────── +# Declares capabilities for models not in fast-agent's ModelDatabase. +# vision: true adds image/jpeg, image/png, image/webp to the tokenizer list. 
+model_capabilities: + vision: {{ kottos_model_vision | default(true) | string | lower }} + context_window: {{ kottos_model_context_window | default(192000) }} + max_output_tokens: {{ kottos_model_max_output_tokens | default(16384) }} + +# ── LLM Providers ─────────────────────────────────────────────────────────── +openai: + base_url: {{ kottos_openai_base_url | default('http://nyx.helu.ca:22079/v1') }} + +mcp: + servers: + # ── Web search via SearXNG (argos) ─────────────────────────────────────── + argos: + transport: http + url: "{{ kottos_argos_url | default('http://miranda.incus:25534/mcp') }}" + + # ── Knowledge graph — Neo4j ────────────────────────────────────────────── + neo4j_cypher: + transport: http + url: "{{ kottos_neo4j_cypher_url | default('http://circe.helu.ca:22034/mcp') }}" + + # ── Shell + file operations — Kernos (Caliban) ─────────────────────────── + kernos_scotty: + transport: http + url: "{{ kottos_kernos_scotty_url | default('http://caliban.incus:22062/mcp') }}" + load_on_start: false + + # ── Agent S computer automation — Rommie on Caliban ────────────────────── + rommie: + transport: http + url: "{{ kottos_rommie_url | default('http://caliban.incus:20361/mcp') }}" + load_on_start: false + + # ── Git repository management — Gitea MCP ──────────────────────────────── + gitea: + transport: http + url: "{{ kottos_gitea_url | default('http://miranda.incus:25535/mcp') }}" + + # ── Grafana observability ─────────────────────────────────────────────── + grafana: + transport: http + url: "{{ kottos_grafana_url | default('http://miranda.incus:25533/mcp') }}" + + # ── Shell + file operations — Kernos (Korax) ───────────────────────────── + kernos_harper: + transport: http + url: "{{ kottos_kernos_harper_url | default('http://korax.helu.ca:20261/mcp') }}" + load_on_start: false + + # ── Angelia messaging ─────────────────────────────────────────────────── + # Auth header provided by fastagent.secrets.yaml (vault-rendered). 
+ angelia: + transport: http + url: "{{ kottos_angelia_url | default('https://ouranos.helu.ca/mcp/') }}" + + # ── GitHub MCP Server (local Docker, stdio) ────────────────────────────── + # GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml + github: + command: "docker" + args: + - "run" + - "-i" + - "--rm" + - "-e" + - "GITHUB_PERSONAL_ACCESS_TOKEN" + - "ghcr.io/github/github-mcp-server" + + # ── Library/framework documentation — Context7 (local stdio) ───────────── + context7: + command: "npx" + args: ["-y", "@upstash/context7-mcp"] + + # ── Current time and timezone (local stdio) ────────────────────────────── + time: + command: "mcp-server-time" + args: ["--local-timezone={{ kottos_timezone | default('America/Toronto') }}"] + + # ── Mnemosyne knowledge search — workspace-scoped ──────────────────────── + # Auth is a long-lived team JWT supplied by fastagent.secrets.yaml + # (forward_inbound_auth=false — Mnemosyne validates the team JWT). + mnemosyne: + transport: http + url: "{{ kottos_mnemosyne_url | default('https://mnemosyne.ouranos.helu.ca/mcp/') }}" + + # ── Kottos internal sub-agents ─────────────────────────────────────────── + # These stay on localhost regardless of environment — Pallas serves the + # sub-agents on the same host as the top-level agents. 
+ research: + transport: http + url: "http://localhost:{{ kottos_research_port | default(24150) }}/mcp" + + tech_research: + transport: http + url: "http://localhost:{{ kottos_tech_research_port | default(24151) }}/mcp" + +logger: + type: none + level: {{ kottos_fastagent_log_level | default('info') }} + progress_display: false + show_chat: false + show_tools: false + truncate_tools: true diff --git a/ansible/kottos/fastagent.secrets.yaml.j2 b/ansible/kottos/fastagent.secrets.yaml.j2 new file mode 100644 index 0000000..525e795 --- /dev/null +++ b/ansible/kottos/fastagent.secrets.yaml.j2 @@ -0,0 +1,27 @@ +# Kottos — fast-agent secrets (rendered by Ansible from the vault) +# ------------------------------------------------------------------ +# Never commit the rendered file. Each value here pulls from a vault +# variable — if a vault variable is missing, Ansible will fail the +# template step with a clear error before the file is written. +# +# Same structure as fastagent.config.yaml; values merge with secrets +# taking precedence (fast-agent deep-merges the two). + +openai: + api_key: "{{ vault_kottos_openai_api_key }}" + +mcp: + servers: + github: + env: + GITHUB_PERSONAL_ACCESS_TOKEN: "{{ vault_kottos_github_pat }}" + + angelia: + headers: + Authorization: "Bearer {{ vault_kottos_angelia_bearer }}" + + # Long-lived team JWT minted in Daedalus admin UI. + # See kottos/README.md § "Mnemosyne memory" for the rotation procedure. 
+ mnemosyne: + headers: + Authorization: "Bearer {{ vault_kottos_mnemosyne_jwt }}" diff --git a/ansible/kottos/kottos.service.j2 b/ansible/kottos/kottos.service.j2 new file mode 100644 index 0000000..39f72e9 --- /dev/null +++ b/ansible/kottos/kottos.service.j2 @@ -0,0 +1,34 @@ +[Unit] +Description=Kottos — Pallas FastAgent runtime ({{ kottos_host | default(inventory_hostname) }}) +# Wants= only pulls network-online.target in; After= is what orders this unit behind it. +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User={{ kottos_user }} +Group={{ kottos_group }} +WorkingDirectory={{ kottos_directory }} +EnvironmentFile={{ kottos_directory }}/.env +ExecStart={{ kottos_directory }}/.venv/bin/pallas +Restart=always +RestartSec=5 + +# Journal is the durable sink (Alloy picks up via loki.source.journal and +# relabels SyslogIdentifier=kottos into {service="pallas", project="kottos"} +# for Loki). Stdout from pallas is already JSON thanks to +# PALLAS_LOG_STDOUT=1 set in the .env file. +StandardOutput=journal +StandardError=journal +SyslogIdentifier=kottos + +# Pallas needs to reach localhost sibling agents + upstream MCP servers +# and read its own .venv / agents.yaml / config files. No hardening flags +# that would block those paths.
+NoNewPrivileges=false +ProtectSystem=false +ProtectHome=false +PrivateTmp=false + +[Install] +WantedBy=multi-user.target diff --git a/ansible/kottos/stage.yml b/ansible/kottos/stage.yml new file mode 100644 index 0000000..fd8af3e --- /dev/null +++ b/ansible/kottos/stage.yml @@ -0,0 +1,48 @@ +- name: Stage Kottos release tarball + hosts: localhost + gather_facts: false + vars: + archive_path: "{{rel_dir}}/kottos_{{kottos_rel}}.tar" + kottos_repo_url: "ssh://git@git.helu.ca:22022/r/kottos.git" + kottos_repo_dir: "{{repo_dir}}/kottos" + + tasks: + - name: Ensure release directory exists + file: + path: "{{rel_dir}}" + state: directory + mode: '755' + + - name: Ensure repo directory exists + file: + path: "{{repo_dir}}" + state: directory + mode: '755' + + - name: Clone Kottos repository if not present + ansible.builtin.git: + repo: "{{kottos_repo_url}}" + dest: "{{kottos_repo_dir}}" + version: "{{kottos_rel}}" + accept_hostkey: true + register: git_clone + ignore_errors: true + + - name: Fetch latest changes if already cloned + ansible.builtin.git: + repo: "{{kottos_repo_url}}" + dest: "{{kottos_repo_dir}}" + version: "{{kottos_rel}}" + update: true + force: true + + - name: Create release archive + ansible.builtin.archive: + path: "{{kottos_repo_dir}}" + dest: "{{archive_path}}" + format: tar + exclude_path: + - "{{kottos_repo_dir}}/.git" + - "{{kottos_repo_dir}}/.venv" + - "{{kottos_repo_dir}}/__pycache__" + - "{{kottos_repo_dir}}/fastagent.secrets.yaml" diff --git a/ansible/pplg/alert_rules.yml.j2 b/ansible/pplg/alert_rules.yml.j2 index 521ef9d..82716d0 100644 --- a/ansible/pplg/alert_rules.yml.j2 +++ b/ansible/pplg/alert_rules.yml.j2 @@ -312,6 +312,78 @@ groups: summary: "Daedalus S3 error rate above 1%" description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes." 
+ # ============================================================================ + # Mnemosyne Application Alerts + # ============================================================================ + # One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint. + # The Django app container hosts the single prometheus_client registry that + # both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool + # call counters) write to, so "MCP is broken" signals show up as + # ``mcp_tool_invocations_total{status="error"}`` on the same job rather + # than a separate up{} series. + - name: mnemosyne_alerts + rules: + - alert: MnemosyneDown + expr: up{job="mnemosyne"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Mnemosyne is down" + description: "The Mnemosyne /metrics endpoint has been unreachable for more than 2 minutes. Both the Django app and the MCP server (same container family) are presumed unavailable." + + - alert: MnemosyneHighErrorRate + expr: | + sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m])) + / sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "Mnemosyne HTTP 5xx error rate above 5%" + description: "Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes." + + - alert: MnemosyneSlowResponses + expr: | + histogram_quantile(0.95, + sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m])) + ) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Mnemosyne p95 response time above 5s" + description: "Mnemosyne p95 response latency is {{ $value | printf \"%.2f\" }}s over the last 5 minutes." + + # MCP tool-call error surface — owned by mcp_server.metrics on the + # same /metrics endpoint. 
This complements MnemosyneDown by catching + # "app is up but the MCP layer is sick" — e.g. auth token lookups are + # failing, or Neo4j vector search is 500-ing. + - alert: MnemosyneMCPToolErrors + expr: | + sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m])) + / sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10 + for: 5m + labels: + severity: warning + annotations: + summary: "Mnemosyne MCP tool error rate above 10%" + description: "MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service=\"mnemosyne\", component=\"mcp\"})." + + # Celery queue depth — high pending count usually means the embedding + # worker is stuck or throttled by the embedding provider. Requires + # ``celery-prometheus-exporter`` or similar to emit ``celery_queue_length``; + # if that is not deployed yet, this rule simply never fires. + - alert: MnemosyneCeleryBacklog + expr: | + sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100 + for: 10m + labels: + severity: warning + annotations: + summary: "Mnemosyne Celery backlog on {{ $labels.queue }}" + description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})." + # Red Panda Seal of Approval 🐼 # "If the metrics aren't red, go back to bed" {% endraw %} diff --git a/ansible/pplg/dashboards_provider.yml.j2 b/ansible/pplg/dashboards_provider.yml.j2 new file mode 100644 index 0000000..d1e67b5 --- /dev/null +++ b/ansible/pplg/dashboards_provider.yml.j2 @@ -0,0 +1,23 @@ +# Grafana dashboard file provider +# Deployed to: /etc/grafana/provisioning/dashboards/puck.yaml +# +# Grafana polls the ``path`` every ``updateIntervalSeconds`` and re-imports +# any JSON file it finds. 
Each dashboard JSON lives in that directory and +# is owned by Ansible — operators should not edit dashboards through the +# Grafana UI (changes won't survive a deploy; export the final JSON and +# land it in this role). +apiVersion: 1 + +providers: + - name: 'puck' + orgId: 1 + folder: 'Puck Services' + folderUid: puck-services + type: file + disableDeletion: false + editable: true + allowUiUpdates: false + updateIntervalSeconds: 30 + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/ansible/pplg/deploy.yml b/ansible/pplg/deploy.yml index a99051d..bc718b4 100644 --- a/ansible/pplg/deploy.yml +++ b/ansible/pplg/deploy.yml @@ -208,6 +208,35 @@ group: grafana mode: '750' + - name: Ensure Grafana dashboard provisioning directory exists + ansible.builtin.file: + path: /etc/grafana/provisioning/dashboards + state: directory + owner: grafana + group: grafana + mode: '750' + + - name: Template Grafana dashboard provider (file source → /var/lib/grafana/dashboards) + ansible.builtin.template: + src: "dashboards_provider.yml.j2" + dest: "/etc/grafana/provisioning/dashboards/puck.yaml" + owner: grafana + group: grafana + mode: '640' + notify: restart grafana + + # Copied verbatim, not templated: the dashboard JSON contains Grafana legend + # placeholders ({{level}}, {{component}}) that Jinja2 would try to resolve as + # undefined Ansible variables, failing the play. + - name: Install Puck Services dashboard (Mnemosyne + Pallas + Daedalus) + ansible.builtin.copy: + src: "puck_services_dashboard.json.j2" + dest: "/var/lib/grafana/dashboards/puck_services.json" + owner: grafana + group: grafana + mode: '640' + notify: restart grafana + - name: Template Grafana main configuration ansible.builtin.template: src: "grafana.ini.j2" diff --git a/ansible/pplg/prometheus.yml.j2 b/ansible/pplg/prometheus.yml.j2 index 9c6a50f..35bd7e1 100644 --- a/ansible/pplg/prometheus.yml.j2 +++ b/ansible/pplg/prometheus.yml.j2 @@ -47,7 +47,18 @@ scrape_configs: - job_name: 'daedalus' static_configs: - - targets: ['puck.incus:22181'] + - targets: ['{{ daedalus_metrics_host }}:{{ daedalus_metrics_port }}'] + metrics_path: '/metrics' + scrape_interval: 15s + + #
Mnemosyne — single /metrics endpoint on the app container serves both + # django-prometheus HTTP/Celery metrics and the MCP server's tool-call + # counters (the mcp_server.metrics module registers into the same + # prometheus_client process registry on the Django side). The mcp + # container itself does not expose /metrics; run 'em on the WSGI side. + - job_name: 'mnemosyne' + static_configs: + - targets: ['{{ mnemosyne_metrics_host }}:{{ mnemosyne_metrics_port }}'] metrics_path: '/metrics' scrape_interval: 15s diff --git a/ansible/pplg/puck_services_dashboard.json.j2 b/ansible/pplg/puck_services_dashboard.json.j2 new file mode 100644 index 0000000..f17d3f5 --- /dev/null +++ b/ansible/pplg/puck_services_dashboard.json.j2 @@ -0,0 +1,242 @@ +{ + "title": "Puck Services — Logs & Health", + "uid": "puck-services-logs", + "tags": ["puck", "logs", "mnemosyne", "pallas", "daedalus"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "editable": true, + "fiscalYearStartMonth": 0, + "weekStart": "", + "refresh": "30s", + "time": {"from": "now-1h", "to": "now"}, + "templating": { + "list": [ + { + "name": "loki", + "type": "datasource", + "query": "loki", + "current": {"selected": false, "text": "Loki", "value": "Loki"}, + "hide": 0, + "label": "Loki datasource" + }, + { + "name": "prom", + "type": "datasource", + "query": "prometheus", + "current": {"selected": false, "text": "Prometheus", "value": "Prometheus"}, + "hide": 0, + "label": "Prometheus datasource" + } + ] + }, + "panels": [ + { + "id": 1, + "type": "row", + "title": "Mnemosyne", + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0} + }, + { + "id": 2, + "type": "timeseries", + "title": "Mnemosyne — log rate by level", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 1}, + "targets": [ + { + "refId": "A", + "expr": "sum by (level) (rate({service=\"mnemosyne\"} | json [5m]))", + "legendFormat": "{{level}}" + } + ], + "options": 
{"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + { + "id": 3, + "type": "logs", + "title": "Mnemosyne — errors (last 25)", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 1}, + "targets": [ + { + "refId": "A", + "expr": "{service=\"mnemosyne\"} | json | level=\"ERROR\"", + "maxLines": 25 + } + ], + "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true} + }, + { + "id": 4, + "type": "stat", + "title": "Mnemosyne — HTTP 5xx rate", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 8, "x": 0, "y": 9}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\",status=~\"5..\"}[5m])) / clamp_min(sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\"}[5m])), 0.0001)" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "value", + "textMode": "auto" + }, + "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}} + }, + { + "id": 5, + "type": "stat", + "title": "Mnemosyne — p95 latency", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 8, "x": 8, "y": 9}, + "targets": [ + { + "refId": "A", + "expr": "histogram_quantile(0.95, sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job=\"mnemosyne\"}[5m])))" + } + ], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 1}, {"color": "red", "value": 5}]}}} + }, + { + "id": 6, + "type": "stat", + "title": "Mnemosyne — MCP tool error rate", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 8, "x": 16, "y": 9}, + 
"targets": [ + { + "refId": "A", + "expr": "sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\",status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\"}[5m])), 0.0001)" + } + ], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}}, + "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}} + }, + + { + "id": 10, + "type": "row", + "title": "Pallas (Kottos agents)", + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 13} + }, + { + "id": 11, + "type": "timeseries", + "title": "Pallas — log rate by agent (component)", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 14}, + "targets": [ + { + "refId": "A", + "expr": "sum by (component) (rate({service=\"pallas\", project=\"kottos\"} | json [5m]))", + "legendFormat": "{{component}}" + } + ], + "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + { + "id": 12, + "type": "logs", + "title": "Pallas — forward trace errors (opaque MCP transport failures)", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 14}, + "targets": [ + { + "refId": "A", + "expr": "{service=\"pallas\", project=\"kottos\"} |= \"pallas.forward.trace\" | json | level=~\"ERROR|WARNING\"", + "maxLines": 25 + } + ], + "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true} + }, + { + "id": 13, + "type": "logs", + "title": "Pallas — last 25 ERROR lines (any agent)", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 22}, + "targets": [ + { + "refId": "A", + "expr": "{service=\"pallas\", project=\"kottos\"} | json | level=\"ERROR\"", + "maxLines": 25 + } + ], + "options": {"showLabels": true, "showTime": true, "wrapLogMessage": true} + }, + + { + "id": 20, + "type": "row", + 
"title": "Daedalus", + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 30} + }, + { + "id": 21, + "type": "timeseries", + "title": "Daedalus — log rate by level", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 31}, + "targets": [ + { + "refId": "A", + "expr": "sum by (level) (rate({service=\"daedalus\"} | json [5m]))", + "legendFormat": "{{level}}" + } + ], + "options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}} + }, + { + "id": 22, + "type": "stat", + "title": "Daedalus — HTTP 5xx rate", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 31}, + "targets": [ + { + "refId": "A", + "expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)" + } + ], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}}, + "fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}} + }, + { + "id": 23, + "type": "stat", + "title": "Daedalus — MCP p95 latency", + "datasource": {"type": "prometheus", "uid": "${prom}"}, + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 31}, + "targets": [ + { + "refId": "A", + "expr": "histogram_quantile(0.95, rate(daedalus_mcp_request_duration_seconds_bucket[5m]))" + } + ], + "options": {"reduceOptions": {"calcs": ["lastNotNull"]}}, + "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 5}, {"color": "red", "value": 30}]}}} + }, + { + "id": 24, + "type": "logs", + "title": "Daedalus — errors (last 25)", + "datasource": {"type": "loki", "uid": "${loki}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 39}, + "targets": [ + { + "refId": "A", + "expr": "{service=\"daedalus\"} | json | level=\"ERROR\"", + "maxLines": 25 + } + ], 
+ "options": {"showLabels": false, "showTime": true, "wrapLogMessage": true} + } + ] +} diff --git a/ansible/site.yml b/ansible/site.yml index 98313bb..d7bda5f 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -44,3 +44,9 @@ - name: Deploy Agent S import_playbook: agent_s/deploy.yml + +- name: Stage Kottos (Pallas FastAgent runtime) + import_playbook: kottos/stage.yml + +- name: Deploy Kottos + import_playbook: kottos/deploy.yml diff --git a/docs/kottos.md b/docs/kottos.md index c1146e8..957a761 100644 --- a/docs/kottos.md +++ b/docs/kottos.md @@ -163,6 +163,96 @@ The registry includes model capabilities on each agent entry: } ``` +## Deployment + +Kottos runs two ways: + +1. **Locally on caliban**, hand-started for iteration (`kottos` from the repo root). This is the flow documented above in *Quickstart*. +2. **In Ouranos / Virgo / Taurus via Ansible**, as a `systemd`-managed `pallas` process on the puck.incus container. This is the pipeline that feeds the Puck Services dashboard in Grafana. + +### Ansible role + +Lives in `ouranos/ansible/kottos/`: + +| File | Purpose | +|---|---| +| `deploy.yml` | Main playbook — user/group, venv, systemd unit, config templating, registry probe. | +| `stage.yml` | Clones `git.helu.ca/r/kottos` at `{{ kottos_rel }}` and creates the release tarball. | +| `kottos.service.j2` | systemd unit. `SyslogIdentifier=kottos`, `StandardOutput=journal`, `PALLAS_LOG_STDOUT=1` via the env file. | +| `.env.j2` | Runtime environment for `pallas` — logging config, `PALLAS_AGENTS_CONFIG`. | +| `agents.yaml.j2` | Deployment topology with host/ports pulled from inventory. | +| `fastagent.config.yaml.j2` | LLM provider + MCP server URLs, parametric per environment. | +| `fastagent.secrets.yaml.j2` | API keys and auth tokens, rendered from Ansible Vault. 
| + +### Inventory + +Host variables live in `inventory/host_vars/puck.incus.yml` under **Kottos Configuration**: + +```yaml +kottos_user: kottos +kottos_group: kottos +kottos_directory: /srv/kottos +kottos_host: "puck.incus" +kottos_registry_port: 24100 +kottos_harper_port: 24101 +kottos_scotty_port: 24102 +kottos_research_port: 24150 +kottos_tech_research_port: 24151 +pallas_log_level: INFO +kottos_default_model: "openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf" +kottos_openai_base_url: "http://nyx.helu.ca:22079/v1" +# ...plus one entry per downstream MCP URL so each environment overrides freely +``` + +Every host variable is parametric — Virgo's `puck.virgo.yml` (or wherever the Pallas host lives) can override any value without touching the templates. + +### Vault + +Four vault keys required — all documented in `inventory/group_vars/all/vault.yml.example`: + +| Key | Used for | +|---|---| +| `vault_kottos_openai_api_key` | OpenAI-compatible LLM endpoint (nyx Qwen in Ouranos). | +| `vault_kottos_github_pat` | `GITHUB_PERSONAL_ACCESS_TOKEN` for the local GitHub MCP Docker container. | +| `vault_kottos_angelia_bearer` | Bearer token accepted by the Angelia MCP server. | +| `vault_kottos_mnemosyne_jwt` | Long-lived team JWT from Daedalus admin UI — Mnemosyne validates it on every `search_memory` call and scopes results to this team's workspaces. | + +### Deploying + +Wired into `site.yml`: + +```bash +cd ansible +ansible-playbook kottos/stage.yml # clone repo + build tarball (local) +ansible-playbook kottos/deploy.yml # deploy + template + start +``` + +Or run the full site (`ansible-playbook site.yml`) — kottos's stage + deploy steps are the last block in the sequence. + +### Logs + +Journal identifier `kottos`, so on the host: + +```bash +sudo journalctl -u kottos -f --output=cat | jq . +``` + +Alloy on puck's journal source relabels `__journal_syslog_identifier=kottos` to `{service="pallas", project="kottos"}`, then into Loki. 
Everything shows up in Grafana's *Puck Services — Logs & Health* dashboard under the **Pallas** row, with per-agent colouring driven by the `component` JSON field (`harper`, `scotty`, `research`, `tech_research`). + +For per-agent follow-along (`component` is a JSON field on journal-sourced streams, so filter on it after `| json` rather than in the stream selector): + +```logql +{service="pallas", project="kottos"} | json | component="harper" +``` + +For the opaque-MCP-transport-failure trace stream (see Pallas's bearer-forwarding incident history): + +```logql +{service="pallas", project="kottos"} |= "pallas.forward.trace" | json +``` + +See [logging.md](logging.md) for the full label schema + level policy + add-a-new-service guide. + ## Downstream MCP Servers | Server | Host | URL | diff --git a/docs/logging.md b/docs/logging.md new file mode 100644 index 0000000..f86f966 --- /dev/null +++ b/docs/logging.md @@ -0,0 +1,173 @@ +# Unified Logging — Mnemosyne, Pallas, Daedalus + +PPLG is the single destination for every service's logs. This document describes the label schema every service emits, the two transports Alloy uses to collect logs, and the level policy that keeps INFO output actionable. + +The three in-scope services today are **Mnemosyne**, **Pallas** (running as Kottos/Mentor/Iolaus), and **Daedalus**. The same patterns generalise to any future service that deploys on a `docker`-enabled host or under `systemd+journald`.
+ +## Label schema + +Every Loki log stream carries these labels, and nothing else: + +| Label | Example values | Source | +|---|---|---| +| `service` | `mnemosyne`, `pallas`, `daedalus`, `athena`, `kairos`, `angelia` | Docker compose project name (container logs) **or** explicit systemd relabel rule (journal logs) | +| `component` | `app`, `mcp`, `worker`, `nginx`, `harper`, `scotty`, `research`, `tech_research` | Docker compose service name **or** per-agent `ContextVar` (Pallas) | +| `project` | `kottos` (Pallas only) | `agents.yaml` `name:` field read by `pallas.log.set_project()` | +| `hostname` | `puck.incus`, `caliban.incus` | Alloy's `inventory_hostname` template var | +| `environment` | `ouranos`, `virgo`, `taurus` | `deployment_environment` from Ansible group_vars | + +**Everything else is a JSON field in the log body**, not a label. That includes `level`, `logger`, `funcName`, `lineno`, `message`, `request_id`, `workspace_id`, `agent`, `tool`, `duration_ms`, and any `extra={...}` kwargs the application passed in. LogQL's `| json` pipeline parses these on-query — keeping them out of the label index is what keeps Loki fast. + +## Level policy + +Same rules for every service. Health-check `200 OK`s live in DEBUG, never in INFO. + +| Level | Meaning | +|---|---| +| `ERROR` | Broken; requires human attention. | +| `WARNING` | Degraded but self-recovering — retries, skipped items, missing optional config. | +| `INFO` | Lifecycle events and failures. Start, ready, shutdown, preflight, LLM provider validation. 200 OKs on health endpoints are **not** INFO. | +| `DEBUG` | Per-request detail, successful health probes, verbose traces. Enable on demand when troubleshooting. | + +Mnemosyne enforces this with `mnemosyne.log_filters.SuppressHealthAccessFilter` on Django/gunicorn access loggers; Pallas with `_HealthAccessFilter` on `uvicorn.access`; Daedalus with the equivalent filter in `daedalus.logging`. 
+ +## Two transports, one Alloy + +Alloy on each host uses exactly two sources for application logs. Pick whichever matches the service's runtime model — **don't** invent a third. + +### 1. Docker socket (for compose projects) + +`discovery.docker` enumerates every running container, and `loki.source.docker` tails their stdout via the `json-file` driver. Compose project → `service` label, compose service → `component` label. One block covers every compose project on the host, current and future. + +**Requirements on the service side:** + +- Emit JSON lines to **stdout**, one per log record. Mnemosyne uses `python-json-logger`; Daedalus uses `structlog`; any Python service can do the same. +- Pin the logging driver to `json-file` with bounded rotation in `docker-compose.yaml`: + + ```yaml + x-logging: &default-logging + driver: json-file + options: + tag: "{{.Name}}" + max-size: "10m" + max-file: "5" + + services: + app: + # ... + logging: *default-logging + ``` + + `json-file` is Docker's default, but pinning it defensively guarantees Alloy sees the same driver on every host. + +- On the Alloy host, the `alloy` user must be in the `docker` group to read `/var/run/docker.sock`. The `ouranos/ansible/alloy/` role handles this. + +### 2. Systemd journal (for systemd-managed units) + +`loki.source.journal` tails journald. A per-host `loki.relabel "journal_<host>"` block (e.g. `journal_puck`) translates `__journal_syslog_identifier` → `service` / `project` labels so Pallas-managed agents land alongside Docker-based services with the same schema. + +**Requirements on the service side:** + +- Emit JSON to **stdout** (journald captures it with `PRIORITY=6` INFO by default). +- The systemd unit must set a distinctive `SyslogIdentifier=<name>` — the Alloy relabel block keys off this. +- Under Pallas, set `PALLAS_LOG_STDOUT=1` in the unit's `EnvironmentFile`. Also set `PALLAS_LOG_FILE=/dev/null` to disable the rotating file sink (journald is already durable).
+ +Example, from `ouranos/ansible/kottos/kottos.service.j2`: + +```ini +[Service] +... +EnvironmentFile=/srv/kottos/.env +ExecStart=/srv/kottos/.venv/bin/pallas +StandardOutput=journal +StandardError=journal +SyslogIdentifier=kottos +``` + +And the matching Alloy relabel rule on puck: + +```alloy +loki.relabel "journal_puck" { + forward_to = [] + rule { + source_labels = ["__journal_syslog_identifier"] + regex = "kottos" + target_label = "service" + replacement = "pallas" + } + rule { + source_labels = ["__journal_syslog_identifier"] + regex = "kottos" + target_label = "project" + replacement = "kottos" + } + // ... +} +``` + +## Per-service reference + +### Mnemosyne (Docker compose on puck) + +- Logging config: `mnemosyne/mnemosyne/mnemosyne/settings.py` → `LOGGING` dict using `pythonjsonlogger.json.JsonFormatter`. +- Component attribution: `MNEMOSYNE_COMPONENT` env var set per docker-compose service (`init`, `app`, `mcp`, `worker`). The settings module reads it into `static_fields.component`. +- Health-filter: `mnemosyne.log_filters.SuppressHealthAccessFilter` on the `access` handler. +- Metrics: `/metrics` on the nginx container (port 23181) — served by django-prometheus on the app container plus `mcp_server.metrics` (shared `prometheus_client` registry). +- Scrape job: `mnemosyne` (see `ouranos/ansible/pplg/prometheus.yml.j2`). +- Alerts: `mnemosyne_alerts` group in `ouranos/ansible/pplg/alert_rules.yml.j2`. + +### Pallas — Kottos (systemd on puck via Ansible role `ouranos/ansible/kottos/`) + +- Logging config: `pallas/pallas/log.py` → `setup_logging()` with `PALLAS_LOG_STDOUT=1`. +- Component attribution: `pallas.log.set_agent_component(name)` is called by `_start_agent()` inside each agent's asyncio task, setting a `contextvars.ContextVar` that the `_StaticFieldsFilter` reads per record. Each agent (harper, scotty, research, tech_research) carries its own value without leaking across tasks. 
+- Project attribution: `pallas.log.set_project(deploy_name)` is called once in `main()` from `agents.yaml`'s `name:`. For Kottos this renders as `project="kottos"` on every record. +- Deployed by: `ansible-playbook kottos/deploy.yml` (wired into `site.yml`). +- Metrics: none today — Pallas is observed through logs only. A future phase will add a `prometheus_client` endpoint on the registry port for `pallas_agent_requests_total{agent=…}`, `pallas_downstream_mcp_errors_total{server=…}`. + +### Daedalus (Docker compose on puck) + +- Logging config: `daedalus/backend/daedalus/logging.py` — `structlog` JSON processor chain, already production-ready. +- Component attribution: `structlog.contextvars.bind_contextvars(service="daedalus", component="api")` at app startup. +- Health-filter: `_SuppressHealthAccessFilter` on uvicorn's access logger. +- Metrics: `/metrics` on the api container (port 22181). +- Scrape job: `daedalus`. +- Alerts: `daedalus_alerts` group. + +## Useful LogQL queries + +Once the pipeline is live, the "troubleshooting is a nightmare" problem becomes three-click queries in Grafana Explore: + +```logql +# All Mnemosyne errors in the last 15m +{service="mnemosyne"} | json | level="ERROR" + +# Everything Harper did in the last hour (component is a JSON field on +# journal-sourced streams, so filter after the json stage) +{service="pallas", project="kottos"} | json | component="harper" + +# The infamous pallas.forward.trace stream (MCP transport failures) +{service="pallas", project="kottos"} |= "pallas.forward.trace" + +# Cross-service trace of a single request (requires X-Request-Id propagation +# — not yet implemented; Phase 1.5 nice-to-have) +{environment="ouranos"} | json | request_id="<request-id>" + +# ERROR-level log rate in Daedalus by path (a proxy for a 5xx spike) +sum by (path) (rate({service="daedalus"} | json | level="ERROR" [5m])) +``` + +The **Puck Services — Logs & Health** dashboard in Grafana (`/etc/grafana/provisioning/dashboards/puck.yaml` → `/var/lib/grafana/dashboards/puck_services.json`) has these pre-wired as panels per service row.
+ +## Adding a new service + +If you're adding a service to puck (or any Ouranos/Virgo host with this stack): + +1. **Emit JSON to stdout** with `service`/`component` as static fields. Copy Mnemosyne's settings pattern or Pallas's `_StaticFieldsFilter`. +2. **Pick a transport:** + - Docker compose → add the `x-logging: &default-logging` anchor + `logging: *default-logging` on each service. Done. No Alloy changes needed. + - systemd → set `SyslogIdentifier=<name>` on the unit and add a two-rule relabel block to the host's `loki.relabel "journal_<host>"` block. +3. **Expose `/metrics`** if the service is in Python — `prometheus_client` plus either `django-prometheus` or `prometheus_fastapi_instrumentator`. +4. **Add a scrape job** in `ouranos/ansible/pplg/prometheus.yml.j2` (parametrise the target — `{{ <service>_metrics_host }}:{{ <service>_metrics_port }}`) and wire the defaults into the host's `host_vars`. +5. **Add alerts** in `ouranos/ansible/pplg/alert_rules.yml.j2`. At minimum: `<Service>Down`, `<Service>HighErrorRate`. Use the metric names the service actually exposes — no dead rules. +6. **Optional**: add panels to the Puck Services dashboard JSON. + +No new transport. No per-service Alloy block. No custom log format.