feat(alloy): add journal relabeling and kottos integration on puck

Introduce structured journal relabel rules on puck to tag Pallas-managed
units with {service, project, component} labels matching the Mnemosyne
and Daedalus schema. Add kottos release variable and vault secrets
example entries for the new Pallas FastAgent runtime.

Remove the defunct mnemosyne syslog listener now that Mnemosyne ships
JSON logs via the docker-socket pipeline.
This commit is contained in:
2026-05-11 13:54:14 -04:00
parent e92ab80bbf
commit 8c95173705
19 changed files with 1336 additions and 27 deletions

View File

@@ -18,10 +18,60 @@ loki.source.file "system_logs" {
forward_to = [loki.write.default.receiver] forward_to = [loki.write.default.receiver]
} }
// Journal relabel rules — tag Pallas-managed units (kottos now, mentor /
// iolaus later) with the same {service, project, component} schema used
// by Mnemosyne and Daedalus.
//
// Relabel rules are applied top-to-bottom and EVERY rule runs; there is no
// first-match short-circuit. The fallback rules at the bottom are therefore
// written so they cannot clobber a label set by a more specific rule above:
// the `service` fallback only fires when `service` is still empty. If a new
// Pallas host/project ever lands here, copy one of the specific blocks
// below, adjust SyslogIdentifier + project, and keep the fallbacks last.
loki.relabel "journal_puck" {
  // Rules-only component: nothing is forwarded from here. The rule set is
  // consumed through the exported `rules` value by loki.source.journal.
  forward_to = []

  // Expose the systemd unit as an auxiliary label for debugging.
  rule {
    source_labels = ["__journal__systemd_unit"]
    target_label  = "unit"
  }

  // Kottos — Pallas FastAgent runtime for the engineering agent project.
  // SyslogIdentifier=kottos is set in ouranos/ansible/kottos/kottos.service.j2.
  rule {
    source_labels = ["__journal_syslog_identifier"]
    regex         = "kottos"
    target_label  = "service"
    replacement   = "pallas"
  }

  rule {
    source_labels = ["__journal_syslog_identifier"]
    regex         = "kottos"
    target_label  = "project"
    replacement   = "kottos"
  }

  // Alloy itself — useful to separate from the "systemd" bucket when the
  // shipping pipeline misbehaves.
  rule {
    source_labels = ["__journal__systemd_unit"]
    regex         = "alloy\\.service"
    target_label  = "service"
    replacement   = "alloy"
  }

  // Default fallback — anything no rule above claimed becomes
  // service="systemd". The regex matches only an empty (unset) `service`,
  // so entries already tagged "pallas" or "alloy" are left untouched.
  rule {
    source_labels = ["service"]
    regex         = "^$"
    target_label  = "service"
    replacement   = "systemd"
  }

  // Keep job="systemd" on every journal entry for backwards compatibility
  // with existing dashboards that filter on ``job="systemd"``. No
  // source_labels/regex: the default regex matches everything, so entries
  // that carry no systemd unit field keep the label as well.
  rule {
    target_label = "job"
    replacement  = "systemd"
  }
}
loki.source.journal "systemd_logs" { loki.source.journal "systemd_logs" {
forward_to = [loki.write.default.receiver] forward_to = [loki.write.default.receiver]
relabel_rules = loki.relabel.journal_puck.rules
labels = { labels = {
job = "systemd",
hostname = "{{inventory_hostname}}", hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}", environment = "{{deployment_environment}}",
} }
@@ -69,19 +119,11 @@ loki.source.syslog "kairos_logs" {
forward_to = [loki.write.default.receiver] forward_to = [loki.write.default.receiver]
} }
loki.source.syslog "menosyne_logs" { // Mnemosyne used to ship via syslog on {{mnemosyne_syslog_port}}; it now
listener { // logs line-delimited JSON to container stdout and is picked up by the
address = "127.0.0.1:{{mnemosyne_syslog_port}}" // docker-socket block below. The host_var is retained as a reserved port
protocol = "tcp" // number but no listener binds to it — remove the var from the inventory
syslog_format = "{{ syslog_format }}" // when the rollout is verified.
labels = {
job = "menosyne",
hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}",
}
}
forward_to = [loki.write.default.receiver]
}
loki.source.syslog "spelunker_logs" { loki.source.syslog "spelunker_logs" {
listener { listener {
@@ -111,19 +153,66 @@ loki.source.syslog "jupyterlab_logs" {
forward_to = [loki.write.default.receiver] forward_to = [loki.write.default.receiver]
} }
loki.source.syslog "daedalus_logs" { // Daedalus also used to ship via syslog on {{daedalus_syslog_port}}; it
listener { // already emits structlog JSON to stdout, so the docker-socket block
address = "127.0.0.1:{{daedalus_syslog_port}}" // below now handles it. Host_var kept for the same transitional reason
protocol = "tcp" // as mnemosyne above.
syslog_format = "{{ syslog_format }}"
// ----------------------------------------------------------------------------
// Docker socket — any compose project on this host lands in Loki with
// `service` = compose project (e.g. "mnemosyne", "daedalus", "kairos") and
// `component` = compose service (e.g. "app", "mcp", "worker", "nginx").
// This replaces per-service syslog listeners — one block covers every
// compose project, current and future.
//
// Requires: the Alloy process to have read access to /var/run/docker.sock
// (Ansible role should add the alloy user to the `docker` group). No Docker
// daemon changes required — we scrape the json-file driver, which is Docker's
// default and is pinned in each compose project's x-logging anchor.
// ----------------------------------------------------------------------------
discovery.docker "containers" {
host = "unix:///var/run/docker.sock"
refresh_interval = "30s"
}
discovery.relabel "containers" {
targets = discovery.docker.containers.targets
// Compose project → service label
rule {
source_labels = ["__meta_docker_container_label_com_docker_compose_project"]
target_label = "service"
}
// Compose service → component label
rule {
source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
target_label = "component"
}
// Container name (for one-off / non-compose containers)
rule {
source_labels = ["__meta_docker_container_name"]
regex = "/(.*)"
target_label = "container"
}
// Fall back to the container name as `service` when compose labels are
// absent (e.g. a `docker run ...` container outside any compose project)
rule {
source_labels = ["service", "container"]
separator = "@"
regex = "@(.+)"
target_label = "service"
}
}
loki.source.docker "containers" {
host = "unix:///var/run/docker.sock"
targets = discovery.relabel.containers.output
forward_to = [loki.write.default.receiver]
labels = { labels = {
job = "daedalus",
hostname = "{{inventory_hostname}}", hostname = "{{inventory_hostname}}",
environment = "{{deployment_environment}}", environment = "{{deployment_environment}}",
} }
} }
forward_to = [loki.write.default.receiver]
}
loki.write "default" { loki.write "default" {
endpoint { endpoint {

View File

@@ -34,6 +34,7 @@ spelunker_rel: main
mcp_switchboard_rel: main mcp_switchboard_rel: main
kernos_rel: main kernos_rel: main
rommie_rel: main rommie_rel: main
kottos_rel: main
# PyPI release version (no 'v' prefix) - https://pypi.org/project/open-webui/ # PyPI release version (no 'v' prefix) - https://pypi.org/project/open-webui/
freecad_mcp_version: 0.6.1 freecad_mcp_version: 0.6.1
openwebui_rel: 0.8.3 openwebui_rel: 0.8.3

View File

@@ -99,3 +99,25 @@ vault_ntth_token_1_app_secret: changeme
vault_ntth_token_2_app_secret: changeme vault_ntth_token_2_app_secret: changeme
vault_ntth_token_3_app_secret: changeme vault_ntth_token_3_app_secret: changeme
vault_ntth_token_4_app_secret: changeme vault_ntth_token_4_app_secret: changeme
# Kottos (Pallas FastAgent runtime on puck)
# vault_kottos_openai_api_key — API key for the OpenAI-compatible LLM
# endpoint (nyx Qwen in Ouranos, varies
# per environment). Set to any string
# if the endpoint doesn't validate.
# vault_kottos_github_pat — GitHub personal access token passed
# into the github MCP Docker container
# via GITHUB_PERSONAL_ACCESS_TOKEN env.
# vault_kottos_angelia_bearer — Bearer token for the Angelia MCP
# server (accepts the outgoing auth).
# vault_kottos_mnemosyne_jwt — Long-lived team JWT minted in the
# Daedalus admin UI → Settings →
# Pallas Instances → kottos row →
# "Reveal" or "Rotate". Mnemosyne
# validates this on every search_memory
# call and scopes results to the
# workspaces attached to this team.
vault_kottos_openai_api_key: changeme
vault_kottos_github_pat: changeme
vault_kottos_angelia_bearer: changeme
vault_kottos_mnemosyne_jwt: changeme

View File

@@ -7,6 +7,7 @@ services:
- docker - docker
- gitea_runner - gitea_runner
- athena - athena
- kottos
# Gitea Runner # Gitea Runner
gitea_runner_name: "puck-runner" gitea_runner_name: "puck-runner"
@@ -14,14 +15,86 @@ gitea_runner_name: "puck-runner"
# Alloy # Alloy
alloy_log_level: "warn" alloy_log_level: "warn"
angelia_syslog_port: 51422 angelia_syslog_port: 51422
# mnemosyne_syslog_port retained for inventory-compatibility while the
# Alloy Docker-socket discovery block rolls out; no listener binds to it
# any more. Delete once the docker-socket pipeline is proven in prod.
mnemosyne_syslog_port: 51431 mnemosyne_syslog_port: 51431
athena_syslog_port: 51424 athena_syslog_port: 51424
kairos_syslog_port: 51425 kairos_syslog_port: 51425
icarlos_syslog_port: 51426 icarlos_syslog_port: 51426
spelunker_syslog_port: 51428 spelunker_syslog_port: 51428
jupyterlab_syslog_port: 51411 jupyterlab_syslog_port: 51411
# daedalus_syslog_port retained for the same reason as mnemosyne above.
daedalus_syslog_port: 51430 daedalus_syslog_port: 51430
# =============================================================================
# PPLG scrape targets on puck
# =============================================================================
# Consumed by ``ansible/pplg/prometheus.yml.j2`` on Prospero. Defining them
# here keeps the scrape config fully parametric so the same playbook runs
# unchanged against Ouranos / Virgo / Taurus — each environment sets its
# own puck-equivalent host in its host_vars.
# Daedalus (FastAPI on puck, behind nginx)
daedalus_metrics_host: "puck.incus"
daedalus_metrics_port: 22181
# Mnemosyne — /metrics is served by nginx (mnemosyne-web:23181) and
# proxied to the Django app container, which owns the single
# prometheus_client process registry that both django-prometheus
# (HTTP / Celery) and the MCP server's tool-call counters write to.
mnemosyne_metrics_host: "puck.incus"
mnemosyne_metrics_port: 23181
# =============================================================================
# Kottos Configuration (Pallas FastAgent runtime)
# =============================================================================
# Engineering agents (Harper, Scotty, Research, Tech Research) running as a
# single systemd-managed ``pallas`` process. Logs land in journald via
# SyslogIdentifier=kottos, then Alloy's journal relabel block tags them as
# {service="pallas", project="kottos"} for Loki.
kottos_user: kottos
kottos_group: kottos
kottos_directory: /srv/kottos
kottos_host: "puck.incus"
kottos_namespace: "ca.helu.kottos"
# Ports — registry at 24100, agents 24101–24149, sub-agents 24150–24199
kottos_registry_port: 24100
kottos_harper_port: 24101
kottos_scotty_port: 24102
kottos_research_port: 24150
kottos_tech_research_port: 24151
# Log level — INFO surfaces lifecycle + failures, DEBUG adds per-request
# detail and successful health probe lines. Ouranos Lab convention:
# health-check 200 OKs live in DEBUG, never in INFO.
pallas_log_level: INFO
# fast-agent's own logger — keep at INFO in prod, bump to DEBUG alongside
# pallas_log_level when chasing MCP transport issues.
kottos_fastagent_log_level: info
# LLM provider — the same OpenAI-compatible Qwen endpoint Kottos uses today.
kottos_default_model: "openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
kottos_openai_base_url: "http://nyx.helu.ca:22079/v1"
kottos_model_vision: true
kottos_model_context_window: 192000
kottos_model_max_output_tokens: 16384
kottos_timezone: "America/Toronto"
# Downstream MCP server URLs — each parametric so Virgo / Taurus override
# them in their own host_vars without touching the templates.
kottos_argos_url: "http://miranda.incus:25534/mcp"
kottos_neo4j_cypher_url: "http://circe.helu.ca:22034/mcp"
kottos_kernos_scotty_url: "http://caliban.incus:22062/mcp"
kottos_rommie_url: "http://caliban.incus:20361/mcp"
kottos_gitea_url: "http://miranda.incus:25535/mcp"
kottos_grafana_url: "http://miranda.incus:25533/mcp"
kottos_kernos_harper_url: "http://korax.helu.ca:20261/mcp"
kottos_angelia_url: "https://ouranos.helu.ca/mcp/"
kottos_mnemosyne_url: "https://mnemosyne.ouranos.helu.ca/mcp/"
# ============================================================================= # =============================================================================
# Athena Configuration # Athena Configuration
# ============================================================================= # =============================================================================

24
ansible/kottos/.env.j2 Normal file
View File

@@ -0,0 +1,24 @@
# Kottos runtime environment — rendered by Ansible from inventory host_vars.
# ------------------------------------------------------------------------
# Loaded by systemd (EnvironmentFile=) and inherited by the pallas process.
# ``.env`` vars NOT set here come from pallas.server's defaults — tweak by
# adding the variable to host_vars and this template, not by editing the
# rendered file on the host.
# ── Logging ─────────────────────────────────────────────────────────────────
# Stdout JSON is the preferred sink for systemd+journald+Alloy deployments.
# Rotating file sink is disabled by pointing PALLAS_LOG_FILE at /dev/null so
# we don't write every record twice.
PALLAS_LOG_STDOUT=1
PALLAS_LOG_FILE=/dev/null
PALLAS_LOG_LEVEL={{ pallas_log_level | default('INFO') }}
# ── Config location ─────────────────────────────────────────────────────────
# PALLAS_AGENTS_CONFIG can be overridden to point at a non-default topology
# (e.g. staging scenarios). Default: agents.yaml next to the working dir.
PALLAS_AGENTS_CONFIG={{ kottos_directory }}/agents.yaml
# ── LLM provider / MCP server secrets ───────────────────────────────────────
# Secrets are rendered into fastagent.secrets.yaml rather than env vars so
# fast-agent's existing YAML-merge logic applies. This block stays empty
# intentionally — the template exists for future per-host tunables.

View File

@@ -0,0 +1,43 @@
# Kottos — Deployment Configuration (rendered by Ansible)
# ------------------------------------------------------------------
# Single source of truth for agent topology, ports, and registry
# metadata. Read by Pallas at startup. The kottos/agents.yaml
# committed in the kottos repo is the local-dev equivalent; Ansible
# overwrites it with this rendered version.
#
# Host + namespace + registry port come from inventory host_vars so
# Ouranos / Virgo / Taurus each get their own shape without template
# edits.
name: kottos
version: "1.0.0"
host: {{ kottos_agents_host | default(kottos_host) | default(inventory_hostname) }}
namespace: {{ kottos_namespace | default('ca.helu.kottos') }}
registry_port: {{ kottos_registry_port | default(24100) }}
agents:
harper:
module: agents.harper
port: {{ kottos_harper_port | default(24101) }}
title: Harper
description: "Scrappy engineer — rapid prototyping, hacking, and creative problem-solving"
depends_on: [research, tech_research]
scotty:
module: agents.scotty
port: {{ kottos_scotty_port | default(24102) }}
title: Scotty
description: "Systems administration expert — infrastructure diagnostics, security hardening, and keeping everything running"
depends_on: [tech_research]
research:
module: agents.research
port: {{ kottos_research_port | default(24150) }}
title: Research Agent
description: "Web search via Argos and knowledge graph via Neo4j"
tech_research:
module: agents.tech_research
port: {{ kottos_tech_research_port | default(24151) }}
title: Tech Research
description: "Technical investigation — library comparisons, API docs, framework patterns, code examples"

192
ansible/kottos/deploy.yml Normal file
View File

@@ -0,0 +1,192 @@
---
- name: Deploy Kottos (Pallas FastAgent runtime)
hosts: ubuntu
vars:
ansible_common_remote_group: "{{ kottos_group | default([]) }}"
allow_world_readable_tmpfiles: true
tasks:
- name: Check if host has kottos service
ansible.builtin.set_fact:
has_kottos_service: "{{ 'kottos' in services | default([]) }}"
- name: Skip hosts without kottos service
ansible.builtin.meta: end_host
when: not has_kottos_service
- name: Create Kottos group
become: true
ansible.builtin.group:
name: "{{ kottos_group }}"
state: present
- name: Create kottos user
become: true
ansible.builtin.user:
name: "{{ kottos_user }}"
group: "{{ kottos_group }}"
home: "/home/{{ kottos_user }}"
shell: /bin/bash
system: false
create_home: true
- name: Add keeper_user to kottos group (optional — enables passwordless tailing)
become: true
ansible.builtin.user:
name: "{{ keeper_user }}"
groups: "{{ kottos_group }}"
append: true
when: keeper_user is defined
- name: Reset connection to pick up new group membership
ansible.builtin.meta: reset_connection
- name: Create Kottos install directory
become: true
ansible.builtin.file:
path: "{{ kottos_directory }}"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
state: directory
mode: '0750'
- name: Ensure base packages for Python + Docker MCP workflows
become: true
ansible.builtin.apt:
name:
- tar
- python3
- python3-venv
- python3-dev
- git
state: present
update_cache: true
- name: Transfer and unarchive Kottos release
become: true
ansible.builtin.unarchive:
src: "~/rel/kottos_{{ kottos_rel }}.tar"
dest: "{{ kottos_directory }}"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '0550'
notify: restart kottos
# Runs unconditionally: makes sure {{ kottos_directory }}/.venv exists and
# that it (and anything already inside it) belongs to the kottos user, so
# the venv/pip tasks that follow — which run as that user — can write to it.
# The previous `when:` clause ended in `or true`, i.e. it was always true,
# so it has been dropped rather than kept as a misleading guard.
- name: Ensure .venv directory ownership is correct
  become: true
  ansible.builtin.file:
    path: "{{ kottos_directory }}/.venv"
    owner: "{{ kottos_user }}"
    group: "{{ kottos_group }}"
    state: directory
    recurse: true
- name: Create virtual environment for Kottos
become: true
become_user: "{{ kottos_user }}"
ansible.builtin.command:
cmd: "python3 -m venv {{ kottos_directory }}/.venv/"
creates: "{{ kottos_directory }}/.venv/bin/activate"
- name: Install wheel in the virtualenv
become: true
become_user: "{{ kottos_user }}"
ansible.builtin.pip:
name:
- wheel
state: latest
virtualenv: "{{ kottos_directory }}/.venv"
- name: Install Kottos (pyproject.toml — pulls in pallas-mcp and fast-agent-mcp)
become: true
become_user: "{{ kottos_user }}"
ansible.builtin.pip:
chdir: "{{ kottos_directory }}/kottos"
name: .
virtualenv: "{{ kottos_directory }}/.venv"
virtualenv_command: python3 -m venv
notify: restart kottos
- name: Template agents.yaml
become: true
ansible.builtin.template:
src: agents.yaml.j2
dest: "{{ kottos_directory }}/agents.yaml"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '0640'
notify: restart kottos
- name: Template fastagent.config.yaml
become: true
ansible.builtin.template:
src: fastagent.config.yaml.j2
dest: "{{ kottos_directory }}/fastagent.config.yaml"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '0640'
notify: restart kottos
- name: Template fastagent.secrets.yaml (vault-rendered)
become: true
ansible.builtin.template:
src: fastagent.secrets.yaml.j2
dest: "{{ kottos_directory }}/fastagent.secrets.yaml"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '0600'
notify: restart kottos
no_log: true
- name: Template runtime .env (PALLAS_LOG_STDOUT etc.)
become: true
ansible.builtin.template:
src: .env.j2
dest: "{{ kottos_directory }}/.env"
owner: "{{ kottos_user }}"
group: "{{ kottos_group }}"
mode: '0640'
notify: restart kottos
- name: Template systemd unit
become: true
ansible.builtin.template:
src: kottos.service.j2
dest: /etc/systemd/system/kottos.service
owner: root
group: root
mode: '0644'
notify: restart kottos
- name: Enable and start kottos service
become: true
ansible.builtin.systemd:
name: kottos
enabled: true
state: started
daemon_reload: true
- name: Flush handlers before validation probes
ansible.builtin.meta: flush_handlers
# ── Validation ──────────────────────────────────────────────────────────
# Registry is the only endpoint that responds with a deterministic JSON
# payload without requiring an MCP session, so we probe it. Agent ports
# are exercised by Daedalus's health-poll loop once registered.
- name: Validate Kottos registry responds
ansible.builtin.uri:
url: "http://localhost:{{ kottos_registry_port | default(24100) }}/.well-known/mcp/server.json"
status_code: 200
return_content: true
register: registry_check
retries: 10
delay: 3
until: registry_check.status == 200
handlers:
- name: restart kottos
become: true
ansible.builtin.systemd:
name: kottos
state: restarted

View File

@@ -0,0 +1,114 @@
# Kottos — fast-agent configuration (rendered by Ansible)
# ------------------------------------------------------------------
# Committed-to-kottos copy is the local-dev equivalent; Ansible overwrites
# it with this rendered file on deploy. MCP server URLs are parametrised
# so the same template renders correctly for Ouranos (.incus) and Virgo
# (.virgo / .taurus) — each environment's host_vars supplies the base URLs.
default_model: {{ kottos_default_model | default('openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf') }}
# ── Model Capabilities ──────────────────────────────────────────────────────
# Declares capabilities for models not in fast-agent's ModelDatabase.
# vision: true adds image/jpeg, image/png, image/webp to the tokenizer list.
model_capabilities:
vision: {{ kottos_model_vision | default(true) | string | lower }}
context_window: {{ kottos_model_context_window | default(192000) }}
max_output_tokens: {{ kottos_model_max_output_tokens | default(16384) }}
# ── LLM Providers ───────────────────────────────────────────────────────────
openai:
base_url: {{ kottos_openai_base_url | default('http://nyx.helu.ca:22079/v1') }}
mcp:
servers:
# ── Web search via SearXNG (argos) ───────────────────────────────────────
argos:
transport: http
url: "{{ kottos_argos_url | default('http://miranda.incus:25534/mcp') }}"
# ── Knowledge graph — Neo4j ──────────────────────────────────────────────
neo4j_cypher:
transport: http
url: "{{ kottos_neo4j_cypher_url | default('http://circe.helu.ca:22034/mcp') }}"
# ── Shell + file operations — Kernos (Caliban) ───────────────────────────
kernos_scotty:
transport: http
url: "{{ kottos_kernos_scotty_url | default('http://caliban.incus:22062/mcp') }}"
load_on_start: false
# ── Agent S computer automation — Rommie on Caliban ──────────────────────
rommie:
transport: http
url: "{{ kottos_rommie_url | default('http://caliban.incus:20361/mcp') }}"
load_on_start: false
# ── Git repository management — Gitea MCP ────────────────────────────────
gitea:
transport: http
url: "{{ kottos_gitea_url | default('http://miranda.incus:25535/mcp') }}"
# ── Grafana observability ───────────────────────────────────────────────
grafana:
transport: http
url: "{{ kottos_grafana_url | default('http://miranda.incus:25533/mcp') }}"
# ── Shell + file operations — Kernos (Korax) ─────────────────────────────
kernos_harper:
transport: http
url: "{{ kottos_kernos_harper_url | default('http://korax.helu.ca:20261/mcp') }}"
load_on_start: false
# ── Angelia messaging ───────────────────────────────────────────────────
# Auth header provided by fastagent.secrets.yaml (vault-rendered).
angelia:
transport: http
url: "{{ kottos_angelia_url | default('https://ouranos.helu.ca/mcp/') }}"
# ── GitHub MCP Server (local Docker, stdio) ──────────────────────────────
# GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml
github:
command: "docker"
args:
- "run"
- "-i"
- "--rm"
- "-e"
- "GITHUB_PERSONAL_ACCESS_TOKEN"
- "ghcr.io/github/github-mcp-server"
# ── Library/framework documentation — Context7 (local stdio) ─────────────
context7:
command: "npx"
args: ["-y", "@upstash/context7-mcp"]
# ── Current time and timezone (local stdio) ──────────────────────────────
time:
command: "mcp-server-time"
args: ["--local-timezone={{ kottos_timezone | default('America/Toronto') }}"]
# ── Mnemosyne knowledge search — workspace-scoped ────────────────────────
# Auth is a long-lived team JWT supplied by fastagent.secrets.yaml
# (forward_inbound_auth=false — Mnemosyne validates the team JWT).
mnemosyne:
transport: http
url: "{{ kottos_mnemosyne_url | default('https://mnemosyne.ouranos.helu.ca/mcp/') }}"
# ── Kottos internal sub-agents ───────────────────────────────────────────
# These stay on localhost regardless of environment — Pallas serves the
# sub-agents on the same host as the top-level agents.
research:
transport: http
url: "http://localhost:{{ kottos_research_port | default(24150) }}/mcp"
tech_research:
transport: http
url: "http://localhost:{{ kottos_tech_research_port | default(24151) }}/mcp"
logger:
type: none
level: {{ kottos_fastagent_log_level | default('info') }}
progress_display: false
show_chat: false
show_tools: false
truncate_tools: true

View File

@@ -0,0 +1,27 @@
# Kottos — fast-agent secrets (rendered by Ansible from the vault)
# ------------------------------------------------------------------
# Never commit the rendered file. Each value here pulls from a vault
# variable — if a vault variable is missing, Ansible will fail the
# template step with a clear error before the file is written.
#
# Same structure as fastagent.config.yaml; values merge with secrets
# taking precedence (fast-agent deep-merges the two).
openai:
api_key: "{{ vault_kottos_openai_api_key }}"
mcp:
servers:
github:
env:
GITHUB_PERSONAL_ACCESS_TOKEN: "{{ vault_kottos_github_pat }}"
angelia:
headers:
Authorization: "Bearer {{ vault_kottos_angelia_bearer }}"
# Long-lived team JWT minted in Daedalus admin UI.
# See kottos/README.md § "Mnemosyne memory" for the rotation procedure.
mnemosyne:
headers:
Authorization: "Bearer {{ vault_kottos_mnemosyne_jwt }}"

View File

@@ -0,0 +1,33 @@
[Unit]
Description=Kottos — Pallas FastAgent runtime ({{ kottos_host | default(inventory_hostname) }})
# Wants= only pulls network-online.target into the boot transaction; After=
# is what actually orders this unit behind it. Both must name the same
# target, otherwise pallas can start before the network is usable and fail
# its first connections to the upstream MCP servers.
Wants=network-online.target
After=network-online.target
[Service]
Type=simple
User={{ kottos_user }}
Group={{ kottos_group }}
WorkingDirectory={{ kottos_directory }}
EnvironmentFile={{ kottos_directory }}/.env
ExecStart={{ kottos_directory }}/.venv/bin/pallas
Restart=always
RestartSec=5
# Journal is the durable sink (Alloy picks up via loki.source.journal and
# relabels SyslogIdentifier=kottos into {service="pallas", project="kottos"}
# for Loki). Stdout from pallas is already JSON thanks to
# PALLAS_LOG_STDOUT=1 set in the .env file.
StandardOutput=journal
StandardError=journal
SyslogIdentifier=kottos
# Pallas needs to reach localhost sibling agents + upstream MCP servers
# and read its own .venv / agents.yaml / config files. No hardening flags
# that would block those paths.
NoNewPrivileges=false
ProtectSystem=false
ProtectHome=false
PrivateTmp=false
[Install]
WantedBy=multi-user.target

48
ansible/kottos/stage.yml Normal file
View File

@@ -0,0 +1,48 @@
# Stage a Kottos release on the Ansible controller: check the repository out
# at {{ kottos_rel }} and pack it into {{ rel_dir }}/kottos_{{ kottos_rel }}.tar.
# Expects rel_dir, repo_dir and kottos_rel to be supplied by the inventory.
- name: Stage Kottos release tarball
  hosts: localhost
  gather_facts: false
  vars:
    archive_path: "{{rel_dir}}/kottos_{{kottos_rel}}.tar"
    kottos_repo_url: "ssh://git@git.helu.ca:22022/r/kottos.git"
    kottos_repo_dir: "{{repo_dir}}/kottos"
  tasks:
    - name: Ensure release directory exists
      ansible.builtin.file:
        path: "{{rel_dir}}"
        state: directory
        mode: '0755'

    - name: Ensure repo directory exists
      ansible.builtin.file:
        path: "{{repo_dir}}"
        state: directory
        mode: '0755'

    # One task covers both cases: the git module clones when the checkout is
    # absent and fetches + checks out the requested version when it exists.
    # force discards local modifications in the staging checkout. Failures
    # (auth, network, unknown ref) abort the play instead of being ignored.
    - name: Clone or update Kottos repository at the requested release
      ansible.builtin.git:
        repo: "{{kottos_repo_url}}"
        dest: "{{kottos_repo_dir}}"
        version: "{{kottos_rel}}"
        accept_hostkey: true
        update: true
        force: true

    # VCS metadata, local virtualenvs, bytecode caches and any locally
    # rendered secrets file are kept out of the release tarball.
    - name: Create release archive
      ansible.builtin.archive:
        path: "{{kottos_repo_dir}}"
        dest: "{{archive_path}}"
        format: tar
        exclude_path:
          - "{{kottos_repo_dir}}/.git"
          - "{{kottos_repo_dir}}/.venv"
          - "{{kottos_repo_dir}}/__pycache__"
          - "{{kottos_repo_dir}}/fastagent.secrets.yaml"

View File

@@ -312,6 +312,78 @@ groups:
summary: "Daedalus S3 error rate above 1%" summary: "Daedalus S3 error rate above 1%"
description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes." description: "Daedalus S3 error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
# ============================================================================
# Mnemosyne Application Alerts
# ============================================================================
# One scrape job, ``mnemosyne``, on the nginx-fronted /metrics endpoint.
# The Django app container hosts the single prometheus_client registry that
# both django-prometheus (HTTP + Celery) and mcp_server.metrics (MCP tool
# call counters) write to, so "MCP is broken" signals show up as
# ``mcp_tool_invocations_total{status="error"}`` on the same job rather
# than a separate up{} series.
- name: mnemosyne_alerts
rules:
- alert: MnemosyneDown
expr: up{job="mnemosyne"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Mnemosyne is down"
description: "The Mnemosyne /metrics endpoint has been unreachable for more than 2 minutes. Both the Django app and the MCP server (same container family) are presumed unavailable."
- alert: MnemosyneHighErrorRate
expr: |
sum(rate(django_http_responses_total_by_status_total{job="mnemosyne",status=~"5.."}[5m]))
/ sum(rate(django_http_responses_total_by_status_total{job="mnemosyne"}[5m])) > 0.05
for: 5m
labels:
severity: warning
annotations:
summary: "Mnemosyne HTTP 5xx error rate above 5%"
description: "Mnemosyne is returning HTTP 5xx errors at {{ $value | humanizePercentage }} of requests over the last 5 minutes."
- alert: MnemosyneSlowResponses
expr: |
histogram_quantile(0.95,
sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job="mnemosyne"}[5m]))
) > 5
for: 5m
labels:
severity: warning
annotations:
summary: "Mnemosyne p95 response time above 5s"
description: "Mnemosyne p95 response latency is {{ $value | printf \"%.2f\" }}s over the last 5 minutes."
# MCP tool-call error surface — owned by mcp_server.metrics on the
# same /metrics endpoint. This complements MnemosyneDown by catching
# "app is up but the MCP layer is sick" — e.g. auth token lookups are
# failing, or Neo4j vector search is 500-ing.
- alert: MnemosyneMCPToolErrors
expr: |
sum(rate(mcp_tool_invocations_total{job="mnemosyne",status="error"}[5m]))
/ sum(rate(mcp_tool_invocations_total{job="mnemosyne"}[5m])) > 0.10
for: 5m
labels:
severity: warning
annotations:
summary: "Mnemosyne MCP tool error rate above 10%"
description: "MCP tool calls are erroring at {{ $value | humanizePercentage }} of invocations — check the mcp container logs in Loki ({service=\"mnemosyne\", component=\"mcp\"})."
# Celery queue depth — high pending count usually means the embedding
# worker is stuck or throttled by the embedding provider. Requires
# ``celery-prometheus-exporter`` or similar to emit ``celery_queue_length``;
# if that is not deployed yet, this rule simply never fires.
- alert: MnemosyneCeleryBacklog
expr: |
sum by (queue) (celery_queue_length{queue=~"embedding|batch|celery"}) > 100
for: 10m
labels:
severity: warning
annotations:
summary: "Mnemosyne Celery backlog on {{ $labels.queue }}"
description: "Celery queue '{{ $labels.queue }}' has {{ $value }} pending tasks for more than 10 minutes — check the worker logs in Loki ({service=\"mnemosyne\", component=\"worker\"})."
# Red Panda Seal of Approval 🐼 # Red Panda Seal of Approval 🐼
# "If the metrics aren't red, go back to bed" # "If the metrics aren't red, go back to bed"
{% endraw %} {% endraw %}

View File

@@ -0,0 +1,23 @@
# Grafana dashboard file provider
# Deployed to: /etc/grafana/provisioning/dashboards/puck.yaml
#
# Grafana polls the ``path`` every ``updateIntervalSeconds`` and re-imports
# any JSON file it finds. Each dashboard JSON lives in that directory and
# is owned by Ansible — operators should not edit dashboards through the
# Grafana UI (changes won't survive a deploy; export the final JSON and
# land it in this role).
apiVersion: 1
providers:
- name: 'puck'
orgId: 1
folder: 'Puck Services'
folderUid: puck-services
type: file
disableDeletion: false
editable: true
allowUiUpdates: false
updateIntervalSeconds: 30
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: false

View File

@@ -208,6 +208,32 @@
group: grafana group: grafana
mode: '750' mode: '750'
# Directory that holds Grafana dashboard *provider* definitions; the
# provider file templated by the next task is written into it.
- name: Ensure Grafana dashboard provisioning directory exists
ansible.builtin.file:
path: /etc/grafana/provisioning/dashboards
state: directory
owner: grafana
group: grafana
mode: '750'
# Provider definition pointing Grafana at /var/lib/grafana/dashboards.
# A change to this file triggers the "restart grafana" handler.
- name: Template Grafana dashboard provider (file source → /var/lib/grafana/dashboards)
ansible.builtin.template:
src: "dashboards_provider.yml.j2"
dest: "/etc/grafana/provisioning/dashboards/puck.yaml"
owner: grafana
group: grafana
mode: '640'
notify: restart grafana
# Deliberately no "restart grafana" handler here: the file provider re-scans
# /var/lib/grafana/dashboards every updateIntervalSeconds (30s), so a changed
# dashboard JSON is re-imported without bouncing Grafana.
- name: Template Puck Services dashboard (Mnemosyne + Pallas + Daedalus)
ansible.builtin.template:
src: "puck_services_dashboard.json.j2"
dest: "/var/lib/grafana/dashboards/puck_services.json"
owner: grafana
group: grafana
mode: '640'
- name: Template Grafana main configuration - name: Template Grafana main configuration
ansible.builtin.template: ansible.builtin.template:
src: "grafana.ini.j2" src: "grafana.ini.j2"

View File

@@ -47,7 +47,18 @@ scrape_configs:
- job_name: 'daedalus' - job_name: 'daedalus'
static_configs: static_configs:
- targets: ['puck.incus:22181'] - targets: ['{{ daedalus_metrics_host }}:{{ daedalus_metrics_port }}']
metrics_path: '/metrics'
scrape_interval: 15s
# Mnemosyne — single /metrics endpoint on the app container serves both
# django-prometheus HTTP/Celery metrics and the MCP server's tool-call
# counters (the mcp_server.metrics module registers into the same
# prometheus_client process registry on the Django side). The mcp
# container itself does not expose /metrics, so scrape the app (WSGI) side.
- job_name: 'mnemosyne'
static_configs:
- targets: ['{{ mnemosyne_metrics_host }}:{{ mnemosyne_metrics_port }}']
metrics_path: '/metrics' metrics_path: '/metrics'
scrape_interval: 15s scrape_interval: 15s

View File

@@ -0,0 +1,242 @@
{
"title": "Puck Services — Logs & Health",
"uid": "puck-services-logs",
"tags": ["puck", "logs", "mnemosyne", "pallas", "daedalus"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"editable": true,
"fiscalYearStartMonth": 0,
"weekStart": "",
"refresh": "30s",
"time": {"from": "now-1h", "to": "now"},
"templating": {
"list": [
{
"name": "loki",
"type": "datasource",
"query": "loki",
"current": {"selected": false, "text": "Loki", "value": "Loki"},
"hide": 0,
"label": "Loki datasource"
},
{
"name": "prom",
"type": "datasource",
"query": "prometheus",
"current": {"selected": false, "text": "Prometheus", "value": "Prometheus"},
"hide": 0,
"label": "Prometheus datasource"
}
]
},
"panels": [
{
"id": 1,
"type": "row",
"title": "Mnemosyne",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}
},
{
"id": 2,
"type": "timeseries",
"title": "Mnemosyne — log rate by level",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 1},
"targets": [
{
"refId": "A",
"expr": "sum by (level) (rate({service=\"mnemosyne\"} | json [5m]))",
"legendFormat": "{% raw %}{{level}}{% endraw %}"
}
],
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 3,
"type": "logs",
"title": "Mnemosyne — errors (last 25)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 1},
"targets": [
{
"refId": "A",
"expr": "{service=\"mnemosyne\"} | json | level=\"ERROR\"",
"maxLines": 25
}
],
"options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
},
{
"id": 4,
"type": "stat",
"title": "Mnemosyne — HTTP 5xx rate",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 8, "x": 0, "y": 9},
"targets": [
{
"refId": "A",
"expr": "sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\",status=~\"5..\"}[5m])) / clamp_min(sum(rate(django_http_responses_total_by_status_total{job=\"mnemosyne\"}[5m])), 0.0001)"
}
],
"options": {
"reduceOptions": {"calcs": ["lastNotNull"]},
"colorMode": "value",
"textMode": "auto"
},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
"id": 5,
"type": "stat",
"title": "Mnemosyne — p95 latency",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 8, "x": 8, "y": 9},
"targets": [
{
"refId": "A",
"expr": "histogram_quantile(0.95, sum by (le) (rate(django_http_requests_latency_including_middlewares_seconds_bucket{job=\"mnemosyne\"}[5m])))"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 1}, {"color": "red", "value": 5}]}}}
},
{
"id": 6,
"type": "stat",
"title": "Mnemosyne — MCP tool error rate",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 8, "x": 16, "y": 9},
"targets": [
{
"refId": "A",
"expr": "sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\",status=\"error\"}[5m])) / clamp_min(sum(rate(mcp_tool_invocations_total{job=\"mnemosyne\"}[5m])), 0.0001)"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.05}, {"color": "red", "value": 0.10}]}}}
},
{
"id": 10,
"type": "row",
"title": "Pallas (Kottos agents)",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 13}
},
{
"id": 11,
"type": "timeseries",
"title": "Pallas — log rate by agent (component)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 14},
"targets": [
{
"refId": "A",
"expr": "sum by (component) (rate({service=\"pallas\", project=\"kottos\"} | json [5m]))",
"legendFormat": "{% raw %}{{component}}{% endraw %}"
}
],
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 12,
"type": "logs",
"title": "Pallas — forward trace errors (opaque MCP transport failures)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 14},
"targets": [
{
"refId": "A",
"expr": "{service=\"pallas\", project=\"kottos\"} |= \"pallas.forward.trace\" | json | level=~\"ERROR|WARNING\"",
"maxLines": 25
}
],
"options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
},
{
"id": 13,
"type": "logs",
"title": "Pallas — last 25 ERROR lines (any agent)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 22},
"targets": [
{
"refId": "A",
"expr": "{service=\"pallas\", project=\"kottos\"} | json | level=\"ERROR\"",
"maxLines": 25
}
],
"options": {"showLabels": true, "showTime": true, "wrapLogMessage": true}
},
{
"id": 20,
"type": "row",
"title": "Daedalus",
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 30}
},
{
"id": 21,
"type": "timeseries",
"title": "Daedalus — log rate by level",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 31},
"targets": [
{
"refId": "A",
"expr": "sum by (level) (rate({service=\"daedalus\"} | json [5m]))",
"legendFormat": "{% raw %}{{level}}{% endraw %}"
}
],
"options": {"legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}}
},
{
"id": 22,
"type": "stat",
"title": "Daedalus — HTTP 5xx rate",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 31},
"targets": [
{
"refId": "A",
"expr": "sum(rate(daedalus_http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(daedalus_http_requests_total[5m])), 0.0001)"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
"fieldConfig": {"defaults": {"unit": "percentunit", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 0.01}, {"color": "red", "value": 0.05}]}}}
},
{
"id": 23,
"type": "stat",
"title": "Daedalus — MCP p95 latency",
"datasource": {"type": "prometheus", "uid": "${prom}"},
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 31},
"targets": [
{
"refId": "A",
"expr": "histogram_quantile(0.95, sum by (le) (rate(daedalus_mcp_request_duration_seconds_bucket[5m])))"
}
],
"options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
"fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green"}, {"color": "orange", "value": 5}, {"color": "red", "value": 30}]}}}
},
{
"id": 24,
"type": "logs",
"title": "Daedalus — errors (last 25)",
"datasource": {"type": "loki", "uid": "${loki}"},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 39},
"targets": [
{
"refId": "A",
"expr": "{service=\"daedalus\"} | json | level=\"ERROR\"",
"maxLines": 25
}
],
"options": {"showLabels": false, "showTime": true, "wrapLogMessage": true}
}
]
}

View File

@@ -44,3 +44,9 @@
- name: Deploy Agent S - name: Deploy Agent S
import_playbook: agent_s/deploy.yml import_playbook: agent_s/deploy.yml
# Kottos: stage first (clones the repo and builds the release tarball),
# then deploy — keep the two imports in this order.
- name: Stage Kottos (Pallas FastAgent runtime)
import_playbook: kottos/stage.yml
- name: Deploy Kottos
import_playbook: kottos/deploy.yml

View File

@@ -163,6 +163,96 @@ The registry includes model capabilities on each agent entry:
} }
``` ```
## Deployment
Kottos runs two ways:
1. **Locally on caliban**, hand-started for iteration (`kottos` from the repo root). This is the flow documented above in *Quickstart*.
2. **In Ouranos / Virgo / Taurus via Ansible**, as a `systemd`-managed `pallas` process on the puck.incus container. This is the pipeline that feeds the Puck Services dashboard in Grafana.
### Ansible role
Lives in `ouranos/ansible/kottos/`:
| File | Purpose |
|---|---|
| `deploy.yml` | Main playbook — user/group, venv, systemd unit, config templating, registry probe. |
| `stage.yml` | Clones `git.helu.ca/r/kottos` at `{{ kottos_rel }}` and creates the release tarball. |
| `kottos.service.j2` | systemd unit. `SyslogIdentifier=kottos`, `StandardOutput=journal`, `PALLAS_LOG_STDOUT=1` via the env file. |
| `.env.j2` | Runtime environment for `pallas` — logging config, `PALLAS_AGENTS_CONFIG`. |
| `agents.yaml.j2` | Deployment topology with host/ports pulled from inventory. |
| `fastagent.config.yaml.j2` | LLM provider + MCP server URLs, parametric per environment. |
| `fastagent.secrets.yaml.j2` | API keys and auth tokens, rendered from Ansible Vault. |
### Inventory
Host variables live in `inventory/host_vars/puck.incus.yml` under **Kottos Configuration**:
```yaml
kottos_user: kottos
kottos_group: kottos
kottos_directory: /srv/kottos
kottos_host: "puck.incus"
kottos_registry_port: 24100
kottos_harper_port: 24101
kottos_scotty_port: 24102
kottos_research_port: 24150
kottos_tech_research_port: 24151
pallas_log_level: INFO
kottos_default_model: "openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf"
kottos_openai_base_url: "http://nyx.helu.ca:22079/v1"
# ...plus one entry per downstream MCP URL so each environment overrides freely
```
Every host variable is parametric — Virgo's `puck.virgo.yml` (or wherever the Pallas host lives) can override any value without touching the templates.
### Vault
Four vault keys are required, all documented in `inventory/group_vars/all/vault.yml.example`:
| Key | Used for |
|---|---|
| `vault_kottos_openai_api_key` | OpenAI-compatible LLM endpoint (nyx Qwen in Ouranos). |
| `vault_kottos_github_pat` | `GITHUB_PERSONAL_ACCESS_TOKEN` for the local GitHub MCP Docker container. |
| `vault_kottos_angelia_bearer` | Bearer token accepted by the Angelia MCP server. |
| `vault_kottos_mnemosyne_jwt` | Long-lived team JWT from Daedalus admin UI — Mnemosyne validates it on every `search_memory` call and scopes results to this team's workspaces. |
### Deploying
Wired into `site.yml`:
```bash
cd ansible
ansible-playbook kottos/stage.yml # clone repo + build tarball (local)
ansible-playbook kottos/deploy.yml # deploy + template + start
```
Or run the full site (`ansible-playbook site.yml`) — kottos's stage + deploy steps are the last block in the sequence.
### Logs
The unit logs to the journal under the identifier `kottos`. To follow it on the host:
```bash
sudo journalctl -u kottos -f --output=cat | jq .
```
Alloy's journal source on puck relabels `__journal_syslog_identifier=kottos` to `{service="pallas", project="kottos"}` and forwards the stream to Loki. Everything shows up in Grafana's *Puck Services — Logs & Health* dashboard under the **Pallas** row, with per-agent colouring driven by the `component` JSON field (`harper`, `scotty`, `research`, `tech_research`).
For per-agent follow-along:
```logql
{service="pallas", project="kottos", component="harper"} | json
```
For the opaque-MCP-transport-failure trace stream (see Pallas's bearer-forwarding incident history):
```logql
{service="pallas", project="kottos"} |= "pallas.forward.trace" | json
```
See [logging.md](logging.md) for the full label schema + level policy + add-a-new-service guide.
## Downstream MCP Servers ## Downstream MCP Servers
| Server | Host | URL | | Server | Host | URL |

173
docs/logging.md Normal file
View File

@@ -0,0 +1,173 @@
# Unified Logging — Mnemosyne, Pallas, Daedalus
PPLG is the single destination for every service's logs. This document describes the label schema every service emits, the two transports Alloy uses to collect logs, and the level policy that keeps INFO output actionable.
The three in-scope services today are **Mnemosyne**, **Pallas** (running as Kottos/Mentor/Iolaus), and **Daedalus**. The same patterns generalise to any future service that deploys on a `docker`-enabled host or under `systemd+journald`.
## Label schema
Every Loki log stream carries these labels, and nothing else:
| Label | Example values | Source |
|---|---|---|
| `service` | `mnemosyne`, `pallas`, `daedalus`, `athena`, `kairos`, `angelia` | Docker compose project name (container logs) **or** explicit systemd relabel rule (journal logs) |
| `component` | `app`, `mcp`, `worker`, `nginx`, `harper`, `scotty`, `research`, `tech_research` | Docker compose service name **or** per-agent `ContextVar` (Pallas) |
| `project` | `kottos` (Pallas only) | `agents.yaml` `name:` field read by `pallas.log.set_project()` |
| `hostname` | `puck.incus`, `caliban.incus` | Alloy's `inventory_hostname` template var |
| `environment` | `ouranos`, `virgo`, `taurus` | `deployment_environment` from Ansible group_vars |
**Everything else is a JSON field in the log body**, not a label. That includes `level`, `logger`, `funcName`, `lineno`, `message`, `request_id`, `workspace_id`, `agent`, `tool`, `duration_ms`, and any `extra={...}` kwargs the application passed in. LogQL's `| json` pipeline parses these on-query — keeping them out of the label index is what keeps Loki fast.
## Level policy
Same rules for every service. Health-check `200 OK`s live in DEBUG, never in INFO.
| Level | Meaning |
|---|---|
| `ERROR` | Broken; requires human attention. |
| `WARNING` | Degraded but self-recovering — retries, skipped items, missing optional config. |
| `INFO` | Lifecycle events and failures. Start, ready, shutdown, preflight, LLM provider validation. 200 OKs on health endpoints are **not** INFO. |
| `DEBUG` | Per-request detail, successful health probes, verbose traces. Enable on demand when troubleshooting. |
Mnemosyne enforces this with `mnemosyne.log_filters.SuppressHealthAccessFilter` on Django/gunicorn access loggers; Pallas with `_HealthAccessFilter` on `uvicorn.access`; Daedalus with the equivalent filter in `daedalus.logging`.
## Two transports, one Alloy
Alloy on each host uses exactly two sources for application logs. Pick whichever matches the service's runtime model — **don't** invent a third.
### 1. Docker socket (for compose projects)
`discovery.docker` enumerates every running container, and `loki.source.docker` tails their stdout via the `json-file` driver. Compose project → `service` label, compose service → `component` label. One block covers every compose project on the host, current and future.
**Requirements on the service side:**
- Emit JSON lines to **stdout**, one per log record. Mnemosyne uses `python-json-logger`; Daedalus uses `structlog`; any Python service can do the same.
- Pin the logging driver to `json-file` with bounded rotation in `docker-compose.yaml`:
```yaml
x-logging: &default-logging
driver: json-file
options:
tag: "{{.Name}}"
max-size: "10m"
max-file: "5"
services:
app:
# ...
logging: *default-logging
```
`json-file` is Docker's default, but pinning it defensively guarantees Alloy sees the same driver on every host.
- On the Alloy host, the `alloy` user must be in the `docker` group to read `/var/run/docker.sock`. The `ouranos/ansible/alloy/` role handles this.
### 2. Systemd journal (for systemd-managed units)
`loki.source.journal` tails journald. A `loki.relabel "journal_<host>"` block translates `__journal_syslog_identifier` → `service` / `project` labels so Pallas-managed agents land alongside Docker-based services with the same schema.
**Requirements on the service side:**
- Emit JSON to **stdout** (journald captures it with `PRIORITY=6` INFO by default).
- The systemd unit must set a distinctive `SyslogIdentifier=` — the Alloy relabel block keys off this.
- Under Pallas, set `PALLAS_LOG_STDOUT=1` in the unit's `EnvironmentFile`. Also set `PALLAS_LOG_FILE=/dev/null` to disable the rotating file sink (journald is already durable).
Example, from `ouranos/ansible/kottos/kottos.service.j2`:
```ini
[Service]
...
EnvironmentFile=/srv/kottos/.env
ExecStart=/srv/kottos/.venv/bin/pallas
StandardOutput=journal
StandardError=journal
SyslogIdentifier=kottos
```
And the matching Alloy relabel rule on puck:
```alloy
loki.relabel "journal_puck" {
forward_to = []
rule {
source_labels = ["__journal_syslog_identifier"]
regex = "kottos"
target_label = "service"
replacement = "pallas"
}
rule {
source_labels = ["__journal_syslog_identifier"]
regex = "kottos"
target_label = "project"
replacement = "kottos"
}
// ...
}
```
## Per-service reference
### Mnemosyne (Docker compose on puck)
- Logging config: `mnemosyne/mnemosyne/mnemosyne/settings.py` → `LOGGING` dict using `pythonjsonlogger.json.JsonFormatter`.
- Component attribution: `MNEMOSYNE_COMPONENT` env var set per docker-compose service (`init`, `app`, `mcp`, `worker`). The settings module reads it into `static_fields.component`.
- Health-filter: `mnemosyne.log_filters.SuppressHealthAccessFilter` on the `access` handler.
- Metrics: `/metrics` on the nginx container (port 23181) — served by django-prometheus on the app container plus `mcp_server.metrics` (shared `prometheus_client` registry).
- Scrape job: `mnemosyne` (see `ouranos/ansible/pplg/prometheus.yml.j2`).
- Alerts: `mnemosyne_alerts` group in `ouranos/ansible/pplg/alert_rules.yml.j2`.
### Pallas — Kottos (systemd on puck via Ansible role `ouranos/ansible/kottos/`)
- Logging config: `pallas/pallas/log.py` → `setup_logging()` with `PALLAS_LOG_STDOUT=1`.
- Component attribution: `pallas.log.set_agent_component(name)` is called by `_start_agent()` inside each agent's asyncio task, setting a `contextvars.ContextVar` that the `_StaticFieldsFilter` reads per record. Each agent (harper, scotty, research, tech_research) carries its own value without leaking across tasks.
- Project attribution: `pallas.log.set_project(deploy_name)` is called once in `main()` from `agents.yaml`'s `name:`. For Kottos this renders as `project="kottos"` on every record.
- Deployed by: `ansible-playbook kottos/deploy.yml` (wired into `site.yml`).
- Metrics: none today — Pallas is observed through logs only. A future phase will add a `prometheus_client` endpoint on the registry port for `pallas_agent_requests_total{agent=…}` and `pallas_downstream_mcp_errors_total{server=…}`.
### Daedalus (Docker compose on puck)
- Logging config: `daedalus/backend/daedalus/logging.py` — `structlog` JSON processor chain, already production-ready.
- Component attribution: `structlog.contextvars.bind_contextvars(service="daedalus", component="api")` at app startup.
- Health-filter: `_SuppressHealthAccessFilter` on uvicorn's access logger.
- Metrics: `/metrics` on the api container (port 22181).
- Scrape job: `daedalus`.
- Alerts: `daedalus_alerts` group.
## Useful LogQL queries
Once the pipeline is live, the "troubleshooting is a nightmare" problem becomes three-click queries in Grafana Explore:
```logql
# All Mnemosyne errors in the last 15m
{service="mnemosyne"} | json | level="ERROR"
# Everything Harper did in the last hour
{service="pallas", project="kottos", component="harper"} | json
# The infamous pallas.forward.trace stream (MCP transport failures)
{service="pallas", project="kottos"} |= "pallas.forward.trace"
# Cross-service trace of a single request (requires X-Request-Id propagation
# — not yet implemented; Phase 1.5 nice-to-have)
{environment="ouranos"} | json | request_id="<paste-id>"
# ERROR-level log spike in Daedalus by path (log-side proxy for a 5xx spike)
sum by (path) (rate({service="daedalus"} | json | level="ERROR" [5m]))
```
The **Puck Services — Logs & Health** dashboard in Grafana (`/etc/grafana/provisioning/dashboards/puck.yaml` → `/var/lib/grafana/dashboards/puck_services.json`) has these pre-wired as panels per service row.
## Adding a new service
If you're adding a service to puck (or any Ouranos/Virgo host with this stack):
1. **Emit JSON to stdout** with `service`/`component` as static fields. Copy Mnemosyne's settings pattern or Pallas's `_StaticFieldsFilter`.
2. **Pick a transport:**
- Docker compose → add the `x-logging: &default-logging` anchor + `logging: *default-logging` on each service. Done. No Alloy changes needed.
- systemd → set `SyslogIdentifier=<name>` on the unit and add a two-rule relabel block to the host's `loki.relabel "journal_<host>"` block.
3. **Expose `/metrics`** if the service is in Python — `prometheus_client` plus either `django-prometheus` or `prometheus_fastapi_instrumentator`.
4. **Add a scrape job** in `ouranos/ansible/pplg/prometheus.yml.j2` (parametrise the target — `{{ <service>_metrics_host }}:{{ <service>_metrics_port }}`) and wire the defaults into the host's `host_vars`.
5. **Add alerts** in `ouranos/ansible/pplg/alert_rules.yml.j2`. At minimum: `Down`, `HighErrorRate`. Use the metric names the service actually exposes — no dead rules.
6. **Optional**: add panels to the Puck Services dashboard JSON.
No new transport. No per-service Alloy block. No custom log format.