diff --git a/ansible/freecad_mcp/.env.j2 b/ansible/freecad_mcp/.env.j2 index 51bf2b2..7fc3c27 100644 --- a/ansible/freecad_mcp/.env.j2 +++ b/ansible/freecad_mcp/.env.j2 @@ -4,18 +4,17 @@ # ============================================================================= # MCP Transport Configuration # ============================================================================= -FREECAD_MCP_TRANSPORT=http -FREECAD_MCP_HTTP_PORT={{ freecad_mcp_port }} +FREECAD_TRANSPORT=http +FREECAD_HTTP_PORT={{ freecad_mcp_port }} # ============================================================================= # FreeCAD Connection Mode # ============================================================================= -FREECAD_MCP_MODE={{ freecad_mcp_mode | default('xmlrpc') }} -FREECAD_MCP_XMLRPC_HOST={{ freecad_mcp_xmlrpc_host | default('localhost') }} -FREECAD_MCP_XMLRPC_PORT={{ freecad_mcp_xmlrpc_port | default('9875') }} -FREECAD_MCP_TIMEOUT_MS={{ freecad_mcp_timeout_ms | default('30000') }} +FREECAD_MODE={{ freecad_mcp_mode | default('xmlrpc') }} +FREECAD_XMLRPC_PORT={{ freecad_mcp_xmlrpc_port | default('9875') }} +FREECAD_TIMEOUT_MS={{ freecad_mcp_timeout_ms | default('30000') }} # ============================================================================= # Logging # ============================================================================= -FREECAD_MCP_LOG_LEVEL={{ freecad_mcp_log_level | default('INFO') }} +FREECAD_LOG_LEVEL={{ freecad_mcp_log_level | default('INFO') }} diff --git a/ansible/freecad_mcp/README.md b/ansible/freecad_mcp/README.md index 9e77bde..76a8e59 100644 --- a/ansible/freecad_mcp/README.md +++ b/ansible/freecad_mcp/README.md @@ -1,8 +1,7 @@ # FreeCAD Robust MCP Server — Ansible Deployment Deploys the [FreeCAD Robust MCP Server](https://pypi.org/project/freecad-robust-mcp/) -to Caliban as a systemd service with HTTP transport, ready for MCP Switchboard -consumption. +to Caliban as a systemd service with HTTP transport. 
## Architecture @@ -12,8 +11,8 @@ consumption. │ │ │ ┌──────────────────────┐ │ │ │ freecad-mcp.service │ │ -│ │ (streamable-http) │◄─── :22032 ──────────┤◄── MCP Switchboard -│ │ venv + PyPI package │ │ (oberon.incus) +│ │ (streamable-http) │◄─── :22061 ──────────┤◄── MCP Client +│ │ venv + PyPI package │ │ │ └──────────────────────┘ │ │ │ │ │ │ xmlrpc :9875 │ @@ -25,6 +24,18 @@ consumption. └─────────────────────────────────────────────────┘ ``` +## FreeCAD bridge required for tool calls + +The service starts and answers the MCP `initialize` handshake **without** FreeCAD +running — the XML-RPC connection to FreeCAD is only attempted on the first CAD +tool call (lazy connect). So a green Ansible healthcheck means "transport up", +**not** "FreeCAD reachable". + +Actual CAD tool calls require FreeCAD running with the Robust MCP Bridge +workbench started, listening on XML-RPC `localhost:9875`. Standing up that bridge +(GUI or headless) on Caliban is a separate step from getting this service to +boot. 
+ ## Prerequisites - Caliban host in Ansible inventory (already exists in Ouranos) @@ -62,7 +73,7 @@ Add to `ansible/inventory/host_vars/caliban.incus.yml`: freecad_mcp_user: harper freecad_mcp_group: harper freecad_mcp_directory: /srv/freecad-mcp -freecad_mcp_port: 22032 +freecad_mcp_port: 22061 freecad_mcp_version: "0.5.0" ``` @@ -100,7 +111,7 @@ The playbook automatically validates the deployment by: You can also manually test: ```bash -curl -X POST http://caliban.incus:22032/mcp \ +curl -X POST http://caliban.incus:22061/mcp \ -H "Content-Type: application/json" \ -d '{"jsonrpc":"2.0","method":"initialize","id":1,"params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"curl","version":"1.0.0"}}}' ``` @@ -126,5 +137,4 @@ The systemd service runs with hardened settings: | `PrivateTmp` | `true` | Isolated /tmp namespace | | `ReadWritePaths` | `/srv/freecad-mcp` | Only app directory is writable | -This is significantly more hardened than the Kernos service (which needs -broad filesystem access for shell commands). + diff --git a/ansible/freecad_mcp/deploy.yml b/ansible/freecad_mcp/deploy.yml index 98893c4..cdd7455 100644 --- a/ansible/freecad_mcp/deploy.yml +++ b/ansible/freecad_mcp/deploy.yml @@ -216,3 +216,102 @@ ansible.builtin.systemd: name: freecad-mcp state: restarted + + +# ============================================================================= +# FreeCAD MCP Bridge (GUI) — runs FreeCAD on the XRDP desktop as principal_user, +# exposing the XML-RPC bridge on localhost:9875 that the MCP server connects to. 
+# ============================================================================= +- name: Deploy FreeCAD MCP Bridge (GUI) + hosts: freecad_mcp + tasks: + + - name: Ensure FreeCAD is installed + become: true + ansible.builtin.apt: + name: [freecad, tar] + state: present + update_cache: true + + - name: Create FreeCAD MCP bridge directory + become: true + become_user: "{{ principal_user }}" + ansible.builtin.file: + path: "{{ freecad_mcp_bridge_directory }}" + state: directory + mode: '0755' + + - name: Transfer and extract FreeCAD MCP bridge release + become: true + become_user: "{{ principal_user }}" + ansible.builtin.unarchive: + src: "~/rel/freecad_mcp_bridge_{{ freecad_mcp_git_ref }}.tar" + dest: "{{ freecad_mcp_bridge_directory }}" + notify: restart freecad-mcp-bridge + + - name: Template FreeCAD MCP bridge systemd service + become: true + ansible.builtin.template: + src: freecad-mcp-bridge.service.j2 + dest: /etc/systemd/system/freecad-mcp-bridge.service + owner: root + group: root + mode: '644' + notify: + - reload systemd + - restart freecad-mcp-bridge + + - name: Enable and start freecad-mcp-bridge service + become: true + ansible.builtin.systemd: + name: freecad-mcp-bridge + enabled: true + state: started + daemon_reload: true + + - name: Flush handlers to restart bridge before validation + ansible.builtin.meta: flush_handlers + + - name: Wait for FreeCAD XML-RPC bridge to listen + ansible.builtin.wait_for: + port: "{{ freecad_mcp_xmlrpc_port | default(9875) }}" + host: localhost + delay: 5 + timeout: 60 + + - name: Verify bridge is in GUI mode (FreeCAD.GuiUp via XML-RPC execute) + ansible.builtin.command: + argv: + - python3 + - -c + - | + import sys, xmlrpc.client + proxy = xmlrpc.client.ServerProxy( + "http://localhost:{{ freecad_mcp_xmlrpc_port | default(9875) }}", allow_none=True) + resp = proxy.execute("_result_ = bool(FreeCAD.GuiUp)") + if not (resp.get("success") and resp.get("result") is True): + sys.exit("Bridge reachable but not in GUI mode: %r" 
% resp) + print("FreeCAD bridge GUI mode confirmed") + register: bridge_gui_check + retries: 5 + delay: 5 + until: bridge_gui_check.rc == 0 + changed_when: false + + - name: Display bridge info + ansible.builtin.debug: + msg: >- + FreeCAD MCP Bridge running in GUI mode on {{ inventory_hostname }}, + XML-RPC localhost:{{ freecad_mcp_xmlrpc_port | default(9875) }} + + handlers: + - name: reload systemd + become: true + ansible.builtin.systemd: + daemon_reload: true + + - name: restart freecad-mcp-bridge + become: true + ansible.builtin.systemd: + name: freecad-mcp-bridge + state: restarted diff --git a/ansible/freecad_mcp/freecad-mcp-bridge.service.j2 b/ansible/freecad_mcp/freecad-mcp-bridge.service.j2 new file mode 100644 index 0000000..5841f30 --- /dev/null +++ b/ansible/freecad_mcp/freecad-mcp-bridge.service.j2 @@ -0,0 +1,21 @@ +[Unit] +Description=FreeCAD MCP XML-RPC Bridge (GUI) +After=network.target + +[Service] +Type=simple +User={{ principal_user }} +WorkingDirectory={{ freecad_mcp_bridge_directory }} +Environment=DISPLAY={{ freecad_mcp_bridge_display }} +Environment=XAUTHORITY=/home/{{ principal_user }}/.Xauthority +Environment=FREECAD_XMLRPC_PORT={{ freecad_mcp_xmlrpc_port | default('9875') }} +Environment=FREECAD_SOCKET_PORT={{ freecad_mcp_socket_port | default('9876') }} +ExecStart=/usr/bin/freecad {{ freecad_mcp_bridge_directory }}/freecad/RobustMCPBridge/freecad_mcp_bridge/startup_bridge.py +Restart=on-failure +RestartSec=10 +StandardOutput=journal +StandardError=journal +SyslogIdentifier=freecad-mcp-bridge + +[Install] +WantedBy=multi-user.target diff --git a/ansible/freecad_mcp/stage.yml b/ansible/freecad_mcp/stage.yml new file mode 100644 index 0000000..5fef8f0 --- /dev/null +++ b/ansible/freecad_mcp/stage.yml @@ -0,0 +1,46 @@ +--- +- name: Stage FreeCAD MCP bridge release tarball + hosts: localhost + gather_facts: false + vars: + freecad_mcp_archive: "{{rel_dir}}/freecad_mcp_bridge_{{freecad_mcp_git_ref}}.tar" + freecad_mcp_repo_url: 
"git@github.com:heluca/freecad-addon-robust-mcp-server.git" + freecad_mcp_repo_dir: "{{github_dir}}/freecad-addon-robust-mcp-server" + + tasks: + - name: Ensure release directory exists + file: + path: "{{rel_dir}}" + state: directory + mode: '755' + + - name: Ensure github directory exists + file: + path: "{{github_dir}}" + state: directory + mode: '755' + + - name: Clone freecad-addon-robust-mcp-server repository if not present + ansible.builtin.git: + repo: "{{freecad_mcp_repo_url}}" + dest: "{{freecad_mcp_repo_dir}}" + version: "{{freecad_mcp_git_ref}}" + accept_hostkey: true + register: freecad_mcp_clone + + - name: Fetch all remote branches and tags + ansible.builtin.command: git fetch --all + args: + chdir: "{{freecad_mcp_repo_dir}}" + when: freecad_mcp_clone is not changed + + - name: Pull latest changes + ansible.builtin.command: git pull + args: + chdir: "{{freecad_mcp_repo_dir}}" + when: freecad_mcp_clone is not changed + + - name: Create FreeCAD MCP bridge archive for specified release + ansible.builtin.command: git archive -o "{{freecad_mcp_archive}}" "{{freecad_mcp_git_ref}}" + args: + chdir: "{{freecad_mcp_repo_dir}}" diff --git a/ansible/gitea/deploy.yml b/ansible/gitea/deploy.yml index 0e5942e..c2fd503 100644 --- a/ansible/gitea/deploy.yml +++ b/ansible/gitea/deploy.yml @@ -18,6 +18,7 @@ - git-lfs - curl - memcached + - acl state: present update_cache: true @@ -187,8 +188,8 @@ --config {{ gitea_config_file }} --name "{{ gitea_oauth_name }}" --provider openidConnect - --key "{{ gitea_oauth2_client_id }}" - --secret "{{ gitea_oauth2_client_secret }}" + --key "{{ gitea_oauth_client_id }}" + --secret "{{ gitea_oauth_client_secret }}" --auto-discover-url "https://id.ouranos.helu.ca/.well-known/openid-configuration" --scopes "{{ gitea_oauth_scopes }}" --skip-local-2fa diff --git a/ansible/inventory/group_vars/all/vars.yml b/ansible/inventory/group_vars/all/vars.yml index d242ec8..d7490c6 100644 --- a/ansible/inventory/group_vars/all/vars.yml +++ 
b/ansible/inventory/group_vars/all/vars.yml @@ -41,6 +41,7 @@ openwebui_rel: 0.8.3 pulseaudio_module_xrdp_rel: devel searxng_oauth2_proxy_version: 7.6.0 # Git ref (branch, tag, or commit) - https://github.com/heluca/freecad-addon-robust-mcp-server +# Used for both the pip-installed MCP server and the staged GUI bridge tarball. freecad_mcp_git_ref: "main" # Docker image versions (third-party) diff --git a/ansible/inventory/host_vars/caliban.incus.yml b/ansible/inventory/host_vars/caliban.incus.yml index 0ad8bba..34c6ced 100644 --- a/ansible/inventory/host_vars/caliban.incus.yml +++ b/ansible/inventory/host_vars/caliban.incus.yml @@ -41,6 +41,12 @@ freecad_mcp_user: harper freecad_mcp_group: harper freecad_mcp_directory: /srv/freecad-mcp freecad_mcp_port: 22061 +freecad_mcp_xmlrpc_port: 9875 +freecad_mcp_socket_port: 9876 + +# FreeCAD MCP Bridge (GUI, runs as principal_user on the XRDP display) +freecad_mcp_bridge_directory: "/home/{{ principal_user }}/freecad-mcp-bridge" +freecad_mcp_bridge_display: ":10" # JupyterLab Configuration diff --git a/ansible/inventory/host_vars/puck.incus.yml b/ansible/inventory/host_vars/puck.incus.yml index b2d6752..ff13570 100644 --- a/ansible/inventory/host_vars/puck.incus.yml +++ b/ansible/inventory/host_vars/puck.incus.yml @@ -107,6 +107,12 @@ athena_directory: /srv/athena athena_port: 22481 athena_domain: "ouranos.helu.ca" +# Prometheus scrape targets (see pplg/prometheus.yml.j2, athena job) +athena_app_metrics_host: "puck.incus" +athena_app_metrics_port: 22481 +athena_web_metrics_host: "puck.incus" +athena_web_metrics_port: 22491 + # Casdoor SSO Credentials (from vault) athena_casdoor_client_id: "{{ vault_athena_oauth_client_id }}" athena_casdoor_client_secret: "{{ vault_athena_oauth_client_secret }}" diff --git a/ansible/kottos/.env.j2 b/ansible/kottos/.env.j2 deleted file mode 100644 index 04e64c4..0000000 --- a/ansible/kottos/.env.j2 +++ /dev/null @@ -1,24 +0,0 @@ -# Kottos runtime environment — rendered by Ansible from 
inventory host_vars. -# ------------------------------------------------------------------------ -# Loaded by systemd (EnvironmentFile=) and inherited by the pallas process. -# ``.env`` vars NOT set here come from pallas.server's defaults — tweak by -# adding the variable to host_vars and this template, not by editing the -# rendered file on the host. - -# ── Logging ───────────────────────────────────────────────────────────────── -# Stdout JSON is the preferred sink for systemd+journald+Alloy deployments. -# Rotating file sink is disabled by pointing PALLAS_LOG_FILE at /dev/null so -# we don't write every record twice. -PALLAS_LOG_STDOUT=1 -PALLAS_LOG_FILE=/dev/null -PALLAS_LOG_LEVEL={{ pallas_log_level | default('INFO') }} - -# ── Config location ───────────────────────────────────────────────────────── -# PALLAS_AGENTS_CONFIG can be overridden to point at a non-default topology -# (e.g. staging scenarios). Default: agents.yaml next to the working dir. -PALLAS_AGENTS_CONFIG={{ kottos_directory }}/agents.yaml - -# ── LLM provider / MCP server secrets ─────────────────────────────────────── -# Secrets are rendered into fastagent.secrets.yaml rather than env vars so -# fast-agent's existing YAML-merge logic applies. This block stays empty -# intentionally — the template exists for future per-host tunables. diff --git a/ansible/kottos/agents.yaml.j2 b/ansible/kottos/agents.yaml.j2 index af068ab..bdaa42c 100644 --- a/ansible/kottos/agents.yaml.j2 +++ b/ansible/kottos/agents.yaml.j2 @@ -1,43 +1,62 @@ -# Kottos — Deployment Configuration (rendered by Ansible) -# ------------------------------------------------------------------ -# Single source of truth for agent topology, ports, and registry -# metadata. Read by Pallas at startup. The kottos/agents.yaml -# committed in the kottos repo is the local-dev equivalent; Ansible -# overwrites it with this rendered version. 
-# -# Host + namespace + registry port come from inventory host_vars so -# Ouranos / Virgo / Taurus each get their own shape without template -# edits. +# Kottos — Deployment Configuration +# Single source of truth for agent topology, ports, and registry metadata. +# Read by Pallas at startup. name: kottos version: "1.0.0" -host: {{ kottos_agents_host | default(kottos_host) | default(inventory_hostname) }} -namespace: {{ kottos_namespace | default('ca.helu.kottos') }} -registry_port: {{ kottos_registry_port | default(24100) }} +host: {{ kottos_bind_host | default(inventory_hostname) }} +namespace: ca.helu.kottos +registry_port: {{ kottos_registry_port }} agents: harper: module: agents.harper - port: {{ kottos_harper_port | default(24101) }} + port: 24101 title: Harper description: "Scrappy engineer — rapid prototyping, hacking, and creative problem-solving" depends_on: [research, tech_research] +{% if kottos_harper_model is defined %} + model: {{ kottos_harper_model }} +{% endif %} scotty: module: agents.scotty - port: {{ kottos_scotty_port | default(24102) }} + port: 24102 title: Scotty description: "Systems administration expert — infrastructure diagnostics, security hardening, and keeping everything running" depends_on: [tech_research] +{% if kottos_scotty_model is defined %} + model: {{ kottos_scotty_model }} +{% endif %} research: module: agents.research - port: {{ kottos_research_port | default(24150) }} + port: 24150 title: Research Agent description: "Web search via Argos and knowledge graph via Neo4j" +{% if kottos_research_model is defined %} + model: {{ kottos_research_model }} + model_capabilities: + vision: {{ kottos_research_model_vision | default(true) }} + context_window: {{ kottos_research_model_context_window | default(16384) }} + max_output_tokens: {{ kottos_research_model_max_output_tokens | default(8192) }} +{% endif %} tech_research: module: agents.tech_research - port: {{ kottos_tech_research_port | default(24151) }} + port: 24151 title: Tech 
Research description: "Technical investigation — library comparisons, API docs, framework patterns, code examples" +{% if kottos_tech_research_model is defined %} + model: {{ kottos_tech_research_model }} +{% endif %} + + case: + module: agents.case + port: 24152 + title: CASE + description: "Field systems agent — SD card imaging, LAN scanning, and storage operations on korax.helu.ca" + depends_on: [] +{% if kottos_case_model is defined %} + model: {{ kottos_case_model }} +{% endif %} diff --git a/ansible/kottos/deploy.yml b/ansible/kottos/deploy.yml index ee6146a..ed47f36 100644 --- a/ansible/kottos/deploy.yml +++ b/ansible/kottos/deploy.yml @@ -1,10 +1,17 @@ --- -- name: Deploy Kottos (Pallas FastAgent runtime) +- name: Deploy Kottos AI Agent Platform hosts: ubuntu vars: ansible_common_remote_group: "{{ kottos_group | default([]) }}" allow_world_readable_tmpfiles: true + handlers: + - name: restart kottos + become: true + ansible.builtin.systemd: + name: kottos + state: restarted + tasks: - name: Check if host has kottos service ansible.builtin.set_fact: @@ -14,51 +21,84 @@ ansible.builtin.meta: end_host when: not has_kottos_service + - name: Install required packages + become: true + ansible.builtin.apt: + name: + - acl + - npm + - curl + state: present + update_cache: true + - name: Create Kottos group become: true ansible.builtin.group: name: "{{ kottos_group }}" state: present - - name: Create kottos user + - name: Create Kottos user become: true ansible.builtin.user: name: "{{ kottos_user }}" group: "{{ kottos_group }}" - home: "/home/{{ kottos_user }}" + home: "{{ kottos_directory }}" shell: /bin/bash - system: false - create_home: true + system: true + create_home: false - - name: Add keeper_user to kottos group (optional — enables passwordless tailing) + - name: Add keeper_user to kottos group become: true ansible.builtin.user: name: "{{ keeper_user }}" groups: "{{ kottos_group }}" append: true - when: keeper_user is defined + + - name: Add kottos user to 
docker group + become: true + ansible.builtin.user: + name: "{{ kottos_user }}" + groups: docker + append: true + notify: restart kottos - name: Reset connection to pick up new group membership ansible.builtin.meta: reset_connection - - name: Create Kottos install directory + - name: Create Kottos directory become: true ansible.builtin.file: path: "{{ kottos_directory }}" owner: "{{ kottos_user }}" group: "{{ kottos_group }}" state: directory - mode: '0750' + mode: '750' - - name: Ensure base packages for Python + Docker MCP workflows + - name: Create vendored Pallas directory + become: true + ansible.builtin.file: + path: "{{ kottos_directory }}/vendor/pallas" + owner: "{{ kottos_user }}" + group: "{{ kottos_group }}" + state: directory + mode: '750' + + - name: Ensure tar is installed for unarchive task become: true ansible.builtin.apt: name: - tar - - python3 - - python3-venv - - python3-dev - - git + state: present + update_cache: true + + - name: Ensure Python 3.13, venv, dev headers, and ACL are installed + become: true + ansible.builtin.apt: + name: + - python3.13 + - python3.13-venv + - python3.13-dev + - acl state: present update_cache: true @@ -69,43 +109,52 @@ dest: "{{ kottos_directory }}" owner: "{{ kottos_user }}" group: "{{ kottos_group }}" - mode: '0550' + mode: '550' notify: restart kottos - - name: Ensure .venv directory ownership is correct + - name: Transfer and unarchive vendored Pallas source become: true - ansible.builtin.file: - path: "{{ kottos_directory }}/.venv" + ansible.builtin.unarchive: + src: "~/rel/pallas_{{ pallas_rel }}.tar" + dest: "{{ kottos_directory }}/vendor/pallas" owner: "{{ kottos_user }}" group: "{{ kottos_group }}" - state: directory - recurse: true - when: ansible_facts['file'] is defined or true + mode: '550' + notify: restart kottos - - name: Create virtual environment for Kottos + - name: Rewrite pallas-mcp dependency to use vendored local path + become: true + ansible.builtin.replace: + path: "{{ kottos_directory 
}}/pyproject.toml" + regexp: '"pallas-mcp @ git\+ssh://[^"]+"' + replace: '"pallas-mcp @ file://{{ kottos_directory }}/vendor/pallas"' + notify: restart kottos + + - name: Create virtual environment for Kottos (Python 3.13) become: true become_user: "{{ kottos_user }}" ansible.builtin.command: - cmd: "python3 -m venv {{ kottos_directory }}/.venv/" + cmd: "python3.13 -m venv {{ kottos_directory }}/.venv/" creates: "{{ kottos_directory }}/.venv/bin/activate" - - name: Install wheel in the virtualenv + - name: Install wheel and mcp-server-time in virtualenv become: true become_user: "{{ kottos_user }}" ansible.builtin.pip: name: - wheel + - mcp-server-time state: latest virtualenv: "{{ kottos_directory }}/.venv" - - name: Install Kottos (pyproject.toml — pulls in pallas-mcp and fast-agent-mcp) + - name: Install Kottos (and its rewritten local pallas-mcp) in virtualenv become: true become_user: "{{ kottos_user }}" ansible.builtin.pip: - chdir: "{{ kottos_directory }}/kottos" + chdir: "{{ kottos_directory }}" name: . virtualenv: "{{ kottos_directory }}/.venv" - virtualenv_command: python3 -m venv + virtualenv_command: python3.13 -m venv notify: restart kottos - name: Template agents.yaml @@ -115,7 +164,7 @@ dest: "{{ kottos_directory }}/agents.yaml" owner: "{{ kottos_user }}" group: "{{ kottos_group }}" - mode: '0640' + mode: '640' notify: restart kottos - name: Template fastagent.config.yaml @@ -125,38 +174,27 @@ dest: "{{ kottos_directory }}/fastagent.config.yaml" owner: "{{ kottos_user }}" group: "{{ kottos_group }}" - mode: '0640' + mode: '640' notify: restart kottos - - name: Template fastagent.secrets.yaml (vault-rendered) + - name: Template fastagent.secrets.yaml become: true ansible.builtin.template: src: fastagent.secrets.yaml.j2 dest: "{{ kottos_directory }}/fastagent.secrets.yaml" owner: "{{ kottos_user }}" group: "{{ kottos_group }}" - mode: '0600' - notify: restart kottos - no_log: true - - - name: Template runtime .env (PALLAS_LOG_STDOUT etc.) 
- become: true - ansible.builtin.template: - src: .env.j2 - dest: "{{ kottos_directory }}/.env" - owner: "{{ kottos_user }}" - group: "{{ kottos_group }}" - mode: '0640' + mode: '640' notify: restart kottos - - name: Template systemd unit + - name: Template systemd service file become: true ansible.builtin.template: src: kottos.service.j2 dest: /etc/systemd/system/kottos.service owner: root group: root - mode: '0644' + mode: '644' notify: restart kottos - name: Enable and start kottos service @@ -167,26 +205,15 @@ state: started daemon_reload: true - - name: Flush handlers before validation probes + - name: Flush handlers to restart service before validation ansible.builtin.meta: flush_handlers - # ── Validation ────────────────────────────────────────────────────────── - # Registry is the only endpoint that responds with a deterministic JSON - # payload without requiring an MCP session, so we probe it. Agent ports - # are exercised by Daedalus's health-poll loop once registered. - - name: Validate Kottos registry responds + - name: Validate Kottos registry liveness ansible.builtin.uri: - url: "http://localhost:{{ kottos_registry_port | default(24100) }}/.well-known/mcp/server.json" + url: "http://localhost:{{ kottos_registry_port }}/live" status_code: 200 return_content: true - register: registry_check + register: kottos_live retries: 10 - delay: 3 - until: registry_check.status == 200 - - handlers: - - name: restart kottos - become: true - ansible.builtin.systemd: - name: kottos - state: restarted + delay: 5 + until: kottos_live.status == 200 diff --git a/ansible/kottos/fastagent.config.yaml.j2 b/ansible/kottos/fastagent.config.yaml.j2 index d011380..e884a92 100644 --- a/ansible/kottos/fastagent.config.yaml.j2 +++ b/ansible/kottos/fastagent.config.yaml.j2 @@ -1,66 +1,64 @@ -# Kottos — fast-agent configuration (rendered by Ansible) -# ------------------------------------------------------------------ -# Committed-to-kottos copy is the local-dev equivalent; Ansible 
overwrites -# it with this rendered file on deploy. MCP server URLs are parametrised -# so the same template renders correctly for Ouranos (.incus) and Virgo -# (.virgo / .taurus) — each environment's host_vars supplies the base URLs. +# Kottos — Configuration +# LLM provider and MCP server settings. +# Secrets (api_key, tokens) live in fastagent.secrets.yaml (gitignored) +# +# This template is intended to be byte-identical between environments +# (Virgo dev, Taurus prod). All environment-specific values come from +# host_vars or group_vars/all/vars.yml. Do NOT introduce environment- +# specific literals here. -default_model: {{ kottos_default_model | default('openai.Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf') }} +# Default Model Definition +default_model: {{ kottos_default_model }} -# ── Model Capabilities ────────────────────────────────────────────────────── # Declares capabilities for models not in fast-agent's ModelDatabase. # vision: true adds image/jpeg, image/png, image/webp to the tokenizer list. 
model_capabilities: - vision: {{ kottos_model_vision | default(true) | string | lower }} - context_window: {{ kottos_model_context_window | default(192000) }} - max_output_tokens: {{ kottos_model_max_output_tokens | default(16384) }} + vision: {{ kottos_model_vision }} + context_window: {{ kottos_model_context_window }} + max_output_tokens: {{ kottos_model_max_output_tokens }} -# ── LLM Providers ─────────────────────────────────────────────────────────── +# LLM Providers +anthropic: + base_url: {{ kottos_anthropic_base_url }} +generic: + base_url: {{ kottos_generic_base_url }} openai: - base_url: {{ kottos_openai_base_url | default('http://nyx.helu.ca:22079/v1') }} + base_url: {{ kottos_openai_base_url }} +# MCP Servers — alphabetical to match the dev sample (kottos/fastagent.config.yaml) mcp: servers: - # ── Web search via SearXNG (argos) ─────────────────────────────────────── - argos: - transport: http - url: "{{ kottos_argos_url | default('http://miranda.incus:25534/mcp') }}" - - # ── Knowledge graph — Neo4j ────────────────────────────────────────────── - neo4j_cypher: - transport: http - url: "{{ kottos_neo4j_cypher_url | default('http://circe.helu.ca:22034/mcp') }}" - - # ── Shell + file operations — Kernos (Caliban) ─────────────────────────── - kernos_scotty: - transport: http - url: "{{ kottos_kernos_scotty_url | default('http://caliban.incus:22062/mcp') }}" - load_on_start: false - - # ── Agent S computer automation — Rommie on Caliban ────────────────────── - rommie: - transport: http - url: "{{ kottos_rommie_url | default('http://caliban.incus:20361/mcp') }}" - load_on_start: false - - # ── Git repository management — Gitea MCP ──────────────────────────────── - gitea: - transport: http - url: "{{ kottos_gitea_url | default('http://miranda.incus:25535/mcp') }}" - - # ── Grafana observability ─────────────────────────────────────────────── - grafana: - transport: http - url: "{{ kottos_grafana_url | default('http://miranda.incus:25533/mcp') }}" - - # 
── Shell + file operations — Kernos (Korax) ───────────────────────────── + ## Andromeda Shell & File Operations — Kernos for Harper + ### Auth header provided by fastagent.secrets.yaml (per-agent Kernos token) andromeda: transport: http - url: "{{ kottos_kernos_harper_url | default('http://caliban.helu.ca:20261/mcp') }}" - load_on_start: false + url: "{{ kottos_andromeda_mcp_url }}" - # ── GitHub MCP Server (local Docker, stdio) ────────────────────────────── - # GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml + ## Argos Web Search & Page Fetch + ### No Auth + argos: + transport: http + url: "{{ kottos_argos_mcp_url }}" + + ## Argus Shell & File Operations — Kernos for Scotty + ### Auth header provided by fastagent.secrets.yaml (per-agent Kernos token) + argus: + transport: http + url: "{{ kottos_argus_mcp_url }}" + + ## Context7 Library/framework documentation (local stdio) + context7: + command: "npx" + args: ["-y", "@upstash/context7-mcp"] + + ## Gitea Git Repository Management + ### No client auth (server-side auth only) + gitea: + transport: http + url: "{{ kottos_gitea_mcp_url }}" + + ## GitHub MCP Server (local Docker, stdio) + ### GITHUB_PERSONAL_ACCESS_TOKEN provided by fastagent.secrets.yaml github: command: "docker" args: @@ -71,38 +69,57 @@ mcp: - "GITHUB_PERSONAL_ACCESS_TOKEN" - "ghcr.io/github/github-mcp-server" - # ── Library/framework documentation — Context7 (local stdio) ───────────── - context7: - command: "npx" - args: ["-y", "@upstash/context7-mcp"] + ## Grafana Observability + ### No Auth + grafana: + transport: http + url: "{{ kottos_grafana_mcp_url }}" - # ── Current time and timezone (local stdio) ────────────────────────────── - time: - command: "mcp-server-time" - args: ["--local-timezone={{ kottos_timezone | default('America/Toronto') }}"] + ## Korax Shell & File Operations — Kernos for CASE + ### Auth header provided by fastagent.secrets.yaml (per-agent Kernos token) + korax: + transport: http + url: "{{ 
kottos_korax_mcp_url }}" + load_on_start: false - # ── Mnemosyne knowledge search — workspace-scoped ──────────────────────── - # Auth is a long-lived team JWT supplied by fastagent.secrets.yaml - # (forward_inbound_auth=false — Mnemosyne validates the team JWT). + ## Mnemosyne Knowledge Library — workspace-scoped + ### Auth is a long-lived team JWT rendered into fastagent.secrets.yaml from + ### the OCI Vault entry {env}-mnemosyne-kottos-token. mnemosyne: transport: http - url: "{{ kottos_mnemosyne_url | default('https://mnemosyne.ouranos.helu.ca/mcp/') }}" + url: "{{ kottos_mnemosyne_mcp_url }}" - # ── Kottos internal sub-agents ─────────────────────────────────────────── - # These stay on localhost regardless of environment — Pallas serves the - # sub-agents on the same host as the top-level agents. + ## Neo4j Cypher Memory Graph + neo4j_cypher: + transport: http + url: "{{ kottos_neo4j_mcp_url }}" + + ## Kottos internal sub-agents + ### Research (Web, Knowledge) research: transport: http - url: "http://localhost:{{ kottos_research_port | default(24150) }}/mcp" + url: "{{ kottos_research_mcp_url }}" + ## Rommie Agent S Computer Use Agent + rommie: + transport: http + url: "{{ kottos_rommie_mcp_url }}" + load_on_start: false + + ### Research (Web, Context7) tech_research: transport: http - url: "http://localhost:{{ kottos_tech_research_port | default(24151) }}/mcp" + url: "{{ kottos_tech_research_mcp_url }}" + + ## Current time and time calculator (local stdio) + time: + command: "{{ kottos_directory }}/.venv/bin/mcp-server-time" + args: ["--local-timezone={{ kottos_timezone | default('America/Toronto') }}"] logger: - type: none - level: {{ kottos_fastagent_log_level | default('info') }} - progress_display: false - show_chat: false - show_tools: false + type: console + level: info + progress_display: true + show_chat: true + show_tools: true truncate_tools: true diff --git a/ansible/kottos/fastagent.secrets.yaml.j2 b/ansible/kottos/fastagent.secrets.yaml.j2 index 
525e795..70e7db7 100644 --- a/ansible/kottos/fastagent.secrets.yaml.j2 +++ b/ansible/kottos/fastagent.secrets.yaml.j2 @@ -1,27 +1,35 @@ -# Kottos — fast-agent secrets (rendered by Ansible from the vault) -# ------------------------------------------------------------------ -# Never commit the rendered file. Each value here pulls from a vault -# variable — if a vault variable is missing, Ansible will fail the -# template step with a clear error before the file is written. -# -# Same structure as fastagent.config.yaml; values merge with secrets -# taking precedence (fast-agent deep-merges the two). +# Kottos — Secrets +# Managed by Ansible. Values fetched from OCI Vault at deploy time. +# Merges with fastagent.config.yaml (secrets take precedence). openai: - api_key: "{{ vault_kottos_openai_api_key }}" + api_key: "{{ kottos_openai_api_key }}" + +anthropic: + api_key: "{{ kottos_anthropic_api_key }}" mcp: servers: - github: - env: - GITHUB_PERSONAL_ACCESS_TOKEN: "{{ vault_kottos_github_pat }}" - - angelia: + # Per-agent Kernos MCP bearer tokens so Kernos can distinguish callers. + # Kottos itself does not consume these — they are surfaced to each agent + # module via fast-agent's server auth headers below. + argus: headers: - Authorization: "Bearer {{ vault_kottos_angelia_bearer }}" + Authorization: "Bearer {{ scotty_kernos_mcp_token }}" + andromeda: + headers: + Authorization: "Bearer {{ harper_kernos_mcp_token }}" + korax: + headers: + Authorization: "Bearer {{ case_kernos_mcp_token }}" - # Long-lived team JWT minted in Daedalus admin UI. - # See kottos/README.md § "Mnemosyne memory" for the rotation procedure. 
+ # Downstream MCP bearer tokens + arke: + headers: + Authorization: "Bearer {{ kottos_arke_mcp_token }}" mnemosyne: headers: - Authorization: "Bearer {{ vault_kottos_mnemosyne_jwt }}" + Authorization: "Bearer {{ mnemosyne_kottos_token }}" + github: + env: + GITHUB_PERSONAL_ACCESS_TOKEN: "{{ kottos_github_pa_token }}" diff --git a/ansible/kottos/kottos.service.j2 b/ansible/kottos/kottos.service.j2 index 39f72e9..60741f5 100644 --- a/ansible/kottos/kottos.service.j2 +++ b/ansible/kottos/kottos.service.j2 @@ -1,6 +1,6 @@ [Unit] -Description=Kottos — Pallas FastAgent runtime ({{ kottos_host | default(inventory_hostname) }}) -After=network.target +Description=Kottos AI Agent Platform +After=network-online.target Wants=network-online.target [Service] @@ -8,26 +8,17 @@ Type=simple User={{ kottos_user }} Group={{ kottos_group }} WorkingDirectory={{ kottos_directory }} -EnvironmentFile={{ kottos_directory }}/.env +Environment="PATH={{ kottos_directory }}/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" ExecStart={{ kottos_directory }}/.venv/bin/pallas Restart=always -RestartSec=5 +RestartSec=10 -# Journal is the durable sink (Alloy picks up via loki.source.journal and -# relabels SyslogIdentifier=kottos into {service="pallas", project="kottos"} -# for Loki). Stdout from pallas is already JSON thanks to -# PALLAS_LOG_STDOUT=1 set in the .env file. -StandardOutput=journal -StandardError=journal -SyslogIdentifier=kottos - -# Pallas needs to reach localhost sibling agents + upstream MCP servers -# and read its own .venv / agents.yaml / config files. No hardening flags -# that would block those paths. 
-NoNewPrivileges=false -ProtectSystem=false -ProtectHome=false -PrivateTmp=false +# Security hardening +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=true +ReadWritePaths={{ kottos_directory }} [Install] WantedBy=multi-user.target diff --git a/ansible/kottos/remove.yml b/ansible/kottos/remove.yml new file mode 100644 index 0000000..f7396b2 --- /dev/null +++ b/ansible/kottos/remove.yml @@ -0,0 +1,34 @@ +--- +- name: Remove Kottos AI Agent Platform + hosts: ubuntu + become: true + + tasks: + - name: Check if host has kottos service + ansible.builtin.set_fact: + has_kottos_service: "{{ 'kottos' in services | default([]) }}" + + - name: Skip hosts without kottos service + ansible.builtin.meta: end_host + when: not has_kottos_service + + - name: Stop and disable kottos service + ansible.builtin.systemd: + name: kottos + state: stopped + enabled: false + ignore_errors: true + + - name: Remove systemd service file + ansible.builtin.file: + path: /etc/systemd/system/kottos.service + state: absent + + - name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: true + + - name: Remove Kottos directory + ansible.builtin.file: + path: "{{ kottos_directory }}" + state: absent diff --git a/ansible/kottos/stage.yml b/ansible/kottos/stage.yml index fd8af3e..96d62e0 100644 --- a/ansible/kottos/stage.yml +++ b/ansible/kottos/stage.yml @@ -1,48 +1,84 @@ -- name: Stage Kottos release tarball +--- +- name: Stage Kottos and Pallas release tarballs hosts: localhost gather_facts: false vars: - archive_path: "{{rel_dir}}/kottos_{{kottos_rel}}.tar" + kottos_archive_path: "{{ rel_dir }}/kottos_{{ kottos_rel }}.tar" kottos_repo_url: "ssh://git@git.helu.ca:22022/r/kottos.git" - kottos_repo_dir: "{{repo_dir}}/kottos" + kottos_repo_dir: "{{ repo_dir }}/kottos" + pallas_archive_path: "{{ rel_dir }}/pallas_{{ pallas_rel }}.tar" + pallas_repo_url: "ssh://git@git.helu.ca:22022/r/pallas.git" + pallas_repo_dir: "{{ repo_dir }}/pallas" tasks: - name: Ensure 
release directory exists - file: - path: "{{rel_dir}}" + ansible.builtin.file: + path: "{{ rel_dir }}" state: directory mode: '755' - name: Ensure repo directory exists - file: - path: "{{repo_dir}}" + ansible.builtin.file: + path: "{{ repo_dir }}" state: directory mode: '755' + # --- Kottos ------------------------------------------------------------ - name: Clone Kottos repository if not present ansible.builtin.git: - repo: "{{kottos_repo_url}}" - dest: "{{kottos_repo_dir}}" - version: "{{kottos_rel}}" + repo: "{{ kottos_repo_url }}" + dest: "{{ kottos_repo_dir }}" + version: "{{ kottos_rel }}" accept_hostkey: true - register: git_clone + register: kottos_clone ignore_errors: true - - name: Fetch latest changes if already cloned - ansible.builtin.git: - repo: "{{kottos_repo_url}}" - dest: "{{kottos_repo_dir}}" - version: "{{kottos_rel}}" - update: true - force: true + - name: Fetch all remote branches and tags (kottos) + ansible.builtin.command: git fetch --all + args: + chdir: "{{ kottos_repo_dir }}" + when: kottos_clone is not changed + changed_when: false - - name: Create release archive - ansible.builtin.archive: - path: "{{kottos_repo_dir}}" - dest: "{{archive_path}}" - format: tar - exclude_path: - - "{{kottos_repo_dir}}/.git" - - "{{kottos_repo_dir}}/.venv" - - "{{kottos_repo_dir}}/__pycache__" - - "{{kottos_repo_dir}}/fastagent.secrets.yaml" + - name: Pull latest changes (kottos) + ansible.builtin.command: git pull + args: + chdir: "{{ kottos_repo_dir }}" + when: kottos_clone is not changed + changed_when: false + + - name: Create Kottos archive for specified release + ansible.builtin.command: git archive -o "{{ kottos_archive_path }}" "{{ kottos_rel }}" + args: + chdir: "{{ kottos_repo_dir }}" + changed_when: true + + # --- Pallas (kottos runtime dependency) -------------------------------- + - name: Clone Pallas repository if not present + ansible.builtin.git: + repo: "{{ pallas_repo_url }}" + dest: "{{ pallas_repo_dir }}" + version: "{{ pallas_rel }}" 
+ accept_hostkey: true + register: pallas_clone + ignore_errors: true + + - name: Fetch all remote branches and tags (pallas) + ansible.builtin.command: git fetch --all + args: + chdir: "{{ pallas_repo_dir }}" + when: pallas_clone is not changed + changed_when: false + + - name: Pull latest changes (pallas) + ansible.builtin.command: git pull + args: + chdir: "{{ pallas_repo_dir }}" + when: pallas_clone is not changed + changed_when: false + + - name: Create Pallas archive for specified release + ansible.builtin.command: git archive -o "{{ pallas_archive_path }}" "{{ pallas_rel }}" + args: + chdir: "{{ pallas_repo_dir }}" + changed_when: true diff --git a/ansible/pplg/alert_rules.yml.j2 b/ansible/pplg/alert_rules.yml.j2 index fdfa65e..ef6b2b2 100644 --- a/ansible/pplg/alert_rules.yml.j2 +++ b/ansible/pplg/alert_rules.yml.j2 @@ -244,6 +244,23 @@ groups: summary: "High log ingestion rate" description: "Loki is receiving logs at {{ $value | humanize }}/s which may indicate excessive logging" + # ============================================================================ + # Django Application Alerts (generic — any Django app exporting the counter) + # ============================================================================ + # Apps emit django_superuser_logins_total from a user_logged_in signal when + # the authenticating user is a superuser. The job/component labels identify + # which app fired; forensic detail (user, IP) is in the matching Loki line. + - name: django_alerts + rules: + - alert: DjangoSuperuserLogin + expr: increase(django_superuser_logins_total[5m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: "Superuser login on {{ $labels.job }}" + description: "A superuser account just logged in to {{ $labels.job }} (component {{ $labels.component }}). This account is rarely used — confirm it was expected. Forensic detail (user, IP) in Loki: {service=\"{{ $labels.job }}\"} |= \"event=superuser_login\"." 
+ # ============================================================================ # Daedalus Application Alerts # ============================================================================ diff --git a/ansible/pplg/prometheus.yml.j2 b/ansible/pplg/prometheus.yml.j2 index 3b8eaee..1a4c530 100644 --- a/ansible/pplg/prometheus.yml.j2 +++ b/ansible/pplg/prometheus.yml.j2 @@ -68,6 +68,21 @@ scrape_configs: labels: component: web + # Athena — same shape as Mnemosyne: the Django container exposes /metrics + # (django-prometheus) proxied via nginx on the app port; a separate + # nginx-prometheus-exporter sidecar re-exposes the web container's + # stub_status in Prometheus format on the web-metrics port. + - job_name: 'athena' + metrics_path: '/metrics' + scrape_interval: 15s + static_configs: + - targets: ['{{ athena_app_metrics_host }}:{{ athena_app_metrics_port }}'] + labels: + component: app + - targets: ['{{ athena_web_metrics_host }}:{{ athena_web_metrics_port }}'] + labels: + component: web + # Pallas — each deployment is one scrape target (registry port). # Pallas uses a single process-global registry, so per-agent /metrics # endpoints serve the same snapshot; the `agent` dimension is carried diff --git a/docs/alloy.md b/docs/alloy.md new file mode 100644 index 0000000..8e073c2 --- /dev/null +++ b/docs/alloy.md @@ -0,0 +1,289 @@ +# Alloy Log & Metric Collection + +Grafana Alloy runs as a **native systemd service** (never in Docker) on every +Ouranos host with `alloy` in its `services` list. It collects logs and forwards +them to **Loki on Prospero** (`http://prospero.incus:3100/loki/api/v1/push`), +and scrapes host/container metrics that it **remote-writes** to **Prometheus on +Prospero** (`http://prospero.incus:9090/api/v1/write`). + +## Overview + +- **Default config:** [`ansible/alloy/config.alloy.j2`](../ansible/alloy/config.alloy.j2) — journal-only fallback for hosts without a dedicated config. 
+- **Per-host config:** [`ansible/alloy/<hostname>/config.alloy.j2`](../ansible/alloy/) — overrides the default when present. +- **Selection:** [`alloy/deploy.yml`](../ansible/alloy/deploy.yml) stat-checks `<hostname>/config.alloy.j2` on the controller; if it exists, that template is rendered, otherwise the default is used. +- **Log destination:** Loki on `prospero.incus:3100` via `loki.write "default"`. +- **Metric destination:** Prometheus on `prospero.incus:9090` via `prometheus.remote_write "default"`. +- **Environment:** every stream is labelled `environment="{{ deployment_environment }}"` (`ouranos`) and `hostname="{{ inventory_hostname }}"`. +- **Deploy:** `ansible-playbook alloy/deploy.yml` (optionally `--limit <hostname>`). + +`deploy.yml` also adds the `alloy` user to the host's `docker` group when the +host has `docker` in its services — this is what lets Alloy read +`/var/run/docker.sock` for the Docker discovery and cAdvisor blocks below. + +## Log Sources + +Ouranos collects logs through three mechanisms. New Dockerised services should +use the **Docker socket discovery** path (preferred); the per-service syslog +listener is the older pattern, still in use on several hosts. + +### 1. Systemd journal (native services) + +Every host includes a `loki.source.journal` component capturing all systemd +unit output. By default journal entries are labelled `job="systemd"`; a +`loki.relabel` component can promote specific units to a richer label set (see +[Journal relabeling](#journal-relabeling-native-services)). + +This is the correct path for **native systemd services** (binaries managed by a +`.service` unit) — they write to stdout/stderr, systemd captures it in the +journal, and Alloy forwards it. No syslog port or log file needed. + +### 2. Docker socket discovery (preferred for containers) + +> **Reference implementation:** [`ansible/alloy/puck/config.alloy.j2`](../ansible/alloy/puck/config.alloy.j2).
+> Puck is currently the lead host for this pattern; other Docker hosts still use +> per-service syslog listeners and should migrate to this model over time. + +A **single** pair of `discovery.docker` + `loki.source.docker` blocks collects +stdout from **every Compose project on the host**, current and future — no +per-service configuration. Container log streams are labelled from Docker's own +Compose metadata: + +- `service` ← Compose **project** name (e.g. `athena`, `mnemosyne`, `daedalus`) +- `component` ← Compose **service** name (e.g. `app`, `mcp`, `nginx`, `worker`) +- `container` ← raw container name (for non-Compose `docker run` containers) + +```alloy +discovery.docker "containers" { + host = "unix:///var/run/docker.sock" + refresh_interval = "30s" +} + +discovery.relabel "containers" { + targets = discovery.docker.containers.targets + + rule { // Compose project → service + source_labels = ["__meta_docker_container_label_com_docker_compose_project"] + target_label = "service" + } + rule { // Compose service → component + source_labels = ["__meta_docker_container_label_com_docker_compose_service"] + target_label = "component" + } + rule { // container name (non-Compose) + source_labels = ["__meta_docker_container_name"] + regex = "/(.*)" + target_label = "container" + } + rule { // fall back to container name as service + source_labels = ["service", "container"] + separator = "@" + regex = "@(.+)" + target_label = "service" + } +} + +loki.source.docker "containers" { + host = "unix:///var/run/docker.sock" + targets = discovery.relabel.containers.output + forward_to = [loki.write.default.receiver] + labels = { + hostname = "{{ inventory_hostname }}", + environment = "{{ deployment_environment }}", + } +} +``` + +**Why this is preferred over syslog listeners:** + +- **Zero per-service wiring.** Adding a new Compose project requires no Alloy + change — it is discovered automatically and labelled by its project name. 
+- **No startup ordering hazard.** It scrapes Docker's default `json-file` log + driver, so containers never block on an Alloy listener being up (contrast the + syslog driver, below). +- **Consistent `{service, component}` schema** across apps, matching the + Prometheus `component` label used by multi-target scrape jobs (app vs web). + +**Requirements:** + +- The Compose project must use the default **`json-file`** log driver (i.e. it + must *not* set `logging: { driver: syslog }`). The app must log to **stdout**. +- The `alloy` user needs read access to `/var/run/docker.sock` (handled by + `deploy.yml` adding it to the `docker` group on Docker hosts). +- The `service` label is the **Compose project name**, which defaults to the + deploy directory's basename. Confirm it (`docker compose config` → `name:`) + when an alert or dashboard depends on a specific `service=` selector. + +### 3. Docker syslog driver (legacy, per-service) + +The older pattern: each container ships logs via Docker's `syslog` driver to a +dedicated Alloy `loki.source.syslog` listener on a localhost port, labelled with +a static `job`. + +```alloy +loki.source.syslog "kairos_logs" { + listener { + address = "127.0.0.1:{{ kairos_syslog_port }}" + protocol = "tcp" + syslog_format = "{{ syslog_format }}" // rfc3164 + labels = { + job = "kairos", + hostname = "{{ inventory_hostname }}", + environment = "{{ deployment_environment }}", + } + } + forward_to = [loki.write.default.receiver] +} +``` + +Container side, in the service's `docker-compose.yml.j2`: + +```yaml +logging: + driver: syslog + options: + syslog-address: "tcp://127.0.0.1:{{ kairos_syslog_port }}" + syslog-format: "{{ syslog_format | default('rfc3164') }}" +``` + +Ports follow the `514XX` convention and live in the host's `host_vars`. + +> ⚠️ **Ordering hazard.** The listener must exist before the container starts. 
+> If `docker compose up` runs while the Alloy listener is not bound, the +> container fails immediately with `failed to initialize logging driver: dial +> tcp 127.0.0.1:<port>: connect: connection refused`. Deploy/verify Alloy on the +> host *before* deploying a syslog-driver service. This hazard is the main +> reason new services should prefer the Docker-socket path instead. + +> **Note — labels differ between the two Docker paths.** The syslog listener +> sets `job="<service>"` (no `service`/`component`). The Docker-socket block +> sets `service="<project>"` + `component="<service>"` (no `job`). When +> migrating a service off syslog, update any dashboards or alert annotations +> that filter on `{job="…"}` to use `{service="…"}`. + +## Journal relabeling (native services) + +By default all journal entries share `job="systemd"`, making per-service +filtering impossible. A `loki.relabel` component overrides labels based on the +systemd unit. The journal source applies those rules via `relabel_rules` and +still forwards directly to `loki.write`. + +```alloy +loki.source.journal "systemd_logs" { + forward_to = [loki.write.default.receiver] + relabel_rules = loki.relabel.journal_puck.rules + labels = { + hostname = "{{ inventory_hostname }}", + environment = "{{ deployment_environment }}", + } +} + +loki.relabel "journal_puck" { + forward_to = [] + + rule { // Pallas runtime → service/project schema + source_labels = ["__journal_syslog_identifier"] + regex = "kottos" + target_label = "service" + replacement = "pallas" + } + + rule { // default fallback + source_labels = ["__journal__systemd_unit"] + regex = ".+" + target_label = "job" + replacement = "systemd" + } +} +``` + +Rules run top-to-bottom; the first match per `target_label` wins, so the +generic `systemd` fallback stays **last**. Escape dots in unit regexes +(`alloy\\.service`). The `__journal_*` fields are hidden metadata — used for +relabeling, not shipped to Loki.
+ +## Metrics + +On Docker hosts the per-host config also scrapes host and container metrics and +**remote-writes** them to Prometheus (Alloy is the push agent; Prometheus does +not scrape these hosts directly): + +- `prometheus.exporter.unix` — node metrics (Incus-safe collectors only). +- `prometheus.exporter.process` — `namedprocess_namegroup_*` per command. +- `prometheus.exporter.cadvisor` — `container_*` metrics via the Docker socket. + +These feed `prometheus.scrape` (`job_name` = the host, e.g. `puck`) → +`prometheus.relabel` (adds `instance=<hostname>`) → +`prometheus.remote_write` → `prospero.incus:9090`. + +> Application `/metrics` endpoints (e.g. django-prometheus, the +> nginx-prometheus-exporter sidecar) are **not** scraped by Alloy. Prometheus on +> Prospero scrapes those directly — see +> [`pplg/prometheus.yml.j2`](../ansible/pplg/prometheus.yml.j2). + +## Current inventory + +### Hosts using Docker socket discovery + +| Host | Block | Notes | +|------|-------|-------| +| `puck` | `discovery.docker` + `loki.source.docker "containers"` | Reference implementation. Covers all Compose projects (athena, mnemosyne, daedalus, kairos, …) as `service`/`component`. | + +### Hosts using per-service syslog listeners + +| Host | Services (job labels) | +|------|-----------------------| +| `puck` | angelia, kairos, spelunker, jupyterlab *(transitional — see below)* | +| `miranda` | argos, neo4j-cypher, grafana_mcp, gitea-mcp, searxng | +| `oberon` | rabbitmq, smtp4dev | +| `rosalind` | gitea, hass, lobechat, jellyfin, searxng (+ apache log files) | +| `titania` | casdoor, haproxy | +| `ariel`, `umbriel` | neo4j | + +### Transitional state on puck + +`athena`, `mnemosyne`, and `daedalus` have **migrated off** their syslog +listeners to the Docker-socket block; their old `*_syslog_port` host_vars are +retained as reserved-but-unused and can be removed once each rollout is +verified.
The remaining `puck` syslog listeners (angelia, kairos, spelunker, +jupyterlab) are candidates to migrate the same way. + +## Querying in Grafana + +```logql +# All Athena container logs (any component) +{service="athena"} + +# Just the Athena MCP container +{service="athena", component="mcp"} + +# Superuser-login forensic line behind the DjangoSuperuserLogin alert +{service="athena"} |= "event=superuser_login" + +# A syslog-driver service (legacy label scheme) +{job="kairos"} + +# Errors across everything on one host +{hostname="puck.incus"} |~ "(?i)error" +``` + +## Adding a new Dockerised service + +**Preferred (Docker socket — no Alloy change needed):** + +1. Ensure the service's Compose project uses the default `json-file` log driver + (do **not** set `logging: { driver: syslog }`) and the app logs to stdout. +2. Confirm the host's per-host Alloy config has the `discovery.docker` + + `loki.source.docker` blocks (currently `puck`). If not, add them once + (copy from [`puck/config.alloy.j2`](../ansible/alloy/puck/config.alloy.j2)). +3. Deploy the service. Verify in Grafana: `{service=""}` + returns entries, with `component=`. + +**Legacy (syslog driver — only if the host has no Docker-socket block):** + +1. Allocate a `514XX` syslog port in the host's `host_vars`. +2. Add a `loki.source.syslog` block to `ansible/alloy//config.alloy.j2`. +3. Add the `syslog` logging driver to the service's `docker-compose.yml.j2`. +4. **Deploy Alloy first**, then the service. +5. Verify: `{job="