diff --git a/docs/mnemosyne_integration.md b/docs/mnemosyne_integration.md new file mode 100644 index 0000000..da2909a --- /dev/null +++ b/docs/mnemosyne_integration.md @@ -0,0 +1,102 @@ +# Mnemosyne Integration — Pallas Reference + +This document summarises the Pallas-specific changes required for Mnemosyne knowledge integration. The full specification lives in [`daedalus/docs/mnemosyne_integration.md`](../../daedalus/docs/mnemosyne_integration.md). + +--- + +## Overview + +Pallas agents gain access to Mnemosyne's content-type-aware knowledge graph as a downstream MCP server. Agents can search documents, browse libraries, retrieve items, and traverse the concept graph — all via standard MCP tool calls. + +--- + +## Configuration Changes + +### fastagent.config.yaml + +Add the Mnemosyne MCP server: + +```yaml +mcp: + servers: + # ... existing servers (argos, neo4j_cypher, kernos, rommie, gitea, grafana) ... + mnemosyne: + transport: http + url: "http://puck.incus:22091/mcp" +``` + +### Agent Definitions + +#### Research Agent (port 23031) + +The `knowledge` agent in the research chain gains Mnemosyne access: + +```python +@fast.agent(name="search", servers=["argos"]) +@fast.agent(name="knowledge", servers=["neo4j_cypher", "mnemosyne"]) +@fast.chain(name="research", sequence=["search", "knowledge"], default=True) +``` + +The `knowledge` agent's system instruction should guide tool selection: + +> Use `mnemosyne.search_knowledge` for document content retrieval — it handles chunking, vector search, re-ranking, and content-type-aware context. Use `neo4j_cypher` for graph topology queries, relationship exploration, and data not managed by Mnemosyne. + +#### Infrastructure Agent (port 23032) + +No changes — Infrastructure does not use Mnemosyne. + +#### Orchestrator (port 23033) + +```python +@fast.agent(name="research_sub", servers=["argos", "neo4j_cypher", "mnemosyne"]) +@fast.agent(name="infra_sub", servers=["kernos", "gitea", "rommie"]) +@fast.orchestrator(name="orchestrator", agents=["research_sub", "infra_sub"], + plan_type="iterative", default=True) +``` + +### Registry Update + +Update agent descriptions to reflect Mnemosyne access: + +```json +{ + "server": { + "name": "ca.helu.ouranos/pallas-research", + "title": "Research Agent", + "description": "Web search via Argos, knowledge graph via Neo4j, and content library search via Mnemosyne", + "version": "1.1.0", + "remotes": [ + { "type": "streamable-http", "url": "http://puck.incus:23031/mcp" } + ] + } +} +``` + +--- + +## Available Mnemosyne MCP Tools + +These tools become available to agents with `mnemosyne` in their `servers` list: + +| Tool | Purpose | When to Use | +|------|---------|-------------| +| `search_knowledge` | Hybrid vector + full-text + graph search with re-ranking | Document content retrieval, question answering over stored knowledge | +| `search_by_category` | Search scoped to a library type (fiction, technical, etc.) | When the user specifies or implies a content domain | +| `list_libraries` | List all knowledge libraries | Discovering what knowledge domains exist | +| `list_collections` | List collections within a library | Browsing a specific knowledge domain | +| `get_item` | Retrieve item metadata + chunk previews + concept links | Deep dive on a specific document/item | +| `get_concepts` | Traverse concept graph | Exploring relationships between topics, people, places | + +--- + +## Downstream MCP Servers (Updated) + +| Server | Host | URL | Used by | +|--------|------|-----|---------| +| argos | miranda.incus | `http://miranda.incus:25534/mcp` | Research, Orchestrator | +| neo4j_cypher | circe.helu.ca | `http://circe.helu.ca:22034/mcp` | Research, Orchestrator | +| **mnemosyne** | **puck.incus** | **`http://puck.incus:22091/mcp`** | **Research, Orchestrator** | +| kernos | caliban.incus | `http://caliban.incus:22021/mcp` | Infrastructure, Orchestrator | +| gitea | miranda.incus | `http://miranda.incus:25535/mcp` | Infrastructure, Orchestrator | +| rommie | caliban.incus | `http://caliban.incus:22031/mcp` | Infrastructure, Orchestrator | +| grafana | miranda.incus | `http://miranda.incus:25533/mcp` | Infrastructure | diff --git a/docs/pallas.md b/docs/pallas.md new file mode 100644 index 0000000..9a45541 --- /dev/null +++ b/docs/pallas.md @@ -0,0 +1,495 @@ +# Pallas — Technical Reference + +Pallas is the generic runtime that turns [fast-agent](https://github.com/evalstate/fast-agent) agent definitions into StreamableHTTP MCP servers. It is **completely deployment-agnostic**: all environment-specific values (agent names, ports, hosts, model) live in the calling project's configuration files, not in Pallas itself. + +--- + +## Solution Architecture + +Pallas occupies the middle tier of a three-layer MCP architecture. It bridges a web-facing client (Daedalus) and a constellation of specialised downstream MCP servers. + +``` +┌──────────────────────────────────┐ +│ Daedalus │ Web UI / FastAPI / MCP client +│ Workspace management, chat, │ Discovers agents via registry +│ health monitoring, progress │ Calls agent tools via MCP +└──────────┬───────────────────────┘ + │ MCP over Streamable HTTP + ▼ +┌──────────────────────────────────┐ +│ Pallas (FastAgent MCP Bridge) │ Python runtime +│ │ +│ ┌─ Registry (port N) │ GET /.well-known/mcp/server.json +│ ├─ Agent: Research (port N+1) │ Chains, routers, sub-agents +│ ├─ Agent: Engineering (port N+2)│ Orchestrators, tool pipelines +│ └─ Agent: Orchestrator (N+3) │ Delegates across agents +│ │ +│ Each agent exposes: │ +│ • send_message tool │ +│ • get_health tool │ +│ • {agent}_history prompt │ +└──────────┬───────────────────────┘ + │ MCP over Streamable HTTP + ▼ +┌──────────────────────────────────┐ +│ Downstream MCP Servers │ +│ │ +│ Argos — web search │ +│ Neo4j — knowledge graph │ +│ Mnemosyne — content library │ +│ Kernos — shell execution │ +│ Gitea — repository mgmt │ +│ Grafana — monitoring │ +│ Rommie — system management│ +└──────────────────────────────────┘ +``` + +### Daedalus → Pallas + +| Interaction | Mechanism | +|---|---| +| Agent discovery | `GET {registry}/.well-known/mcp/server.json` — plain HTTP, returns all agents with MCP endpoint URLs | +| Agent communication | MCP `tools/call` on `send_message` — text + optional images | +| Health monitoring | MCP `tools/call` on `get_health` — programmatic, no LLM invocation | +| Progress feedback | MCP `notifications/progress` — streamed over SSE during long-running tool calls | +| Conversation history | MCP `prompts/get` on `{agent}_history` — retrieves stored message history | + +### Pallas → Downstream + +Pallas agents call downstream MCP servers via standard MCP tool calls. Each agent declares its servers in its fast-agent definition (`servers=["argos", "neo4j_cypher", ...]`). The server URLs and auth headers are configured in the consuming project's `fastagent.config.yaml`. + +### Mnemosyne's Role + +Mnemosyne provides a content-type-aware knowledge graph with hybrid search (vector + full-text + graph). Agents with `mnemosyne` in their `servers` list gain access to tools for searching documents, browsing libraries and collections, retrieving items, and traversing the concept graph. It complements Neo4j (graph topology and relationships) with content-focused retrieval and re-ranking. + +### Why MCP End-to-End + +Pallas is the protocol boundary — MCP above (from Daedalus) and MCP below (to downstream servers). This eliminates any MCP→REST→MCP translation layer. A single `fast.start_server(transport="http")` call exposes a complete agent as a StreamableHTTP MCP endpoint, giving Daedalus: + +- **Tool discovery** via `session.list_tools()` +- **Native streaming** via MCP Streamable HTTP / SSE +- **Health checks** as ordinary tool calls — no separate API surface +- **Progress notifications** built into the protocol + +--- + +## Pallas Internal Architecture + +Pallas is four modules, composed at startup: + +``` +server.py main() + │ + ├─ _load_deployment_config() parse agents.yaml + ├─ _build_agents_table() {name: (module, port)} + ├─ _build_agent_deps() dependency graph + │ + ├─ _start_all() or _run_single() + │ │ + │ ├─ _preflight() + │ │ ├─ _register_unknown_models() model registration + │ │ └─ validate_llm_providers() LLM API key + model checks + │ │ + │ ├─ start subagents (depends_on) + │ ├─ wait for subagent readiness + │ ├─ start top-level agents + │ │ │ + │ │ └─ _start_agent(name) + │ │ ├─ import agent module + │ │ ├─ MultimodalAgentMCPServer(...) + │ │ ├─ _resolve_downstream_servers() + │ │ ├─ _preflight_mcp_servers() warn on missing auth + │ │ ├─ register_health_tool() + │ │ └─ server.run_async() + │ │ + │ └─ run_registry() Starlette app on registry port + │ + └─ asyncio.run(...) +``` + +| Module | Purpose | +|---|---| +| `pallas.server` | CLI entry point, configuration loading, agent lifecycle orchestration, model registration | +| `pallas.registry` | Starlette app serving `GET /.well-known/mcp/server.json` — builds the agent catalogue from `agents.yaml` + `fastagent.config.yaml` | +| `pallas.multimodal_server` | `MultimodalAgentMCPServer` — `AgentMCPServer` subclass adding image attachment support and conversation history prompts | +| `pallas.health` | Two-layer health: startup LLM preflight validation + runtime `get_health` MCP tool with downstream server probing | + +--- + +## Installation + +```bash +pip install git+ssh://git@git.helu.ca:22022/r/pallas.git +``` + +Or as a project dependency: + +```toml +dependencies = [ + "pallas-mcp @ git+ssh://git@git.helu.ca:22022/r/pallas.git", +] +``` + +Requires Python ≥ 3.13. Key dependencies: `fast-agent-mcp`, `httpx`, `pyyaml`, `starlette`, `uvicorn`. + +--- + +## Project Layout + +Pallas reads configuration from the **working directory** at runtime. A consuming project looks like: + +``` +my-project/ +├── agents/ +│ ├── __init__.py +│ └── jarvis.py # FastAgent definitions +├── agents.yaml # Deployment topology +├── fastagent.config.yaml # FastAgent + model config +├── fastagent.secrets.yaml # API keys (gitignored) +└── .env # Secret values (gitignored) +``` + +Pallas itself contains no agent definitions, model names, ports, or hostnames. Everything is injected by the consuming project. + +--- + +## Configuration Reference + +### `agents.yaml` + +Single source of truth for deployment topology. + +```yaml +name: my-project # log prefixes and registry names +version: "1.0.0" # published in registry entries +host: my-host.example.com # hostname for registry URLs +namespace: com.example.project # reverse-domain prefix for registry names +registry_port: 8200 # port for the registry server + +agents: + jarvis: + module: agents.jarvis # importable Python module path + port: 8201 # StreamableHTTP port for this agent + title: Jarvis # human-readable name (registry) + description: "My assistant" # one-line description (registry) + depends_on: [research] # optional: start these agents first + + research: + module: agents.research + port: 8250 + title: Research Agent + description: "Web search and knowledge graph" +``` + +| Field | Required | Description | +|---|---|---| +| `name` | yes | Project name — used in log prefixes (`[my-project]`) and CLI help | +| `version` | no | Semver string published in registry entries. Default: `"1.0.0"` | +| `host` | no | Hostname used in registry `remotes[].url`. Default: `"localhost"` | +| `namespace` | no | Reverse-domain prefix for registry `server.name` (e.g. `com.example/jarvis`) | +| `registry_port` | no | Port for the registry server. Default: `24200` | +| `agents..module` | yes | Importable Python module path containing a `fast` instance | +| `agents..port` | yes | Port for this agent's StreamableHTTP MCP server | +| `agents..title` | no | Display name in registry. Default: `name.title()` | +| `agents..description` | no | Description in registry | +| `agents..depends_on` | no | List of agent names that must start and become ready before this agent | + +### `fastagent.config.yaml` Extensions + +Pallas reads two keys beyond the standard fast-agent config: + +```yaml +default_model: openai.my-model-name + +model_capabilities: + vision: false + context_window: 200000 + max_output_tokens: 32000 +``` + +| Key | Description | +|---|---| +| `default_model` | `provider.model-name` format. The provider prefix (`anthropic` or `openai`) determines which LLM provider is active for health checks. | +| `model_capabilities.vision` | `true` registers the model with multimodal tokenization; `false` registers as text-only. Default: `false` | +| `model_capabilities.context_window` | Context window size in tokens. Default: `131072` | +| `model_capabilities.max_output_tokens` | Max output token limit. Default: `16384` | + +Capabilities are declared explicitly rather than inferred from model name — naming conventions vary across model families, making regex heuristics brittle. These values are both used to register unknown models with fast-agent's `ModelDatabase` and published in the registry response. + +### `fastagent.secrets.yaml` + +```yaml +anthropic: + api_key: "${ANTHROPIC_API_KEY}" +openai: + api_key: "${OPENAI_API_KEY}" + base_url: "${OPENAI_BASE_URL}" +``` + +`${ENV_VAR}` placeholders are expanded at runtime from environment variables. + +### `.env` + +Pallas loads `.env` from the working directory into `os.environ` without overwriting existing variables. This supports both local development and systemd deployments: + +```dotenv +ANTHROPIC_API_KEY=sk-ant-... +OPENAI_API_KEY=sk-... +OPENAI_BASE_URL=http://my-llm-server:8080/v1 +``` + +`OPENAI_BASE_URL` defaults to `https://api.openai.com/v1` if unset. For local llama-cpp, vLLM, or other OpenAI-compatible servers, set it to their endpoint. + +### Environment Variables + +| Variable | Default | Purpose | +|---|---|---| +| `PALLAS_AGENTS_CONFIG` | `agents.yaml` | Override path to deployment config | + +--- + +## Running Pallas + +### CLI + +```bash +pallas # start all agents + registry +pallas --agent jarvis # start a single agent (no registry) +python -m pallas.server # equivalent to `pallas` +``` + +### Startup Sequence + +**All agents mode** (`pallas`): + +1. Load `agents.yaml`, build agents table and dependency graph +2. **Preflight** — register unknown models with `ModelDatabase`, validate LLM provider API keys and model availability +3. Start the registry server on `registry_port` +4. Start **subagents** (agents listed in other agents' `depends_on`) +5. Wait for each subagent to become ready (HTTP probe on `/mcp`, 60s timeout) +6. Start **top-level agents** (everything not a subagent) +7. All servers run concurrently via `asyncio.gather` + +**Single agent mode** (`pallas --agent `): + +1. Load `agents.yaml` +2. Preflight +3. Start the named agent (no registry, no dependency resolution) + +### Per-Agent Startup + +For each agent: + +1. Import the agent module (`agents.`) and obtain its `fast` instance +2. Enter `fast.run()` context — initialises the fast-agent runtime +3. Create a `MultimodalAgentMCPServer` wrapping the primary agent instance +4. Resolve downstream MCP server configs from the fast-agent configuration +5. Warn if any downstream auth headers reference unset environment variables +6. Register the `get_health` MCP tool with downstream server info +7. Bind to `0.0.0.0:` and serve StreamableHTTP + +--- + +## Daedalus Integration + +This section describes the contract from Pallas's perspective. The full client-side specification is in `docs/pallas_integration.md`. + +### Registration Flow + +1. Daedalus stores a registry URL (e.g. `http://puck.incus:23030`) +2. Fetches `GET {url}/.well-known/mcp/server.json` +3. Discovers all agents with their MCP endpoint URLs, titles, and descriptions +4. Creates connections to each agent + +### Health Polling + +Daedalus calls `get_health` on each connected agent at a configurable interval (default 60s). The response maps to UI indicators: + +| `status` | Daedalus behaviour | +|---|---| +| `ok` | Green badge, normal operation | +| `degraded` | Yellow badge + warning banner showing `message`. Chat allowed. | +| `error` | Red badge. Chat disabled. | + +### Progress Notifications + +Long-running agent tool calls (agentic loops, sub-agent delegation) emit MCP `notifications/progress` on the SSE stream. Daedalus must include a `progressToken` in the `_meta` of `tools/call` requests to opt in: + +```python +result = await session.call_tool( + "jarvis", + arguments={"message": user_input}, + request_params={"_meta": {"progressToken": str(uuid4())}}, +) +``` + +Progress notification fields: + +| Field | Description | +|---|---| +| `progressToken` | Matches the token sent in the request | +| `progress` | Monotonically increasing step counter | +| `total` | `null` = indeterminate (loop in progress), `1.0` = sub-task finished | +| `message` | Status text: `{server}/{tool}: started\|completed\|failed` or `{agent} step N (llm\|tool)` | + +Without a `progressToken`, Pallas skips all progress notifications and the client receives nothing until the final result. + +### Chat Blocking + +If the target agent's cached health is `error`, Daedalus returns HTTP 503 and disables the message input. `degraded` shows a warning but allows chat. + +--- + +## Registry Server + +### Endpoint + +``` +GET {host}:{registry_port}/.well-known/mcp/server.json +``` + +Plain HTTP — not MCP. No authentication. Returns `application/json`. + +### Response Structure + +Built dynamically from `agents.yaml` + `fastagent.config.yaml`: + +```json +{ + "servers": [ + { + "server": { + "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json", + "name": "com.example.project/jarvis", + "title": "Jarvis", + "description": "My assistant agent", + "version": "1.0.0", + "remotes": [ + { "type": "streamable-http", "url": "http://my-host.example.com:8201/mcp" } + ], + "capabilities": { + "model": "my-model-name", + "vision": false, + "context_window": 200000, + "max_output_tokens": 32000 + } + }, + "_meta": { + "io.modelcontextprotocol.registry/official": { + "status": "active", + "updatedAt": "2026-01-01T00:00:00Z", + "isLatest": true + } + } + } + ] +} +``` + +### Registry Name Construction + +`{namespace}/{slug}` — where `slug` is the agent key with underscores replaced by hyphens. Example: namespace `com.example.project` + agent key `tech_research` → `com.example.project/tech-research`. + +### Capabilities + +If `model_capabilities` is defined in `fastagent.config.yaml`, each registry entry includes a `capabilities` object with model name, vision support, context window, and max output tokens. This allows clients to make informed decisions about what an agent can handle. + +--- + +## Multimodal Support + +`MultimodalAgentMCPServer` extends fast-agent's `AgentMCPServer` with image attachment support. + +### `send_message` Tool + +Each agent's MCP tool accepts: + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `message` | `str` | yes | Text message to the agent | +| `images` | `list[dict]` | no | Base64-encoded images: `[{"data": "...", "mime_type": "image/png"}]` | + +When `images` is provided, the message is sent as a `PromptMessageExtended` containing both `TextContent` and `ImageContent` parts — the agent's underlying model must support vision. + +### Conversation History Prompt + +For agents with `instance_scope != "request"`, a `{agent}_history` prompt is registered that returns the full conversation history as FastMCP `Message` objects. This allows clients to retrieve the stored context. + +### Bearer Token Propagation + +The server captures the authenticated bearer token from the incoming MCP request and propagates it via `request_bearer_token` context variable to downstream calls. + +--- + +## Health System + +Two-layer health checking: **startup preflight** validates LLM providers before agents launch, and a **runtime `get_health` tool** reports ongoing status. + +### Startup Preflight + +Runs once before any agents start. Validates all LLM providers that have API keys configured. + +| Provider | Active (default_model matches) | Key set, not active | +|---|---|---| +| **Anthropic** | `GET /v1/models/{model}` — confirms model exists and key is valid | `GET /v1/models/claude-sonnet-4-5` — verifies API access | +| **OpenAI** | `GET {base_url}/models` — lists models, confirms configured model is present | `GET {base_url}/models` — lists available models | + +- **Warn-only** — never blocks startup. Agents start regardless. +- **5-second timeout** per provider API call. +- Loads `.env` before checking. + +### Runtime `get_health` Tool + +Registered on each agent's MCP server. Checks: + +1. **Downstream MCP servers** — sends an MCP `initialize` handshake to each server URL. Uses `initialize` because it is the only MCP method that works without a pre-established session. After success, sends `DELETE` with the returned `Mcp-Session-Id` to tear down the session cleanly. 3-second timeout. + +2. **Active LLM provider** — includes the preflight result for the provider that `default_model` points to. Only the active provider affects health status. + +### Response Format + +```json +{ "status": "ok", "timestamp": "2026-01-01T00:00:00Z" } +``` + +```json +{ + "status": "degraded", + "timestamp": "2026-01-01T00:00:00Z", + "message": "Unreachable: neo4j_cypher; LLM: openai: model 'bad-model' not found" +} +``` + +| Status | Meaning | +|---|---| +| `ok` | All downstream servers reachable and active LLM provider healthy | +| `degraded` | One or more downstream servers unreachable, or active LLM provider failed | + +--- + +## Model Registration + +Pallas registers models not in fast-agent's built-in `ModelDatabase` at startup, using the explicit capability declarations from `fastagent.config.yaml`. + +The process: + +1. Read `default_model` and `model_capabilities` from config +2. Extract the model name (portion after the provider prefix dot) +3. Check if `ModelDatabase` already knows this model — if so, skip +4. Register with `ModelDatabase.register_runtime_model_params()`: + - `vision: true` → multimodal tokenization (`QWEN_MULTIMODAL`) + - `vision: false` → text-only tokenization (`TEXT_ONLY`) + - `context_window` and `max_output_tokens` from config (with sensible defaults) + +This avoids the brittle pattern of inferring capabilities from model name substrings, which breaks for custom or fine-tuned models with non-standard names. + +--- + +## Module Reference + +| Module | File | Purpose | +|---|---|---| +| `pallas.server` | `server.py` | CLI entry point (`pallas` command), configuration loading, agent lifecycle orchestration, dependency ordering, model registration | +| `pallas.registry` | `registry.py` | Starlette app serving `GET /.well-known/mcp/server.json` — agent catalogue built from config | +| `pallas.multimodal_server` | `multimodal_server.py` | `MultimodalAgentMCPServer` — extends `AgentMCPServer` with image support, conversation history prompts, bearer token propagation | +| `pallas.health` | `health.py` | LLM provider preflight validation, downstream MCP server probing, `get_health` tool registration | diff --git a/docs/pallas_integration.md b/docs/pallas_integration.md new file mode 100644 index 0000000..b7cdad6 --- /dev/null +++ b/docs/pallas_integration.md @@ -0,0 +1,375 @@ +# Pallas MCP Interface Specification + +This document defines the contract between **Daedalus** (MCP client / web UI) and **Pallas** (FastAgent MCP servers). It specifies the interfaces Pallas must expose: a **registry endpoint** for agent discovery, a **`get_health` MCP tool** on each agent for health monitoring, and **progress notifications** for real-time feedback during agent execution. + +--- + +## Architecture Overview + +``` + Pallas Instance (puck.incus) + ┌────────────────────────────────────────┐ + │ │ + │ Registry (port 23030) │ + Daedalus ──GET──▶│ /.well-known/mcp/server.json │ + │ │ + │ Agent: Research (port 23031) │ + Daedalus ──MCP──▶│ MultimodalAgentMCPServer │──MCP──▶ Argos, Neo4j + │ └─ get_health tool │ + │ │ + │ Agent: Engineering (port 23032) │ + Daedalus ──MCP──▶│ MultimodalAgentMCPServer │──MCP──▶ Kernos, Gitea + │ └─ get_health tool │ + │ │ + │ Agent: Orchestrator (port 23033) │ + Daedalus ──MCP──▶│ MultimodalAgentMCPServer │──MCP──▶ Research, Infra + │ └─ get_health tool │ + └────────────────────────────────────────┘ +``` + +A single Pallas instance hosts multiple FastAgent agents, each on its own port. The registry runs on a dedicated port (e.g. 23030) and provides a catalogue of all agents. Each agent exposes a `get_health` MCP tool that FastAgent intercepts programmatically — no LLM invocation. + +Daedalus registers the registry URL once in global settings. Everything else is automatic. + +--- + +## 1. Registry Endpoint + +### `GET {registry_url}/.well-known/mcp/server.json` + +The registry is a plain HTTP endpoint (not MCP) served on a dedicated port. It returns a dynamic list of all agents currently provided by the Pallas instance, following the [MCP Server Schema](https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json). + +#### Request + +``` +GET http://puck.incus:23030/.well-known/mcp/server.json +Accept: application/json +``` + +No authentication. No query parameters. + +#### Response + +```json +{ + "servers": [ + { + "server": { + "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json", + "name": "ca.helu.ouranos/pallas-research", + "title": "Research Agent", + "description": "Web search via Argos and knowledge graph via Neo4j", + "version": "1.0.0", + "icons": [ + { "src": "https://daedalus.ouranos.helu.ca/icons/research.svg", "sizes": "any" } + ], + "remotes": [ + { "type": "streamable-http", "url": "http://puck.incus:23031/mcp" } + ], + "capabilities": { + "model": "qwen3-8b-q5", + "vision": false, + "context_window": 200000, + "max_output_tokens": 32000 + } + }, + "_meta": { + "io.modelcontextprotocol.registry/official": { + "status": "active", + "updatedAt": "2026-03-12T10:00:00Z", + "isLatest": true + } + } + }, + { + "server": { + "name": "ca.helu.ouranos/pallas-infra", + "title": "Engineering Agent", + "description": "Shell access via Kernos and repository management via Gitea", + "version": "1.0.0", + "remotes": [ + { "type": "streamable-http", "url": "http://puck.incus:23032/mcp" } + ], + "capabilities": { + "model": "qwen3-8b-q5", + "vision": false, + "context_window": 200000, + "max_output_tokens": 32000 + } + }, + "_meta": { + "io.modelcontextprotocol.registry/official": { + "status": "active", + "updatedAt": "2026-03-12T10:00:00Z", + "isLatest": true + } + } + } + ] +} +``` + +#### Schema + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `servers` | array | yes | List of server entries | +| `servers[].server.name` | string | yes | Reverse-domain identifier (e.g. `ca.helu.ouranos/pallas-research`). Daedalus derives `server_id` from the segment after the last `/`. | +| `servers[].server.title` | string | no | Human-readable display name. Falls back to `name` if absent. | +| `servers[].server.description` | string | no | One-line description shown in Daedalus UI. | +| `servers[].server.version` | string | no | Semver version string. | +| `servers[].server.icons` | array | no | Array of `{ src, sizes }`. Daedalus uses the first entry. | +| `servers[].server.remotes` | array | yes | Connection endpoints. Daedalus looks for `type: "streamable-http"` and uses its `url`. | +| `servers[].server.capabilities` | object | no | Model capabilities. Contains `model` (string), `vision` (bool), `context_window` (int), `max_output_tokens` (int). Published when `model_capabilities` is configured in `fastagent.config.yaml`. | +| `servers[]._meta` | object | no | Registry metadata. Informational only — Daedalus does not act on it. | + +#### Behaviour + +- The response **must** reflect the current set of registered agents. If an agent is added or removed from Pallas, subsequent requests must reflect the change. +- Content-Type **must** be `application/json`. +- Every entry in `remotes` with `type: "streamable-http"` is treated as an MCP endpoint Daedalus can connect to. +- The `icons[].src` URL may be absolute or relative. Daedalus stores it as-is. + +--- + +## 2. Health Tool + +### MCP tool: `get_health` + +Each agent's MCP server **must** expose a tool named `get_health`. FastAgent intercepts this tool programmatically — it does not route through the LLM. This keeps health checks fast (~ms) and free of inference cost. + +#### Tool Definition + +The tool should appear in `session.list_tools()` with: + +```json +{ + "name": "get_health", + "description": "Returns the health status of this agent and its downstream dependencies.", + "inputSchema": { + "type": "object", + "properties": {}, + "additionalProperties": false + } +} +``` + +No input arguments. + +#### Invocation + +Daedalus calls this via the standard MCP SDK: + +```python +result = await session.call_tool("get_health") +``` + +#### Response + +The tool returns a single `text` content block containing a JSON object: + +```json +{ + "status": "ok", + "timestamp": "2026-03-12T15:42:00Z" +} +``` + +##### Status Values + +| Status | Meaning | Daedalus Behaviour | +|--------|---------|-------------------| +| `ok` | Agent healthy, all downstream MCP servers reachable | Green badge. Normal operation. | +| `degraded` | Agent responds but with issues (slow responses, partial downstream outage) | Yellow badge + warning banner. Chat allowed. | +| `error` | Agent cannot process requests | Red badge. Chat disabled — user cannot send messages. | + +##### Fields + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `status` | `"ok" \| "degraded" \| "error"` | yes | Current health state | +| `timestamp` | string (ISO 8601) | no | When the health check was performed | +| `message` | string | no | Human-readable explanation. Required when `status` is `degraded` or `error`. Shown in Daedalus UI tooltips and warning banners. | + +##### Examples + +**Healthy:** +```json +{ + "status": "ok", + "timestamp": "2026-03-12T15:42:00Z" +} +``` + +**Degraded:** +```json +{ + "status": "degraded", + "timestamp": "2026-03-12T15:42:00Z", + "message": "Avg response 12s — Neo4j connection slow" +} +``` + +**Error:** +```json +{ + "status": "error", + "timestamp": "2026-03-12T15:42:00Z", + "message": "Argos MCP server unreachable" +} +``` + +#### Implementation Guidance + +The `get_health` tool checks connectivity to all downstream MCP servers the agent depends on using the MCP `initialize` handshake — the only MCP method that works without a pre-established session. This avoids burning LLM tokens on health checks. + +For each downstream MCP server: + +1. `POST` an MCP `initialize` request to the server URL (with auth headers and `Accept: application/json, text/event-stream`) +2. On success, tear down the session by sending `DELETE` with the returned `Mcp-Session-Id` header to avoid leaking server-side state +3. On failure (HTTP error, timeout, connection refused), record the server as unreachable + +Result mapping: + +- All downstream servers reachable and active LLM provider healthy → `ok` +- Some downstream servers unreachable, or active LLM provider failed preflight → `degraded` with explanation +- Agent failed to start or cannot process requests → `error` with explanation + +The tool **must not** invoke the LLM. It should complete in under 1 second (3-second timeout per downstream probe). + +--- + +## 3. Daedalus Consumption + +### Registration Flow + +1. User enters registry URL in Daedalus global settings (e.g. `http://puck.incus:23030`) +2. Daedalus `GET`s `{url}/.well-known/mcp/server.json` +3. Daedalus stores the `PallasInstance` with its registry URL +4. Discovered agents are shown with metadata (title, description, icon) + +### Workspace Attachment + +1. User selects a registered Pallas instance in workspace settings +2. Daedalus re-fetches the registry and creates `AgentConnection` rows for every agent in the instance +3. All agents from the instance become available in the workspace +4. Detaching removes all agent connections for that instance from the workspace + +### Health Polling + +- Daedalus polls `get_health` on connected agents at a configurable interval (`DAEDALUS_MCP_HEALTH_INTERVAL`, default 60 seconds) +- Health is cached in memory and exposed via the agent status API +- Prometheus gauge `daedalus_agent_health{instance, agent}` tracks health (1.0=ok, 0.5=degraded, 0.0=error) +- If health check fails entirely (connection error, timeout), status is treated as `error` + +### Chat Blocking + +- If the target agent's cached health is `error`, the chat endpoint returns HTTP 503 and the UI disables the message input +- If `degraded`, a warning bar appears but chat is allowed +- Users **can** create a workspace and attach an instance with unhealthy agents — health only blocks sending messages + +--- + +## 4. Agent Progress Notifications + +Agent tool calls can take tens of seconds to minutes when the agent enters an agentic loop — calling sub-agents, searching the web, querying knowledge graphs, etc. During this time, the MCP tool call has not yet returned. Without progress feedback, the user sees a dead spinner. + +MCP provides a built-in mechanism for this: `notifications/progress`. Pallas already emits these notifications during agent execution. Daedalus must opt in by sending a `progressToken` and rendering the notifications it receives. + +### How It Works + +``` +Daedalus Pallas (harper, port 24101) + │ │ + │── tools/call ─────────────────────────▶│ { message: "...", _meta: { progressToken: "abc123" } } + │ │ + │ │── LLM generates text + tool calls ──▶ + │ │ + │◀── notifications/progress ─────────────│ { progressToken: "abc123", progress: 0, message: "research/research__research: started" } + │ │ + │◀── notifications/progress ─────────────│ { progressToken: "abc123", progress: 1, message: "harper step 1 (tool)" } + │ │ + │◀── notifications/progress ─────────────│ { progressToken: "abc123", progress: 2, message: "harper step 2 (llm)" } + │ │ + │◀── notifications/progress ─────────────│ { progressToken: "abc123", progress: 1, total: 1, message: "research/research__research: completed" } + │ │ + │◀── notifications/progress ─────────────│ { progressToken: "abc123", progress: 1, total: 1, message: "tech_research/tech_research__tech_research: completed" } + │ │ + │◀── tools/call result ─────────────────│ { content: [{ type: "text", text: "..." }] } + │ │ +``` + +All messages flow over the existing SSE connection established by MCP Streamable HTTP. No additional transport is needed. + +### Daedalus Requirements + +#### Sending the Progress Token + +When calling any agent tool (except `get_health`), Daedalus **must** include a `progressToken` in the request's `_meta`: + +```python +result = await session.call_tool( + "harper", + arguments={"message": user_input}, + request_params={"_meta": {"progressToken": str(uuid4())}}, +) +``` + +Without the `progressToken`, Pallas skips all progress notifications and Daedalus receives nothing until the final result. + +#### Handling Progress Notifications + +Daedalus receives `notifications/progress` messages on the SSE stream during the tool call. Each notification contains: + +| Field | Type | Description | +|-------|------|-------------| +| `progressToken` | string/int | Matches the token sent in the request | +| `progress` | float | Monotonically increasing step counter | +| `total` | float \| null | `null` = indeterminate (loop in progress), `1.0` = task finished | +| `message` | string \| null | Human-readable status text | + +#### Message Format + +Progress messages follow predictable patterns: + +| Pattern | Meaning | Example | +|---------|---------|---------| +| `{server}/{tool}: started` | Tool invocation began | `research/research__research: started` | +| `{server}/{tool}: completed` | Tool invocation finished | `tech_research/tech_research__tech_research: completed` | +| `{server}/{tool}: failed` | Tool invocation failed | `argos/search_web: failed` | +| `{agent} step N (llm)` | Agent loop: LLM turn | `harper step 2 (llm)` | +| `{agent} step N (tool)` | Agent loop: tool execution | `harper step 3 (tool)` | + +#### Rendering Guidance + +- Display the `message` as a status line beneath the "thinking" indicator +- Replace the previous status on each new notification (not appended) +- When `total` is `null`, show an indeterminate progress indicator (spinner) +- When `total` equals `progress` (typically `1.0/1.0`), the specific tool/sub-task has completed — but the overall tool call may still be in progress +- Clear the progress indicator when the final `tools/call` result arrives + +### Pallas Guarantees + +- Progress notifications are emitted automatically by FastAgent's `MCPToolProgressManager` — no additional server-side configuration is needed +- Notifications are only sent when the client provides a `progressToken` +- At minimum, `on_tool_start` (progress 0) and `on_tool_complete` (progress 1/1) are emitted for every downstream tool invocation +- Loop step notifications are emitted when `emit_loop_progress=True` (the default for all Pallas agents) +- Progress notifications are best-effort — if one fails to send, the agent loop continues unaffected + +### Limitations + +- **LLM intermediate text is not streamed as progress.** When the agent says "Let me look into that..." before calling tools, this text is generated server-side during the LLM streaming step but is not forwarded as a progress notification. The text is included in the final tool result. A future enhancement may stream LLM text deltas as progress messages with a distinguishable prefix. +- **Parallel tool calls** emit interleaved progress messages. Each message includes a tool-specific prefix (`{server}/{tool}`), so Daedalus can track them independently if desired, or simply display the most recent message. + +--- + +## 5. Why MCP (Not REST) + +Pallas wraps each FastAgent instance in a `MultimodalAgentMCPServer` and serves it over StreamableHTTP. The MCP transport gives Daedalus: + +- **Tool discovery** — `session.list_tools()` returns the full capability manifest +- **Streaming** — MCP Streamable HTTP handles streaming natively +- **Health checks** — `get_health` is just another tool call, no separate API surface +- **Protocol alignment** — MCP is the abstraction boundary both above and below Pallas. No MCP→REST→MCP translation layer. + +The alternative (REST between Daedalus and Pallas) would require building a custom API layer in Pallas that reimplements what the MCP server already provides, with no simplification on the Daedalus side. diff --git a/docs/red_panda_standards.md b/docs/red_panda_standards.md new file mode 100644 index 0000000..405b695 --- /dev/null +++ b/docs/red_panda_standards.md @@ -0,0 +1,213 @@ +# Red Panda Approval™ Standards + +Quality and observability standards for the Ouranos Lab. All infrastructure code, application code, and LLM-generated code deployed into this environment must meet these standards. + +--- + +## 🐾 Red Panda Approval™ + +All implementations must meet the 5 Sacred Criteria: + +1. **Fresh Environment Test** — Clean runs on new systems without drift. No leftover state, no manual steps. +2. **Elegant Simplicity** — Modular, reusable, no copy-paste sprawl. One playbook per concern. +3. **Observable & Auditable** — Clear task names, proper logging, check mode compatible. You can see what happened. +4. **Idempotent Patterns** — Run multiple times with consistent results. No side effects on re-runs. +5. **Actually Provisions & Configures** — Resources work, dependencies resolve, services integrate. It does the thing. + +--- + +## Vault Security + +All sensitive information is encrypted using Ansible Vault with AES256 encryption. + +**Encrypted secrets:** +- Database passwords (PostgreSQL, Neo4j) +- API keys (OpenAI, Anthropic, Mistral, Groq) +- Application secrets (Grafana, SearXNG, Arke) +- Monitoring alerts (Pushover integration) + +**Security rules:** +- AES256 encryption with `ansible-vault` +- Password file for automation — never pass `--vault-password-file` inline in scripts +- Vault variables use the `vault_` prefix; map to friendly names in `group_vars/all/vars.yml` +- No secrets in plain text files, ever + +--- + +## Log Level Standards + +All services in the Ouranos Lab MUST follow these log level conventions. These rules apply to application code, infrastructure services, and any LLM-generated code deployed into this environment. Log output flows through Alloy → Loki → Grafana, so disciplined leveling is not cosmetic — it directly determines alert quality, dashboard usefulness, and on-call signal-to-noise ratio. + +### Level Definitions + +| Level | When to Use | What MUST Be Included | Loki / Grafana Role | +|-------|-------------|----------------------|---------------------| +| **ERROR** | Something is broken and requires human intervention. The service cannot fulfil the current request or operation. | Exception class, message, stack trace, and relevant context (request ID, user, resource identifier). Never a bare `"something failed"`. | AlertManager rules fire on `level=~"error\|fatal\|critical"`. These trigger Pushover notifications. | +| **WARNING** | Degraded but self-recovering: retries succeeding, fallback paths taken, thresholds approaching, deprecated features invoked. | What degraded, what recovery action was taken, current metric value vs. threshold. | Grafana dashboard panels. Rate-based alerting (e.g., >N warnings/min). | +| **INFO** | Significant lifecycle and business events: service start/stop, configuration loaded, deployment markers, user authentication, job completion, schema migrations. | The event and its outcome. This level tells the *story* of what the system did. | Default production visibility. The go-to level for post-incident timelines. | +| **DEBUG** | Diagnostic detail for active troubleshooting: request/response payloads, SQL queries, internal state, variable values. | **Actionable context is mandatory.** A DEBUG line with no detail is worse than no line at all. Include variable values, object states, or decision paths. | Never enabled in production by default. Used on-demand via per-service level override. | + +### Anti-Patterns + +These are explicit violations of Ouranos logging standards: + +| ❌ Anti-Pattern | Why It's Wrong | ✅ Correct Approach | +|----------------|---------------|-------------------| +| Health checks logged at INFO (`GET /health → 200 OK`) | Routine HAProxy/Prometheus probes flood syslog with thousands of identical lines per hour, burying real events. | Suppress health endpoints from access logs entirely, or demote to DEBUG. | +| DEBUG with no context (`logger.debug("error occurred")`) | Provides zero diagnostic value. If DEBUG is noisy *and* useless, nobody will ever enable it. | `logger.debug("PaymentService.process failed: order_id=%s, provider=%s, response=%r", oid, provider, resp)` | +| ERROR without exception details (`logger.error("task failed")`) | Cannot be triaged without reproduction steps. Wastes on-call time. | `logger.error("Celery task invoice_gen failed: order_id=%s", oid, exc_info=True)` | +| Logging sensitive data at any level | Passwords, tokens, API keys, and PII in Loki are a security incident. | Mask or redact: `api_key=sk-...a3f2`, `password=*****`. | +| Inconsistent level casing | Breaks LogQL filters and Grafana label selectors. | **Python / Django**: UPPERCASE (`INFO`, `WARNING`, `ERROR`, `DEBUG`). **Go / infrastructure** (HAProxy, Alloy, Gitea): lowercase (`info`, `warn`, `error`, `debug`). | +| Logging expected conditions as ERROR | A user entering a wrong password is not an error — it is normal business logic. | Use WARNING or INFO for expected-but-notable conditions. Reserve ERROR for things that are actually broken. | + +### Health Check Rule + +> All services exposed through HAProxy MUST suppress or demote health check endpoints (`/health`, `/healthz`, `/api/health`, `/metrics`, `/ping`) to DEBUG or below. Health check success is the *absence* of errors, not the presence of 200s. If your syslog shows a successful health probe, your log level is wrong. + +**Implementation guidance:** +- **Django / Gunicorn**: Filter health paths in the access log handler or use middleware that skips logging for probe user-agents. +- **Docker services**: Configure the application's internal logging to exclude health routes — the syslog driver forwards everything it receives. +- **HAProxy**: HAProxy's own health check logs (`option httpchk`) should remain at the HAProxy level for connection debugging, but backend application responses to those probes must not surface at INFO. + +### Background Worker & Queue Monitoring + +> **The most dangerous failure is the one that produces no logs.** + +When a background worker (Celery task consumer, RabbitMQ subscriber, Gitea Runner, cron job) fails to start or crashes on startup, it generates no ongoing log output. Error-rate dashboards stay green because there is no process running to produce errors. Meanwhile, queues grow unbounded and work silently stops being processed. + +**Required practices:** + +1. **Heartbeat logging** — Every long-running background worker MUST emit a periodic INFO-level heartbeat (e.g., `"worker alive, processed N jobs in last 5m, queue depth: M"`). The *absence* of this heartbeat is the alertable condition. + +2. **Startup and shutdown at INFO** — Worker start, ready, graceful shutdown, and crash-exit are significant lifecycle events. These MUST log at INFO. + +3. **Queue depth as a metric** — RabbitMQ queue depths and any application-level task queues MUST be exposed as Prometheus metrics. A growing queue with zero consumer activity is an **ERROR**-level alert, not a warning. + +4. **Grafana "last seen" alerts** — For every background worker, configure a Grafana alert using `absent_over_time()` or equivalent staleness detection: *"Worker X has not logged a heartbeat in >10 minutes"* → ERROR severity → Pushover notification. + +5. **Crash-on-start is ERROR** — If a worker exits within seconds of starting (missing config, failed DB connection, import error), the exit MUST be captured at ERROR level by the service manager (`systemd OnFailure=`, Docker restart policy logs). Do not rely on the crashing application to log its own death — it may never get the chance. + +### Production Defaults + +| Service Category | Default Level | Rationale | +|-----------------|---------------|-----------| +| Django apps (Angelia, Athena, Kairos, Icarlos, Spelunker, Peitho, MCP Switchboard) | `WARNING` | Business logic — only degraded or broken conditions surface. Lifecycle events (start/stop/deploy) still log at INFO via Gunicorn and systemd. | +| Gunicorn access logs | Suppress 2xx/3xx health probes | Routine request logging deferred to HAProxy access logs in Loki. | +| Infrastructure agents (Alloy, Prometheus, Node Exporter) | `warn` | Stable — do not change without cause. | +| HAProxy (Titania) | `warning` | Connection-level logging handled by HAProxy's own log format → Alloy → Loki. | +| Databases (PostgreSQL, Neo4j) | `warning` | Query-level logging only enabled for active troubleshooting. | +| Docker services (Gitea, LobeChat, Nextcloud, AnythingLLM, SearXNG) | `warn` / `warning` | Per-service default. Tune individually if needed. | +| LLM Proxy (Arke) | `info` | Token usage tracking and provider routing decisions justify INFO. Review periodically for noise. | +| Observability stack (Grafana, Loki, AlertManager) | `warn` | Should be quiet unless something is wrong with observability itself. | + +### Loki & Grafana Alignment + +**Label normalization**: Alloy pipelines (syslog listeners and journal relabeling) MUST extract and forward a `level` label on every log line. Without a `level` label, the log entry is invisible to level-based dashboard filters and alert rules. + +**LogQL conventions for dashboards:** +```logql +# Production error monitoring (default dashboard view) +{job="syslog", hostname="puck"} | json | level=~"error|fatal|critical" + +# Warning-and-above for a specific service +{service_name="haproxy"} | logfmt | level=~"warn|error|fatal" + +# Debug-level troubleshooting (temporary, never permanent dashboards) +{container="angelia"} | json | level="debug" +``` + +**Alerting rules** — Grafana alert rules MUST key off the normalized `level` label: +- `level=~"error|fatal|critical"` → Immediate Pushover notification via AlertManager +- `absent_over_time({service_name="celery_worker"}[10m])` → Worker heartbeat staleness → ERROR severity +- Rate-based: `rate({service_name="arke"} | json | level="error" [5m]) > 0.1` → Sustained error rate + +**Retention alignment**: Loki retention policies should preserve ERROR and WARNING logs longer than DEBUG. DEBUG-level logs generated during troubleshooting sessions should have a short TTL or be explicitly cleaned up. + +--- + +## Health Check Endpoints + +All services MUST expose Kubernetes-style health endpoints at these paths: + +| Endpoint | Purpose | Auth | +|----------|---------|------| +| `GET /live` | **Liveness** — process is running and accepting connections | None | +| `GET /ready` | **Readiness** — process is running AND all dependencies (DB, cache, upstream APIs) are healthy | None | +| `GET /metrics` | Prometheus metrics | IP-restricted (no JWT) | + +- HAProxy uses `health_path: /ready/` for backend health checks — return HTTP 200 when ready +- Health endpoints MUST NOT require authentication +- Third-party services use their native paths (`/api/health`, `/api/healthz`, `/-/healthy`, etc.) + +### Docker Compose Healthchecks + +Use `curl -f` (install curl in images if needed). Do not use `wget --spider`. + +```yaml +healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/live"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s +``` + +--- + +## Endpoint Protection + +| Protected (require valid JWT) | Unprotected | +|-------------------------------|-------------| +| All `/api/v1/*` routes | `GET /live` | +| | `GET /ready` | +| | `GET /metrics` (IP-restricted to internal networks) | +| | `GET /api/auth/login-url` | +| | `POST /api/auth/token` | +| | `POST /api/v1/telemetry` (sendBeacon cannot set headers) | + +> **Why `/api/v1/telemetry` is unprotected**: The browser `sendBeacon` API cannot set `Authorization` headers. The telemetry endpoint must be open to receive client-side error reports and performance data, or browser errors will be silently lost. + +--- + +## Prometheus Metrics + +All services SHOULD expose `GET /metrics` in Prometheus exposition format, scraped by Prospero's Prometheus at 15s intervals. + +- **IP-restricted** to internal networks: `10.10.0.0/24`, `172.16.0.0/12`, `127.0.0.0/8` +- No JWT required — HAProxy and Prometheus scrapers cannot authenticate +- Useful metrics to expose: request totals and durations, error rates, active connections, queue depths, dependency health + +--- + +## Browser Telemetry + +Frontend/browser code MUST report errors and performance data back to the server. + +- Send to `POST /api/v1/telemetry` — unprotected endpoint +- Capture: JavaScript exceptions, promise rejections, resource load failures, performance metrics +- The server MUST log client-side exceptions at **WARNING** level (they indicate user-facing problems but are not server failures) +- Include enough context to reproduce: URL, user agent, error message, stack trace (if available) + +--- + +## Docker Networking + +- Use the **default Docker bridge network** for simple deployments +- Add additional named networks only when required (e.g., isolating database traffic) or explicitly requested +- Do not define custom networks for single-service Docker Compose stacks + +--- + +## Documentation Standards + +Place documentation in the `/docs/` directory of the repository. + +### HTML Documents + +HTML documents must follow [docs/documentation_style_guide.html](documentation_style_guide.html). + +- Use Bootstrap CDN with Bootswatch theme **Flatly** +- Include a dark mode toggle button in the navbar +- Use Bootstrap Icons for icons +- Use Bootstrap CSS for styles — avoid custom CSS +- Use **Mermaid** for diagrams diff --git a/pallas/health.py b/pallas/health.py index c031388..0a12f1f 100644 --- a/pallas/health.py +++ b/pallas/health.py @@ -7,6 +7,7 @@ Validates LLM provider API keys and model availability at startup. import asyncio import json +import logging import os import re from datetime import datetime, timezone @@ -15,6 +16,8 @@ from pathlib import Path import httpx import yaml +logger = logging.getLogger(__name__) + def _config_root() -> Path: """Return the working directory where agents.yaml and fastagent configs live.""" @@ -31,7 +34,6 @@ def _load_deployment_name() -> str: _DEPLOY_NAME = _load_deployment_name() -_PREFIX = f"[{_DEPLOY_NAME}]" # ── Provider API endpoints ─────────────────────────────────────────────────── @@ -170,22 +172,22 @@ async def validate_llm_providers(timeout: float = 5.0) -> dict[str, dict]: err = await _check_anthropic(client, anthropic_key, model_id) if err: results["anthropic"] = {"status": "error", "model": model_id, "message": err} - print(f"{_PREFIX} WARNING: anthropic: {err}") + logger.warning("anthropic: %s", err) else: results["anthropic"] = {"status": "ok", "model": model_id} - print(f"{_PREFIX} anthropic: {model_id} ✓") + logger.info("anthropic: %s ready", model_id) else: # Key is set but Anthropic isn't the active provider — just verify API access err = await _check_anthropic(client, anthropic_key, "claude-sonnet-4-5") if err and "not found" not in err: results["anthropic"] = {"status": "error", "message": err} - print(f"{_PREFIX} WARNING: anthropic: {err}") + logger.warning("anthropic: %s", err) else: results["anthropic"] = {"status": "ok"} - print(f"{_PREFIX} anthropic: API key valid ✓") + logger.info("anthropic: API key valid") elif active_provider == "anthropic": results["anthropic"] = {"status": "error", "message": "API key not configured"} - print(f"{_PREFIX} WARNING: anthropic: API key not configured") + logger.warning("anthropic: API key not configured") # ── OpenAI ─────────────────────────────────────────────────────── if openai_key: @@ -193,22 +195,22 @@ async def validate_llm_providers(timeout: float = 5.0) -> dict[str, dict]: err, models = await _list_openai_models(client, openai_key, openai_base) if err: results["openai"] = {"status": "error", "message": err} - print(f"{_PREFIX} WARNING: openai ({openai_base}): {err}") + logger.warning("openai (%s): %s", openai_base, err) elif model_id: if model_id in models: results["openai"] = {"status": "ok", "model": model_id} - print(f"{_PREFIX} openai ({openai_base}): {model_id} ✓") + logger.info("openai (%s): %s ready", openai_base, model_id) else: label = ", ".join(models) if models else "none" results["openai"] = {"status": "error", "model": model_id, "message": f"model '{model_id}' not found (available: {label})"} - print(f"{_PREFIX} WARNING: openai ({openai_base}): model '{model_id}' not found (available: {label})") + logger.warning("openai (%s): model '%s' not found (available: %s)", openai_base, model_id, label) else: results["openai"] = {"status": "ok", "models": models} label = ", ".join(models) if models else "no models loaded" - print(f"{_PREFIX} openai ({openai_base}): {label} ✓") + logger.info("openai (%s): %s", openai_base, label) elif active_provider == "openai": results["openai"] = {"status": "error", "message": "API key not configured"} - print(f"{_PREFIX} WARNING: openai: API key not configured") + logger.warning("openai: API key not configured") _llm_status.clear() _llm_status.update(results) diff --git a/pallas/log.py b/pallas/log.py new file mode 100644 index 0000000..fe4beaf --- /dev/null +++ b/pallas/log.py @@ -0,0 +1,75 @@ +""" +Logging setup for Pallas. + +Configures structured JSON output so Alloy can extract a ``level`` label from +every log line and feed it into Loki. Call ``setup_logging()`` once from +``server.py:main()`` before any other module emits a log record. + +Log format (one JSON object per line): + {"time": "2026-01-01T00:00:00Z", "level": "INFO", "logger": "pallas.server", "message": "..."} + +Level conventions (Ouranos Lab — Python services use UPPERCASE): + ERROR — something is broken and requires human intervention + WARNING — degraded but self-recovering; retries, missing optional config + INFO — lifecycle events: start, ready, shutdown, LLM preflight + DEBUG — per-request detail; never enabled in production by default +""" + +import json +import logging +from datetime import datetime, timezone + + +class _JSONFormatter(logging.Formatter): + """Single-line JSON formatter compatible with Alloy's ``| json`` pipeline.""" + + def format(self, record: logging.LogRecord) -> str: + return json.dumps( + { + "time": datetime.fromtimestamp(record.created, tz=timezone.utc).strftime( + "%Y-%m-%dT%H:%M:%SZ" + ), + "level": record.levelname, + "logger": record.name, + "message": record.getMessage(), + } + ) + + +class _HealthAccessFilter(logging.Filter): + """Drop uvicorn access log lines for health/metrics endpoints. + + Health check success is the *absence* of errors, not the presence of 200s. + Logging every HAProxy probe at INFO floods syslog with noise. + """ + + _HEALTH_PATHS = (" GET /live ", " GET /ready ", " GET /metrics ") + + def filter(self, record: logging.LogRecord) -> bool: + msg = record.getMessage() + return not any(path in msg for path in self._HEALTH_PATHS) + + +def setup_logging() -> None: + """Configure Pallas logging. + + - ``pallas.*`` logger: INFO, JSON to stdout + - ``httpx`` / ``httpcore``: WARNING (prevent request-level debug flooding) + - ``uvicorn.access``: health path filter applied + """ + handler = logging.StreamHandler() + handler.setFormatter(_JSONFormatter()) + + pallas_logger = logging.getLogger("pallas") + pallas_logger.setLevel(logging.INFO) + if not pallas_logger.handlers: + pallas_logger.addHandler(handler) + pallas_logger.propagate = False + + # Silence noisy HTTP client internals — only surface warnings and above. + for noisy in ("httpx", "httpcore"): + logging.getLogger(noisy).setLevel(logging.WARNING) + + # Suppress successful health probe access log entries. + health_filter = _HealthAccessFilter() + logging.getLogger("uvicorn.access").addFilter(health_filter) diff --git a/pallas/multimodal_server.py b/pallas/multimodal_server.py index 7d70476..001a88f 100644 --- a/pallas/multimodal_server.py +++ b/pallas/multimodal_server.py @@ -28,6 +28,8 @@ from fast_agent.types import PromptMessageExtended, RequestParams from fastmcp import Context as MCPContext from fastmcp.prompts import Message from mcp.types import ImageContent, TextContent +from prometheus_client import CONTENT_TYPE_LATEST, generate_latest +from starlette.responses import JSONResponse, Response logger = get_logger(__name__) @@ -58,6 +60,31 @@ def _history_to_fastmcp_messages( class MultimodalAgentMCPServer(AgentMCPServer): """AgentMCPServer with optional image attachment support on send_message.""" + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._register_health_routes() + + def _register_health_routes(self) -> None: + """Add /live, /ready, and /metrics to this agent's HTTP server. + + Uses FastMCP's custom_route decorator — the same mechanism used by + fast-agent itself for the root ``/`` info route. HAProxy can health + check individual agent backends at ``/ready``. + """ + + @self.mcp_server.custom_route("/live", methods=["GET"]) + async def live(request): + return JSONResponse({"status": "alive"}) + + @self.mcp_server.custom_route("/ready", methods=["GET"]) + async def ready(request): + return JSONResponse({"status": "ready"}) + + @self.mcp_server.custom_route("/metrics", methods=["GET"]) + async def metrics(request): + data = generate_latest() + return Response(content=data, media_type=CONTENT_TYPE_LATEST) + def register_agent_tools(self, agent_name: str) -> None: """Register a send_message tool that accepts text + optional images.""" self._registered_agents.add(agent_name) @@ -116,7 +143,7 @@ class MultimodalAgentMCPServer(AgentMCPServer): async def execute_send() -> str: start = time.perf_counter() - logger.info( + logger.debug( f"MCP request received for agent '{agent_name}'", name="mcp_request_start", agent=agent_name, @@ -124,7 +151,7 @@ class MultimodalAgentMCPServer(AgentMCPServer): ) response = await agent.send(payload, request_params=request_params) duration = time.perf_counter() - start - logger.info( + logger.debug( f"Agent '{agent_name}' completed MCP request", name="mcp_request_complete", agent=agent_name, diff --git a/pallas/registry.py b/pallas/registry.py index acb90ce..42b8f9f 100644 --- a/pallas/registry.py +++ b/pallas/registry.py @@ -4,17 +4,29 @@ Agent Registry Serves GET /.well-known/mcp/server.json for agent discovery. Reads agent topology from agents.yaml and model capabilities from fastagent.config.yaml in the working directory. + +Also exposes standard health and observability endpoints: + GET /live — liveness probe (always 200 while process is up) + GET /ready — readiness probe (200 when all configured agents reachable) + GET /metrics — Prometheus metrics in text exposition format """ +import asyncio +import logging import os from datetime import datetime, timezone from pathlib import Path +import httpx import yaml +from prometheus_client import CONTENT_TYPE_LATEST, CollectorRegistry, Gauge, generate_latest from starlette.applications import Starlette -from starlette.responses import JSONResponse +from starlette.requests import Request +from starlette.responses import JSONResponse, PlainTextResponse, Response from starlette.routing import Route +logger = logging.getLogger(__name__) + def _config_root() -> Path: """Return the working directory where agents.yaml and fastagent configs live.""" @@ -106,17 +118,103 @@ def _build_registry(config: dict) -> dict: return {"servers": entries} -# ── Starlette app ───────────────────────────────────────────────────────────── +# ── Prometheus metrics ──────────────────────────────────────────────────────── + +_metrics_registry = CollectorRegistry() +_pallas_up = Gauge( + "pallas_up", + "1 when the Pallas registry is running", + registry=_metrics_registry, +) +_pallas_up.set(1) + + +def _init_agent_metrics(config: dict) -> None: + """Register per-agent info gauges once at startup.""" + agents = config.get("agents", {}) + if not agents: + return + + agent_info = Gauge( + "pallas_agent_info", + "Static info about configured Pallas agents", + labelnames=["agent", "port"], + registry=_metrics_registry, + ) + for name, agent in agents.items(): + agent_info.labels(agent=name, port=str(agent["port"])).set(1) + + +# ── Route handlers ──────────────────────────────────────────────────────────── _deployment_config = _load_deployment_config() +_init_agent_metrics(_deployment_config) -async def server_json(request): +async def server_json(request: Request) -> JSONResponse: return JSONResponse(_build_registry(_deployment_config)) +async def live(request: Request) -> JSONResponse: + """Liveness probe — always 200 while the process is running.""" + return JSONResponse({"status": "alive"}) + + +async def ready(request: Request) -> Response: + """Readiness probe — 200 when all configured agents are reachable.""" + agents = _deployment_config.get("agents", {}) + if not agents: + return JSONResponse({"status": "ready"}) + + missing: list[str] = [] + async with httpx.AsyncClient(timeout=2.0) as client: + checks = await asyncio.gather( + *(_probe_agent(client, name, agent["port"]) for name, agent in agents.items()), + return_exceptions=True, + ) + + for name, result in zip(agents.keys(), checks): + if isinstance(result, Exception) or result is False: + missing.append(name) + + if missing: + return Response( + content=_json_bytes({"status": "unavailable", "missing": missing}), + status_code=503, + media_type="application/json", + ) + return JSONResponse({"status": "ready"}) + + +async def _probe_agent(client: httpx.AsyncClient, name: str, port: int) -> bool: + """Return True if the agent's MCP port is accepting connections.""" + try: + await client.get(f"http://127.0.0.1:{port}/mcp") + return True + except Exception: + return False + + +async def metrics(request: Request) -> Response: + """Prometheus metrics in text exposition format.""" + data = generate_latest(_metrics_registry) + return Response(content=data, media_type=CONTENT_TYPE_LATEST) + + +def _json_bytes(obj: dict) -> bytes: + import json + return json.dumps(obj).encode() + + +# ── Starlette app ───────────────────────────────────────────────────────────── + app = Starlette( - routes=[Route("/.well-known/mcp/server.json", server_json)], + routes=[ + Route("/.well-known/mcp/server.json", server_json), + Route("/live", live), + Route("/ready", ready), + Route("/metrics", metrics), + ], ) @@ -128,8 +226,13 @@ async def run_registry( import uvicorn deploy_name = _deployment_config.get("name", "pallas") - print(f"[{deploy_name}] Registry on port {port}") + logger.info( + "Registry started: %s, port %d, %d agent(s)", + deploy_name, + port, + len(_deployment_config.get("agents", {})), + ) - config = uvicorn.Config(app, host=host, port=port, log_level="info") + config = uvicorn.Config(app, host=host, port=port, log_level="warning") server = uvicorn.Server(config) await server.serve() diff --git a/pallas/server.py b/pallas/server.py index 826344f..77df015 100644 --- a/pallas/server.py +++ b/pallas/server.py @@ -18,8 +18,11 @@ from pathlib import Path import yaml +from pallas.log import setup_logging from pallas.multimodal_server import MultimodalAgentMCPServer +logger = logging.getLogger(__name__) + def _config_root() -> Path: """Return the working directory where agents.yaml and fastagent configs live.""" @@ -32,13 +35,13 @@ def _load_deployment_config() -> dict: """Load agents.yaml — single source of truth for deployment topology.""" config_path = _config_root() / os.environ.get("PALLAS_AGENTS_CONFIG", "agents.yaml") if not config_path.exists(): - raise SystemExit(f"[pallas] ERROR: deployment config not found: {config_path}") + raise SystemExit(f"deployment config not found: {config_path}") with open(config_path) as f: config = yaml.safe_load(f) or {} if "agents" not in config: - raise SystemExit(f"[pallas] ERROR: no 'agents' section in {config_path}") + raise SystemExit(f"no 'agents' section in {config_path}") return config @@ -98,9 +101,9 @@ def _preflight_mcp_servers(agent_name: str, servers: dict[str, dict]) -> None: for var in unresolved: val = os.environ.get(var, "") if not val: - print( - f"[pallas] WARNING: {agent_name} → {server_name}: " - f"{header_key} references ${{{var}}} but it is not set" + logger.warning( + "%s → %s: %s references ${%s} but it is not set", + agent_name, server_name, header_key, var, ) @@ -140,10 +143,10 @@ def _register_unknown_models() -> None: if is_vision: tokenizes = list(ModelDatabase.QWEN_MULTIMODAL) - print(f"[pallas] Registered model '{model_name}' with vision capabilities") + logger.info("Registered model '%s' with vision capabilities", model_name) else: tokenizes = list(ModelDatabase.TEXT_ONLY) - print(f"[pallas] Registered model '{model_name}' as text-only") + logger.info("Registered model '%s' as text-only", model_name) ModelDatabase.register_runtime_model_params( model_name, @@ -171,7 +174,7 @@ async def _start_agent(name: str, agents: dict[str, tuple[str, int]]) -> None: module = importlib.import_module(module_path) fast_instance = module.fast - print(f"[pallas] Starting {name} agent on port {port} ...") + logger.info("Starting %s agent on port %d", name, port) async with fast_instance.run(): primary_instance = fast_instance._server_managed_instances[0] @@ -207,11 +210,11 @@ async def _wait_for_agent( try: async with httpx.AsyncClient(timeout=2.0) as client: await client.get(url) - print(f"[pallas] {name} is ready ✓") + logger.info("%s is ready", name) return except Exception: await asyncio.sleep(1.0) - print(f"[pallas] WARNING: {name} did not become ready within {timeout}s") + logger.warning("%s did not become ready within %.0fs", name, timeout) async def _run_single(name: str, agents: dict[str, tuple[str, int]]) -> None: @@ -282,18 +285,19 @@ def main() -> None: ) args = parser.parse_args() - logging.getLogger("httpx").setLevel(logging.DEBUG) - logging.getLogger("httpcore").setLevel(logging.DEBUG) + setup_logging() if args.agent: _, port = agents[args.agent] - print(f"[{deploy_name}] Starting {args.agent} agent on port {port} ...") + logger.info("Starting %s agent on port %d", args.agent, port) asyncio.run(_run_single(args.agent, agents)) else: - print(f"[{deploy_name}] Starting all agents + registry ...") - print(f" {'registry':16s} → http://0.0.0.0:{registry_port}/.well-known/mcp/server.json") + logger.info("Starting all agents + registry for %s", deploy_name) + logger.info( + "registry → http://0.0.0.0:%d/.well-known/mcp/server.json", registry_port + ) for name, (_, port) in agents.items(): - print(f" {name:16s} → http://0.0.0.0:{port}/mcp") + logger.info("%-16s → http://0.0.0.0:%d/mcp", name, port) asyncio.run(_start_all(config)) diff --git a/pyproject.toml b/pyproject.toml index a2d1b5d..f289254 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,10 +2,11 @@ name = "pallas-mcp" version = "0.1.0" description = "FastAgent MCP Bridge — generic runtime for serving FastAgent agents over StreamableHTTP" -requires-python = ">=3.13" +requires-python = ">=3.13.5" dependencies = [ "fast-agent-mcp>=0.6.10", "httpx", + "prometheus-client", "pyyaml", "starlette", "uvicorn",