From f92f3fc71094f6356a4c985dec9d703073016bf1 Mon Sep 17 00:00:00 2001 From: Robert Helewka Date: Mon, 11 May 2026 13:52:25 -0400 Subject: [PATCH] feat(log): add service/project/component labels for Loki Inject stable JSON fields (`service`, `project`, `component`) on every log record via a `logging.Filter` so Alloy-forwarded logs share the same label shape as Mnemosyne/Daedalus. The `project` label is set once from `agents.yaml`, and `component` is stored in a ContextVar so each agent's asyncio task carries its own value without leaking across sibling agents. Also switch the opt-in console sink from `sys.__stderr__` to `sys.__stdout__` (`PALLAS_LOG_STDOUT=1`) for cleaner journald capture in systemd deployments. --- pallas/log.py | 158 +++++++++++++++++++++++++++++++++++++---------- pallas/server.py | 17 ++++- 2 files changed, 143 insertions(+), 32 deletions(-) diff --git a/pallas/log.py b/pallas/log.py index 0c483ab..25ed3eb 100644 --- a/pallas/log.py +++ b/pallas/log.py @@ -15,6 +15,7 @@ Level conventions (Ouranos Lab — Python services use UPPERCASE): DEBUG — per-request detail; never enabled in production by default """ +import contextvars import json import logging import logging.handlers @@ -25,6 +26,70 @@ from datetime import datetime, timezone from pathlib import Path +# ── Static / contextual fields ──────────────────────────────────────────────── +# +# Every log line Alloy forwards to Loki gets a small, stable set of JSON +# fields so operators can pivot between services with the same LogQL label +# shape used by Mnemosyne and Daedalus: +# +# service — always "pallas" +# project — the calling project's ``agents.yaml`` ``name:`` (e.g. "kottos", +# "mentor", "iolaus"), set once from ``server.main()`` +# component — the agent currently executing (e.g. "harper", "scotty", +# "research"). Stored in a ContextVar so each asyncio task +# started by ``_start_agent`` carries its own value without +# leaking across agents. +# +# These are added by a ``logging.Filter`` rather than being baked into the +# formatter so DEBUG messages from fast-agent, fastmcp, the MCP SDK and +# anthropic/openai clients all get the same attribution — exactly what Loki +# query filters depend on. + +_PROJECT: str = "pallas" # overridden once by set_project() +_COMPONENT_CTX: contextvars.ContextVar[str] = contextvars.ContextVar( + "pallas_component", default="runtime" +) + + +def set_project(name: str) -> None: + """Set the project label on all subsequent log records. + + Called once from ``pallas.server.main()`` after reading ``agents.yaml``. + """ + global _PROJECT + if name: + _PROJECT = name + + +def set_agent_component(name: str) -> None: + """Bind the current asyncio task's component label to ``name``. + + ``pallas.server._start_agent`` calls this immediately before entering + ``fast_instance.run()`` so every log line emitted by that agent's + task — including records that bubble up from fast-agent / fastmcp / + the MCP SDK — is tagged with the agent's name. ContextVar semantics + mean sibling agent tasks started from the same ``asyncio.gather`` call + each carry their own value without interference. + """ + _COMPONENT_CTX.set(name) + + +class _StaticFieldsFilter(logging.Filter): + """Inject ``service`` / ``project`` / ``component`` onto every record. + + Runs before the JSON formatter so the fields appear as top-level keys + (not nested under ``extra``). The filter is attached to every handler + — both on the root logger and on ``pallas.*`` — so third-party library + records pick up the same labels that Pallas's own records carry. + """ + + def filter(self, record: logging.LogRecord) -> bool: + record.service = "pallas" + record.project = _PROJECT + record.component = _COMPONENT_CTX.get() + return True + + class _JSONFormatter(logging.Formatter): """Single-line JSON formatter compatible with Alloy's ``| json`` pipeline. @@ -51,6 +116,12 @@ class _JSONFormatter(logging.Formatter): "level": record.levelname, "logger": record.name, "message": record.getMessage(), + # Set by _StaticFieldsFilter; fall back to module-level defaults + # in case the filter is somehow bypassed (e.g. third-party handler + # that doesn't inherit our filters). + "service": getattr(record, "service", "pallas"), + "project": getattr(record, "project", _PROJECT), + "component": getattr(record, "component", _COMPONENT_CTX.get()), } if record.exc_info: if not record.exc_text: @@ -173,20 +244,26 @@ def setup_logging() -> None: stdout and swallows plain ``print`` / stderr writes mid-render, so a file sink is the only guaranteed way to see DEBUG diagnostics and tracebacks. - * ``StreamHandler`` → ``sys.__stderr__`` (opt-in). Uses the *original* - stderr FD captured before Rich installed its Live display, so records - would bypass the progress renderer and land in the terminal or journal - verbatim. In interactive ``fast-agent go`` sessions this still - smashes the Rich UI (Rich redraws on top but our JSON lines leak - through every repaint), so we only attach it when ``PALLAS_LOG_STDERR`` - is set to a truthy value — production systemd deployments that want - journal capture should set ``PALLAS_LOG_STDERR=1``; interactive users - get a clean TUI and rely on the rotating file sink. + * ``StreamHandler`` → ``sys.__stdout__`` (opt-in via ``PALLAS_LOG_STDOUT=1``). + The preferred sink for a systemd-managed deployment: stdout becomes a + journald entry with ``PRIORITY=6`` (INFO) and SyslogIdentifier matching + the unit, which Alloy's ``loki.source.journal`` then relabels into + Loki. Stdout is also what Docker's ``json-file`` driver captures, so + this same flag works if Pallas is ever containerised. + * ``StreamHandler`` → ``sys.__stderr__`` (opt-in via ``PALLAS_LOG_STDERR=1``). + Uses the *original* stderr FD captured before Rich installed its Live + display, so records would bypass the progress renderer and land in the + terminal or journal verbatim. In interactive ``fast-agent go`` sessions + this still smashes the Rich UI (Rich redraws on top but our JSON lines + leak through every repaint), so we only attach it when the operator + explicitly opts in. Retained for backwards compatibility; new + deployments should prefer ``PALLAS_LOG_STDOUT=1``. - ``httpx`` / ``httpcore``: WARNING (prevent request-level debug flooding) - ``uvicorn.access``: health path filter applied """ formatter = _JSONFormatter() + static_fields = _StaticFieldsFilter() # Pallas's own diagnostics level — drives the ``pallas.*`` loggers and # the file sink verbosity. Unchanged behaviour for operators. @@ -218,32 +295,51 @@ def setup_logging() -> None: # so third-party libraries' records land in the same file sink. handlers: list[logging.Handler] = [] log_file: Path | None = None - try: - log_file = _resolve_log_file() - log_file.parent.mkdir(parents=True, exist_ok=True) - file_handler = logging.handlers.RotatingFileHandler( - log_file, - maxBytes=10 * 1024 * 1024, - backupCount=5, - encoding="utf-8", - ) - file_handler.setFormatter(formatter) - handlers.append(file_handler) - except Exception as exc: # pragma: no cover — never fatal - logging.getLogger(__name__).warning( - "could not attach file log handler: %s", exc - ) + # Rotating file sink. Disable by pointing ``PALLAS_LOG_FILE`` at + # ``/dev/null`` — systemd deployments that rely on journal capture can do + # this to avoid writing a second copy of every record to disk. + log_path_override = os.environ.get("PALLAS_LOG_FILE", "") + if log_path_override == "/dev/null": + log_file = None + else: + try: + log_file = _resolve_log_file() + log_file.parent.mkdir(parents=True, exist_ok=True) + file_handler = logging.handlers.RotatingFileHandler( + log_file, + maxBytes=10 * 1024 * 1024, + backupCount=5, + encoding="utf-8", + ) + file_handler.setFormatter(formatter) + file_handler.addFilter(static_fields) + handlers.append(file_handler) + except Exception as exc: # pragma: no cover — never fatal + logging.getLogger(__name__).warning( + "could not attach file log handler: %s", exc + ) - # Stderr sink — *opt-in* only. Interactive ``fast-agent go`` sessions - # install Rich's Live display which owns the terminal; emitting JSON - # log records to stderr during that session corrupts the TUI on every - # redraw and made Pallas effectively unusable in dev. Operators who - # want systemd/journal capture can re-enable by exporting - # ``PALLAS_LOG_STDERR=1`` before launching. The rotating file sink - # above is the always-on durable capture. + # Stdout sink — preferred for systemd + journald deployments (PPLG path: + # journald → Alloy journal source → Loki). Emits the same JSON as the + # rotating file sink so nothing is lost when operators disable the file. + if os.environ.get("PALLAS_LOG_STDOUT", "").lower() in ("1", "true", "yes"): + stdout_handler = logging.StreamHandler(stream=sys.__stdout__) + stdout_handler.setFormatter(formatter) + stdout_handler.addFilter(static_fields) + handlers.append(stdout_handler) + + # Stderr sink — *opt-in* only, retained for backwards compatibility. + # Interactive ``fast-agent go`` sessions install Rich's Live display + # which owns the terminal; emitting JSON log records to stderr during + # that session corrupts the TUI on every redraw and made Pallas + # effectively unusable in dev. New systemd deployments should prefer + # ``PALLAS_LOG_STDOUT=1`` — stderr remains available when operators + # want to keep stdout reserved for non-log output (rare, but cheap + # to support). if os.environ.get("PALLAS_LOG_STDERR", "").lower() in ("1", "true", "yes"): stream_handler = logging.StreamHandler(stream=sys.__stderr__) stream_handler.setFormatter(formatter) + stream_handler.addFilter(static_fields) handlers.append(stream_handler) # Root logger carries everything from libraries we do NOT own. We diff --git a/pallas/server.py b/pallas/server.py index b05447b..be88a8f 100644 --- a/pallas/server.py +++ b/pallas/server.py @@ -18,7 +18,7 @@ from pathlib import Path import yaml -from pallas.log import setup_logging +from pallas.log import set_agent_component, set_project, setup_logging from pallas.multimodal_server import MultimodalAgentMCPServer logger = logging.getLogger(__name__) @@ -198,6 +198,14 @@ async def _preflight(deployment_config: dict) -> None: async def _start_agent(name: str, agents: dict[str, dict]) -> None: from pallas.health import register_health_tool + # Tag every log record emitted from this asyncio task with the agent + # name (``component`` field). ContextVar scope is per-task, so each + # agent started from ``asyncio.gather`` carries its own value and the + # registry's own task sees ``component="runtime"`` by default. Fast- + # agent, fastmcp, anthropic and openai records all inherit this tag + # because the _StaticFieldsFilter is attached to every handler. + set_agent_component(name) + entry = agents[name] module_path = entry["module"] port = entry["port"] @@ -337,6 +345,13 @@ def main() -> None: ) args = parser.parse_args() + # Project label (e.g. "kottos", "mentor", "iolaus") is read from + # ``agents.yaml`` and stamped onto every log record via + # ``pallas.log._StaticFieldsFilter``. Must be set before + # ``setup_logging()`` so the "log file: …" bootstrap record already + # carries the right project label. + set_project(deploy_name) + setup_logging() if args.agent: