log: route third-party + traceback records through the pallas log

The existing setup only attached the file/stderr handlers to the 'pallas' namespace, so every record emitted by fast-agent, fastmcp, the MCP SDK, Anthropic, uvicorn etc. disappeared into Rich's progress display and never hit pallas.log. When one of those libraries raised and logged 'something failed' via logger.error(..., exc_info=True), we ended up grepping a Rich-overwritten TTY for a traceback that was already long gone -- exactly the situation blocking the current Mnemosyne debug. This patch: * Extends _JSONFormatter to serialise exc_info/stack_info as a 'traceback' field when present, so Loki/grep sees the full stack. * Attaches the same file+stderr handlers to the *root* logger so every library's records (and any uncaught logger.error tracebacks) land in pallas.log with the stack attached. * Keeps the 'pallas' logger's own handlers (propagate=False) so our records are unaffected by any later root-handler manipulation. * Tags our handlers with _pallas_attached so repeated setup_logging() calls are idempotent -- important because uvicorn workers and fast-agent subagent subprocesses each reinitialise logging. httpx/httpcore stay at WARNING so we don't flood the log with per- request body traces on a DEBUG deployment. Demote third-party namespaces further in a follow-up if needed.
2026-05-05 22:46:14 -04:00
parent 66b5dd7bdd
commit ac4af942ab
1 changed files with 105 additions and 43 deletions
--- a/pallas/log.py
+++ b/pallas/log.py
@@ -26,19 +26,39 @@ from pathlib import Path


 class _JSONFormatter(logging.Formatter):
-    """Single-line JSON formatter compatible with Alloy's ``| json`` pipeline."""
+    """Single-line JSON formatter compatible with Alloy's ``| json`` pipeline.
+
+    Captures ``exc_info`` when present so callers that do
+    ``logger.error("...", exc_info=True)`` (or let an unhandled exception
+    bubble into a log handler) get the full traceback preserved as a
+    ``traceback`` field.  Without this, every ``logger.error`` in
+    fast-agent / fastmcp / the MCP SDK loses its stack trace and we end
+    up guessing from the single-line message — which is exactly the
+    rabbit hole we spent hours in during the bearer-forwarding debug.
+
+    Also pulls in ``record.exc_text`` if the formatter was already
+    populated upstream (e.g. by another handler), avoiding duplicate
+    ``traceback.format_exception`` work.
+    """

    def format(self, record: logging.LogRecord) -> str:
-        return json.dumps(
-            {
-                "time": datetime.fromtimestamp(record.created, tz=timezone.utc).strftime(
-                    "%Y-%m-%dT%H:%M:%SZ"
-                ),
-                "level": record.levelname,
-                "logger": record.name,
-                "message": record.getMessage(),
-            }
-        )
+        payload: dict[str, object] = {
+            "time": datetime.fromtimestamp(record.created, tz=timezone.utc).strftime(
+                "%Y-%m-%dT%H:%M:%SZ"
+            ),
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+        }
+        if record.exc_info:
+            if not record.exc_text:
+                record.exc_text = self.formatException(record.exc_info)
+            payload["traceback"] = record.exc_text
+        elif record.exc_text:
+            payload["traceback"] = record.exc_text
+        if record.stack_info:
+            payload["stack"] = self.formatStack(record.stack_info)
+        return json.dumps(payload)


 class _HealthAccessFilter(logging.Filter):
@@ -131,13 +151,26 @@ def setup_logging() -> None:
    ``pallas.auth`` loggers — essential when troubleshooting Mnemosyne /
    workspace-scoped agent calls.

+    Scope of handlers:
+
+    * The ``pallas`` logger gets its own copies (``propagate=False``) so our
+      own records never double-log if the root handlers are ever changed.
+    * The **root** logger also gets the same file+stderr handlers.  This is
+      what captures everything emitted by fast-agent / fastmcp / the MCP SDK
+      / Anthropic / uvicorn — critically including their tracebacks via
+      ``_JSONFormatter``'s ``exc_info`` support.  Without this, a
+      ``logger.error("send_request failed", exc_info=True)`` inside
+      fast-agent disappears into Rich's progress display and we're left
+      guessing from a single-line message.
+
    Handlers attached:

    * ``RotatingFileHandler`` → ``PALLAS_LOG_FILE`` (or
      ``~/.local/state/pallas/pallas.log``).  Durable capture that survives
      Rich's console takeover — fast-agent's ``progress_display`` hijacks
      stdout and swallows plain ``print`` / stderr writes mid-render, so a
-      file sink is the only guaranteed way to see DEBUG diagnostics.
+      file sink is the only guaranteed way to see DEBUG diagnostics and
+      tracebacks.
    * ``StreamHandler`` → ``sys.__stderr__``.  Uses the *original* stderr
      file descriptor captured before Rich installed its Live display, so
      records bypass the progress renderer and land in the terminal or
@@ -155,43 +188,72 @@ def setup_logging() -> None:
    ).upper()
    level = getattr(logging, level_name, logging.INFO)

+    # Build the shared handlers once — they get attached to both the
+    # ``pallas`` logger (with ``propagate=False``) *and* the root logger,
+    # so third-party libraries' records land in the same file sink.
+    handlers: list[logging.Handler] = []
+    log_file: Path | None = None
+    try:
+        log_file = _resolve_log_file()
+        log_file.parent.mkdir(parents=True, exist_ok=True)
+        file_handler = logging.handlers.RotatingFileHandler(
+            log_file,
+            maxBytes=10 * 1024 * 1024,
+            backupCount=5,
+            encoding="utf-8",
+        )
+        file_handler.setFormatter(formatter)
+        handlers.append(file_handler)
+    except Exception as exc:  # pragma: no cover — never fatal
+        logging.getLogger(__name__).warning(
+            "could not attach file log handler: %s", exc
+        )
+
+    # Stderr sink — writes to the *original* FD Rich captured at import
+    # time, so our records skip the progress display and hit the TTY
+    # (or systemd journal) directly.
+    stream_handler = logging.StreamHandler(stream=sys.__stderr__)
+    stream_handler.setFormatter(formatter)
+    handlers.append(stream_handler)
+
+    # Root logger carries everything from libraries we do NOT own.
+    # We set the level low enough to accept our configured level; each
+    # noisy namespace is dialled back below.  Handlers are attached here,
+    # not on child loggers, so records propagate to a single sink.
+    root = logging.getLogger()
+    root.setLevel(level)
+    # Avoid duplicate handlers across re-entrant setup_logging() calls
+    # (e.g. if the agent subprocess re-imports pallas with an altered
+    # config).  We tag ours so a safe idempotent re-attach is possible.
+    for h in list(root.handlers):
+        if getattr(h, "_pallas_attached", False):
+            root.removeHandler(h)
+    for h in handlers:
+        setattr(h, "_pallas_attached", True)
+        root.addHandler(h)
+
    pallas_logger = logging.getLogger("pallas")
    pallas_logger.setLevel(level)
+    # Keep propagate=False so our own records don't appear twice (once via
+    # the dedicated handlers below, once via the root logger).  We give
+    # ``pallas`` its own copies of the handlers instead of relying on
+    # propagation — this keeps its behaviour stable even if some other
+    # bootstrap code clears the root handlers later.
    pallas_logger.propagate = False
+    for h in list(pallas_logger.handlers):
+        if getattr(h, "_pallas_attached", False):
+            pallas_logger.removeHandler(h)
+    for h in handlers:
+        pallas_logger.addHandler(h)

-    if not pallas_logger.handlers:
-        # File sink — always present so DEBUG output is never lost to a
-        # Rich-owned terminal.  Rotate at 10 MiB, keep 5 archives (≈50 MiB
-        # max per agent) which easily spans a multi-day debugging session.
-        try:
-            log_file = _resolve_log_file()
-            log_file.parent.mkdir(parents=True, exist_ok=True)
-            file_handler = logging.handlers.RotatingFileHandler(
-                log_file,
-                maxBytes=10 * 1024 * 1024,
-                backupCount=5,
-                encoding="utf-8",
-            )
-            file_handler.setFormatter(formatter)
-            pallas_logger.addHandler(file_handler)
-            # Announce the sink so operators can tail the right path without
-            # guessing — emitted at INFO so even a WARNING-only deployment
-            # won't see it; at DEBUG it's easy to grep for.
-            pallas_logger.info("log file: %s", log_file)
-        except Exception as exc:  # pragma: no cover — never fatal
-            # Fall back silently; the stderr handler below still works.
-            logging.getLogger(__name__).warning(
-                "could not attach file log handler: %s", exc
-            )
-
-        # Stderr sink — writes to the *original* FD Rich captured at import
-        # time, so our records skip the progress display and hit the TTY
-        # (or systemd journal) directly.
-        stream_handler = logging.StreamHandler(stream=sys.__stderr__)
-        stream_handler.setFormatter(formatter)
-        pallas_logger.addHandler(stream_handler)
+    if log_file is not None:
+        # Announce the sink so operators can tail the right path without
+        # guessing — emitted at INFO so even a WARNING-only deployment
+        # won't see it; at DEBUG it's easy to grep for.
+        pallas_logger.info("log file: %s", log_file)

    # Silence noisy HTTP client internals — only surface warnings and above.
+    # Applied *after* root-level configuration so these win.
    for noisy in ("httpx", "httpcore"):
        logging.getLogger(noisy).setLevel(logging.WARNING)