chore(compose): add shared json-file logging config and component labels

Introduce x-logging anchor with json-file driver, size/file caps, and container name tagging so Alloy on puck can reliably tail every service through the Docker socket. Apply to all services and inject MNEMOSYNE_COMPONENT env vars (init/app/mcp/worker) for consistent log attribution both
2026-05-11 13:52:00 -04:00
parent 8ddbcf4612
commit 551c641e90
5 changed files with 225 additions and 37 deletions
--- a/mnemosyne/mnemosyne/log_filters.py
+++ b/mnemosyne/mnemosyne/log_filters.py
@@ -0,0 +1,67 @@
+"""Logging filters shared across Mnemosyne processes.
+
+These are project-level (not tied to a Django app) so Celery workers and
+the FastMCP ASGI app can reuse them without importing app modules.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+
+
+# Exact-match paths whose access records are dropped on success (< 400).
+# Anything >= 400 still flows through — a failing probe is a real signal.
+_SUPPRESS_PATHS = frozenset(
+    {
+        "/live/",
+        "/live",
+        "/ready/",
+        "/ready",
+        "/healthz",
+        "/metrics",
+    }
+)
+
+
+class SuppressHealthAccessFilter(logging.Filter):
+    """Drop successful access-log records for health endpoints.
+
+    Applied to ``django.server`` (runserver) and ``gunicorn.access`` via
+    the ``access`` handler in :data:`mnemosyne.settings.LOGGING`.  The filter
+    returns ``False`` (drop the record) only when the request path is a
+    health endpoint AND the HTTP status is below 400.  Any failure on
+    ``/ready/`` or ``/live/`` still propagates so an operator sees
+    readiness flaps.
+
+    The two access loggers format their messages differently:
+
+    * ``django.server`` renders its message as
+      ``'"GET /live/ HTTP/1.1" 200 0'``.
+    * ``gunicorn.access`` typically has the path in ``record.args`` when
+      the access log format is configured, but many deployments fall
+      back to a pre-formatted message.  We parse the final rendered
+      message in both cases to keep the filter portable across Mnemosyne
+      containers (which run gunicorn) and local dev (``runserver``).
+    """
+
+    # _REQUEST_RE captures the request target after the HTTP method (query
+    # string included); _STATUS_RE captures the 3-digit code after a quote.
+    _REQUEST_RE = re.compile(r'"\s*(?:GET|POST|HEAD|OPTIONS|PUT|PATCH|DELETE)\s+(\S+)')
+    _STATUS_RE = re.compile(r'"\s+(\d{3})\b')
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        msg = record.getMessage()
+        path_match = self._REQUEST_RE.search(msg)
+        status_match = self._STATUS_RE.search(msg)
+        if not path_match or not status_match:
+            return True
+
+        path = path_match.group(1).split("?", 1)[0]
+        status = int(status_match.group(1))
+
+        # Only suppress successful probes; surface any 4xx/5xx on a
+        # health endpoint so operators see readiness flaps.
+        if path in _SUPPRESS_PATHS and status < 400:
+            return False
+        return True
--- a/mnemosyne/mnemosyne/settings.py
+++ b/mnemosyne/mnemosyne/settings.py
@@ -278,35 +278,75 @@ THEMIS_NOTIFICATION_POLL_INTERVAL = 60
 THEMIS_NOTIFICATION_MAX_AGE_DAYS = 90

 # --- Structured Logging ---
+# All log output is line-delimited JSON on stdout, one record per line.
+# Alloy (running on the host / container sidecar) tails the container's
+# stdout stream and ships to Loki.  No log files, no syslog — a single,
+# uniform transport across every service on this host.
+#
+# Alloy attaches the stream labels (service, component, environment,
+# hostname).  The formatter also embeds "service" and "component" in
+# every record as static fields; "component" comes from the
+# MNEMOSYNE_COMPONENT env var set per docker-compose service (init |
+# app | mcp | worker).  This matches Pallas and future services.
+#
+# Level policy (Ouranos Lab standard):
+#   ERROR   — broken; requires human attention
+#   WARNING — degraded but self-recovering; retries, skipped items
+#   INFO    — lifecycle events and failures; no 200 OK health probes
+#   DEBUG   — health-probe success, per-request detail, verbose traces
 LOGGING_LEVEL = env("LOGGING_LEVEL", default="INFO")
 CELERY_LOGGING_LEVEL = env("CELERY_LOGGING_LEVEL", default="INFO")
 DJANGO_LOGGING_LEVEL = env("DJANGO_LOGGING_LEVEL", default="WARNING")
+MNEMOSYNE_COMPONENT = env("MNEMOSYNE_COMPONENT", default="app")

 LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
-        "structured": {
-            "format": (
-                "[%(levelname)s] %(asctime)s "
-                "service=mnemosyne "
-                "module=%(name)s "
-                "func=%(funcName)s "
-                "line=%(lineno)d "
-                "%(message)s"
+        # JSON formatter — one line of JSON per record.  Loki's ``| json``
+        # parser in LogQL will extract these fields at query time
+        # (time, level, logger, funcName, lineno, message, plus anything
+        # passed via ``logger.info("...", extra={...})``).
+        "json": {
+            "()": "pythonjsonlogger.json.JsonFormatter",
+            "fmt": (
+                "%(asctime)s %(levelname)s %(name)s "
+                "%(funcName)s %(lineno)d %(message)s"
            ),
-            "datefmt": "%Y-%m-%d %H:%M:%S",
+            "rename_fields": {
+                "asctime": "time",
+                "levelname": "level",
+                "name": "logger",
+            },
+            "static_fields": {
+                "service": "mnemosyne",
+                "component": MNEMOSYNE_COMPONENT,
+            },
        },
-        "simple": {
-            "format": "[%(levelname)s] %(name)s: %(message)s",
+    },
+    "filters": {
+        # Drops successful (status < 400) health-probe access log lines
+        # so production INFO output stays signal-only.  Applied to
+        # django.server and gunicorn.access; uvicorn does its own thing
+        # via the structlog-style filter in mcp_server.
+        "suppress_health_access": {
+            "()": "mnemosyne.log_filters.SuppressHealthAccessFilter",
        },
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
-            "formatter": "structured",
+            "formatter": "json",
            "stream": "ext://sys.stdout",
        },
+        # Separate handler for django/gunicorn access logs so we can apply
+        # the health-path filter without affecting application loggers.
+        "access": {
+            "class": "logging.StreamHandler",
+            "formatter": "json",
+            "stream": "ext://sys.stdout",
+            "filters": ["suppress_health_access"],
+        },
    },
    "loggers": {
        "library": {
@@ -324,6 +364,11 @@ LOGGING = {
            "level": LOGGING_LEVEL,
            "propagate": False,
        },
+        "mcp_server": {
+            "handlers": ["console"],
+            "level": LOGGING_LEVEL,
+            "propagate": False,
+        },
        "celery": {
            "handlers": ["console"],
            "level": CELERY_LOGGING_LEVEL,
@@ -339,9 +384,31 @@ LOGGING = {
            "level": DJANGO_LOGGING_LEVEL,
            "propagate": False,
        },
+        # Django's runserver / gunicorn access logs — successful health
+        # probes are filtered out so "5xx on /ready/" is easy to spot.
+        "django.server": {
+            "handlers": ["access"],
+            "level": DJANGO_LOGGING_LEVEL,
+            "propagate": False,
+        },
+        "gunicorn.access": {
+            "handlers": ["access"],
+            "level": DJANGO_LOGGING_LEVEL,
+            "propagate": False,
+        },
+        # Noisy library internals — pin to WARNING regardless of root level
+        # so we don't drown in HTTP-client debug spam when LOGGING_LEVEL=DEBUG.
+        "httpx": {"handlers": ["console"], "level": "WARNING", "propagate": False},
+        "httpcore": {"handlers": ["console"], "level": "WARNING", "propagate": False},
+        "openai": {"handlers": ["console"], "level": "WARNING", "propagate": False},
+        "urllib3": {"handlers": ["console"], "level": "WARNING", "propagate": False},
+        "botocore": {"handlers": ["console"], "level": "WARNING", "propagate": False},
+        "boto3": {"handlers": ["console"], "level": "WARNING", "propagate": False},
+        "s3transfer": {"handlers": ["console"], "level": "WARNING", "propagate": False},
+        "neo4j": {"handlers": ["console"], "level": "WARNING", "propagate": False},
    },
    "root": {
        "handlers": ["console"],
        "level": LOGGING_LEVEL,
    },
-}
+}