mnemosyne/mnemosyne/library/health_collector.py

"""
Scrape-time Prometheus collector for system-default model reachability.

The ingest-pipeline counters in ``library/metrics.py`` live in the Celery
worker process and only move during an active ingest, so they cannot signal
"models down" on an idle queue. This collector runs in the **web** process
(where ``/metrics`` is served by ``django_prometheus``) and probes the four
system-default models at scrape time, emitting an up/down gauge that is
present regardless of queue activity.

Probe results are cached for a short TTL so rapid scrapes — or multiple
gunicorn workers each scraped in turn — cannot hammer the GPU endpoints.
"""

import logging
import threading
import time

from prometheus_client.core import GaugeMetricFamily

from library.services.model_health import probe_system_models

logger = logging.getLogger(__name__)

# Cache probe results so repeated scrapes don't re-probe the router. The
# value is comfortably above a 15s scrape_interval but bounded so a recovered
# model shows green within a minute.
_CACHE_TTL_SECONDS = 55

_lock = threading.Lock()
_cache: dict = {"ts": 0.0, "results": None}


def _cached_probe() -> list[dict]:
    """Return probe results, re-probing only when the cache has expired."""
    now = time.monotonic()
    with _lock:
        if _cache["results"] is not None and (now - _cache["ts"]) < _CACHE_TTL_SECONDS:
            return _cache["results"]
        try:
            results = probe_system_models()
        except Exception as exc:  # never let a probe failure break /metrics
            logger.warning("Model health probe failed during scrape: %s", exc)
            # Serve the stale cache if we have one; otherwise report nothing.
            return _cache["results"] or []
        _cache["ts"] = now
        _cache["results"] = results
        return results


class SystemModelHealthCollector:
    """prometheus_client custom collector for system-default model health."""

    def collect(self):
        results = _cached_probe()

        up = GaugeMetricFamily(
            "mnemosyne_system_default_model_up",
            "System-default model endpoint reachable (1) or not (0)",
            labels=["role", "model", "api"],
        )
        configured = GaugeMetricFamily(
            "mnemosyne_system_default_model_configured",
            "A system-default model is configured for this role (1) or not (0)",
            labels=["role"],
        )
        latency = GaugeMetricFamily(
            "mnemosyne_system_default_model_probe_latency_seconds",
            "Latency of the last reachability probe for this role",
            labels=["role"],
        )

        for r in results:
            role = r["role"]
            configured.add_metric([role], 1 if r["configured"] else 0)
            if not r["configured"]:
                continue
            up.add_metric(
                [role, r["model_name"] or "", r["api_name"] or ""],
                1 if r["ok"] else 0,
            )
            if r["latency_ms"] is not None:
                latency.add_metric([role], r["latency_ms"] / 1000.0)

        yield configured
        yield up
        yield latency


def register():
    """Register the collector against the default registry (idempotent)."""
    from prometheus_client import REGISTRY

    # Guard against duplicate registration (autoreload, repeated ready()).
    for collector in list(getattr(REGISTRY, "_collector_to_names", {})):
        if isinstance(collector, SystemModelHealthCollector):
            return
    REGISTRY.register(SystemModelHealthCollector())
    logger.info("Registered SystemModelHealthCollector on Prometheus default registry")