feat(metrics): add scrape-time system model health collector
All checks were successful
CVE Scan & Docker Build / security-scan (push) Successful in 3m49s
Build & Deploy Docs / build-and-deploy (push) Successful in 1m9s
CVE Scan & Docker Build / build-and-push (push) Successful in 3m32s

Add a Prometheus custom collector that probes the four system-default
models (chat, vision, embedding, reranker) at /metrics scrape time and
emits up/down, configured, and probe-latency gauges. This complements
the ingest-pipeline counters in the Celery worker, which only move
during active ingests and cannot signal model outages on an idle queue.

- New `library/health_collector.py` registers a custom collector with
  a 55s in-process cache to avoid hammering GPU endpoints on rapid
  scrapes or across multiple gunicorn workers.
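- New `library/services/model_health.py` centralises the probe logic,
  resolving the system-default model for each role through the
  `LLMModel.get_system_*_model()` getters and probing its API with a
  cheap `GET {base_url}/models` (or a `HEAD` on the base URL for APIs
  that are not OpenAI-compatible) under a short timeout.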
- Register the collector only in the web process (gunicorn/runserver)
  via `LibraryConfig.ready`, excluding Celery, pytest, and management
  commands to prevent duplicate registration and stray probes.
- Add unit tests covering the collector cache, metric shape, and
  per-role probe dispatch.
This commit is contained in:
2026-06-17 09:06:11 -04:00
parent 4dde063299
commit a90c6e7479
6 changed files with 277 additions and 1 deletions

View File

@@ -88,6 +88,29 @@ def _should_skip_probe() -> bool:
return False return False
def _is_web_process() -> bool:
    """
    Return True when running inside the web (gunicorn / runserver) process.

    The reachability collector must only register here: ``/metrics`` is served
    by the web process, and registering in the Celery worker would both probe
    the GPU endpoints from a process whose metrics nobody scrapes and risk
    duplicate registration.

    Detection looks at the launched program's *name*, not the full ``argv[0]``
    path, so an install or project directory that merely contains "celery",
    "pytest" or "gunicorn" cannot flip the result. ``python -m pkg`` launches
    (where ``argv[0]`` is ``.../pkg/__main__.py``) are resolved to the package
    directory name. Anything not recognised as gunicorn or ``runserver`` --
    which includes every other management command -- is treated as non-web.
    """
    argv = sys.argv
    if not argv:
        # An empty argv cannot identify a web server; stay on the safe side.
        return False

    program = os.path.basename(argv[0])
    if program.startswith("__main__"):
        # ``python -m gunicorn`` / ``python -m celery``: use the package name.
        program = os.path.basename(os.path.dirname(argv[0]))
    subcommand = argv[1] if len(argv) >= 2 else ""

    if "celery" in program or subcommand == "celery":
        return False
    if "pytest" in program or "PYTEST_CURRENT_TEST" in os.environ:
        return False
    # gunicorn (prod) or runserver (dev).
    return "gunicorn" in program or subcommand == "runserver"
def _run_startup_probe(): def _run_startup_probe():
""" """
Emit ERROR/WARNING logs if the stack is misconfigured for search. Emit ERROR/WARNING logs if the stack is misconfigured for search.
@@ -199,4 +222,7 @@ class LibraryConfig(AppConfig):
verbose_name = "Library" verbose_name = "Library"
def ready(self):
    """Register the scrape-time model health collector in web processes only."""
    if _is_web_process():
        # Imported lazily so non-web processes (Celery, pytest, management
        # commands) never load the collector module.
        from library.health_collector import register

        register()

View File

@@ -0,0 +1,99 @@
"""
Scrape-time Prometheus collector for system-default model reachability.
The ingest-pipeline counters in ``library/metrics.py`` live in the Celery
worker process and only move during an active ingest, so they cannot signal
"models down" on an idle queue. This collector runs in the **web** process
(where ``/metrics`` is served by ``django_prometheus``) and probes the four
system-default models at scrape time, emitting an up/down gauge that is
present regardless of queue activity.
Probe results are cached for a short TTL so rapid scrapes — or multiple
gunicorn workers each scraped in turn — cannot hammer the GPU endpoints.
"""
import logging
import threading
import time
from prometheus_client.core import GaugeMetricFamily
from library.services.model_health import probe_system_models
logger = logging.getLogger(__name__)
# Cache probe results so repeated scrapes don't re-probe the router. The
# value is comfortably above a 15s scrape_interval but bounded so a recovered
# model shows green within a minute.
_CACHE_TTL_SECONDS = 55
# Guards every read and write of ``_cache``; ``_cached_probe`` holds it for
# its whole body, including the probe call itself.
_lock = threading.Lock()
# Module-level (per-process) cache:
#   "ts"      -- ``time.monotonic()`` value stored alongside the last
#                successful probe's results.
#   "results" -- the list returned by ``probe_system_models``, or None until
#                a probe has succeeded.
_cache: dict = {"ts": 0.0, "results": None}
def _cached_probe() -> list[dict]:
    """Return probe results, re-probing only when the cache has expired.

    The lock is held for the whole call, so concurrent scrapes in this process
    wait for one probe instead of each starting their own.

    Cache age is measured from the moment the last successful probe
    *finished*. Stamping the cache before the probe would let a slow probe
    (several endpoints each running into their timeout) eat into the TTL.

    If the probe raises, the failure is logged and the previous results are
    served unchanged; the cache timestamp is not refreshed, so the next scrape
    tries again.

    Returns:
        The per-role result dicts from ``probe_system_models``, or an empty
        list when no probe has succeeded yet.
    """
    with _lock:
        cached = _cache["results"]
        # Read the clock under the lock so a caller that waited for another
        # thread's probe compares against the current time.
        if cached is not None and (time.monotonic() - _cache["ts"]) < _CACHE_TTL_SECONDS:
            return cached
        try:
            results = probe_system_models()
        except Exception as exc:  # never let a probe failure break /metrics
            logger.warning("Model health probe failed during scrape: %s", exc)
            # Serve the stale cache if we have one; otherwise report nothing.
            return cached or []
        _cache["ts"] = time.monotonic()
        _cache["results"] = results
        return results
class SystemModelHealthCollector:
    """prometheus_client custom collector for system-default model health.

    Exposes three gauge families:

    * ``mnemosyne_system_default_model_configured{role}``
    * ``mnemosyne_system_default_model_up{role, model, api}``
    * ``mnemosyne_system_default_model_probe_latency_seconds{role}``

    ``describe()`` is implemented on purpose. Without it, the default registry
    discovers metric names by calling ``collect()`` at registration time,
    which would run a real probe (database lookups plus HTTP requests) while
    the app is still starting up.
    """

    @staticmethod
    def _new_families():
        """Build the three gauge families, without any samples."""
        configured = GaugeMetricFamily(
            "mnemosyne_system_default_model_configured",
            "A system-default model is configured for this role (1) or not (0)",
            labels=["role"],
        )
        up = GaugeMetricFamily(
            "mnemosyne_system_default_model_up",
            "System-default model endpoint reachable (1) or not (0)",
            labels=["role", "model", "api"],
        )
        latency = GaugeMetricFamily(
            "mnemosyne_system_default_model_probe_latency_seconds",
            "Latency of the last reachability probe for this role",
            labels=["role"],
        )
        return configured, up, latency

    def describe(self):
        """Return the metric families without samples; never probes."""
        return list(self._new_families())

    def collect(self):
        """Yield the gauge families filled from the (cached) probe results."""
        results = _cached_probe()
        configured, up, latency = self._new_families()

        for r in results:
            role = r["role"]
            configured.add_metric([role], 1 if r["configured"] else 0)
            if not r["configured"]:
                # No endpoint to report on: emit only the "configured" sample.
                continue
            up.add_metric(
                [role, r["model_name"] or "", r["api_name"] or ""],
                1 if r["ok"] else 0,
            )
            if r["latency_ms"] is not None:
                # Probe results carry milliseconds; the metric is in seconds.
                latency.add_metric([role], r["latency_ms"] / 1000.0)

        yield configured
        yield up
        yield latency
def register():
    """Attach a ``SystemModelHealthCollector`` to the default registry once.

    Does nothing when the registry already holds an instance of the collector,
    so repeated calls (autoreload, repeated ``ready()``) are harmless.
    """
    from prometheus_client import REGISTRY

    # Snapshot the registry's known collectors before inspecting them; the
    # attribute is read defensively and treated as empty when absent.
    known_collectors = tuple(getattr(REGISTRY, "_collector_to_names", {}))
    already_registered = any(
        isinstance(existing, SystemModelHealthCollector)
        for existing in known_collectors
    )
    if already_registered:
        return

    REGISTRY.register(SystemModelHealthCollector())
    logger.info("Registered SystemModelHealthCollector on Prometheus default registry")

View File

@@ -0,0 +1,119 @@
"""
System-default model reachability probes.
Provides a cheap, bounded liveness check for the four system-default models
(embedding, chat, vision, reranker) so the embedding dashboard and the
scrape-time Prometheus collector can surface "model not responding" without
running an ingest.
The probe deliberately hits ``GET {base_url}/models`` as its primary check:
on an OpenAI-compatible router (e.g. the llama-router) this answers instantly
without loading a model, so repeated probes never burn GPU time. This mirrors
the GPU-avoidance principle in ``mcp_server/tools/health.py``.
"""
import logging
import time
from typing import Optional
import requests
logger = logging.getLogger(__name__)
# api_type values whose endpoints expose an OpenAI-compatible ``/models`` list.
# ``probe_api`` probes these with ``GET {base_url}/models``; any other
# api_type falls back to a ``HEAD`` request on the base URL.
_OPENAI_COMPATIBLE = {"openai", "azure", "ollama", "llama-cpp", "vllm"}
# (role, getter method name) pairs — order is the dashboard/metrics order.
# Each getter name is looked up on ``LLMModel`` with ``getattr`` and called
# with no arguments by ``probe_system_models``.
ROLE_GETTERS = [
("embedding", "get_system_embedding_model"),
("chat", "get_system_chat_model"),
("vision", "get_system_vision_model"),
("reranker", "get_system_reranker_model"),
]
def probe_api(api, timeout: int = 5) -> tuple[bool, str]:
    """Check whether an ``LLMApi`` endpoint is responding.

    OpenAI-compatible APIs are probed with ``GET {base_url}/models`` and count
    as healthy only on HTTP 200. Other API types have no equivalent cheap
    listing endpoint, so any HTTP response to a ``HEAD`` on the base URL is
    taken as the liveness signal.

    Args:
        api: ``LLMApi`` instance (provides base_url, api_key, api_type).
        timeout: Per-request timeout in seconds.

    Returns:
        ``(ok, detail)`` — ok is True if the endpoint answered acceptably;
        detail is a short human-readable status (HTTP code, error, or "ok").
        A missing or blank ``base_url`` yields
        ``(False, "no base_url configured")`` without any network call.
    """
    base_url = (api.base_url or "").strip().rstrip("/")
    if not base_url:
        # Report the misconfiguration for this API only, instead of raising
        # and aborting the probe of every other role.
        return False, "no base_url configured"

    headers = {}
    if api.api_key:
        headers["Authorization"] = f"Bearer {api.api_key}"

    openai_compatible = api.api_type in _OPENAI_COMPATIBLE
    try:
        if openai_compatible:
            resp = requests.get(f"{base_url}/models", headers=headers, timeout=timeout)
        else:
            # bedrock / anthropic have no equivalent cheap unauthenticated
            # list; treat a reachable host as the liveness signal.
            resp = requests.head(base_url, headers=headers, timeout=timeout)
    except requests.Timeout:
        # Same wording for both probe styles so timeouts read consistently.
        return False, f"timeout after {timeout}s"
    except requests.RequestException as exc:
        return False, type(exc).__name__

    if not openai_compatible:
        return True, f"reachable (HTTP {resp.status_code})"
    if resp.status_code == 200:
        return True, "ok"
    return False, f"HTTP {resp.status_code}"
def probe_system_models(timeout: int = 5) -> list[dict]:
    """Run a reachability probe for every role listed in ``ROLE_GETTERS``.

    Args:
        timeout: Per-request timeout in seconds, passed to ``probe_api``.

    Returns:
        One dict per role, in ``ROLE_GETTERS`` order, with keys ``role``,
        ``configured``, ``model_name``, ``api_name``, ``base_url``, ``ok``,
        ``detail`` and ``latency_ms``. When a role's getter returns None the
        role is reported with ``configured`` False, ``ok`` None and no probe
        is made.
    """
    from llm_manager.models import LLMModel

    report: list[dict] = []
    for role, getter_name in ROLE_GETTERS:
        resolve = getattr(LLMModel, getter_name)
        model = resolve()

        if model is None:
            entry = {
                "role": role,
                "configured": False,
                "model_name": None,
                "api_name": None,
                "base_url": None,
                "ok": None,
                "detail": "not configured",
                "latency_ms": None,
            }
        else:
            api = model.api
            # Time only the probe call itself, in milliseconds (1 decimal).
            started = time.monotonic()
            ok, detail = probe_api(api, timeout=timeout)
            elapsed_ms = round((time.monotonic() - started) * 1000, 1)
            entry = {
                "role": role,
                "configured": True,
                "model_name": model.name,
                "api_name": api.name,
                "base_url": api.base_url,
                "ok": ok,
                "detail": detail,
                "latency_ms": elapsed_ms,
            }

        report.append(entry)
    return report

View File

@@ -0,0 +1,18 @@
{% comment %}
Reachability badge for a system-default model. Expects `h` = one entry from
the `model_health` dict (keys: configured, ok, detail, latency_ms). Renders
nothing when the role is absent from model_health (probe failed entirely).
Text-only badges to match the existing dashboard palette (no emoji per house
HTML rule).
{% endcomment %}
{% if h %}
{% if not h.configured %}
<span class="badge badge-ghost badge-sm ml-2" title="No system-default model set for this role">NOT CONFIGURED</span>
{% elif h.ok %}
<span class="badge badge-success badge-sm ml-2" title="{{ h.detail }}">REACHABLE</span>
{% if h.latency_ms is not None %}<span class="text-xs opacity-50 ml-1">{{ h.latency_ms }} ms</span>{% endif %}
{% else %}
<span class="badge badge-error badge-sm ml-2" title="Probe detail: {{ h.detail }}">NOT RESPONDING</span>
<span class="text-xs opacity-60 ml-1">{{ h.detail }}</span>
{% endif %}
{% endif %}

View File

@@ -28,6 +28,7 @@
{% if system_embedding_model.supports_multimodal %} {% if system_embedding_model.supports_multimodal %}
<span class="badge badge-accent badge-sm ml-1">Multimodal</span> <span class="badge badge-accent badge-sm ml-1">Multimodal</span>
{% endif %} {% endif %}
{% include "library/_model_health_badge.html" with h=model_health.embedding %}
{% else %} {% else %}
<div class="flex items-center gap-2"> <div class="flex items-center gap-2">
<span class="badge badge-error">NOT CONFIGURED</span> <span class="badge badge-error">NOT CONFIGURED</span>
@@ -41,6 +42,7 @@
<td> <td>
{% if system_chat_model %} {% if system_chat_model %}
<span class="font-semibold">{{ system_chat_model.api.name }}: {{ system_chat_model.name }}</span> <span class="font-semibold">{{ system_chat_model.api.name }}: {{ system_chat_model.name }}</span>
{% include "library/_model_health_badge.html" with h=model_health.chat %}
{% else %} {% else %}
<span class="text-sm opacity-60">Not configured — concept extraction disabled</span> <span class="text-sm opacity-60">Not configured — concept extraction disabled</span>
{% endif %} {% endif %}
@@ -51,6 +53,7 @@
<td> <td>
{% if system_reranker_model %} {% if system_reranker_model %}
<span class="font-semibold">{{ system_reranker_model.api.name }}: {{ system_reranker_model.name }}</span> <span class="font-semibold">{{ system_reranker_model.api.name }}: {{ system_reranker_model.name }}</span>
{% include "library/_model_health_badge.html" with h=model_health.reranker %}
{% else %} {% else %}
<span class="text-sm opacity-60">Not configured — Phase 3</span> <span class="text-sm opacity-60">Not configured — Phase 3</span>
{% endif %} {% endif %}
@@ -64,6 +67,7 @@
{% if system_vision_model.supports_vision %} {% if system_vision_model.supports_vision %}
<span class="badge badge-accent badge-sm ml-1">Vision</span> <span class="badge badge-accent badge-sm ml-1">Vision</span>
{% endif %} {% endif %}
{% include "library/_model_health_badge.html" with h=model_health.vision %}
{% else %} {% else %}
<span class="text-sm opacity-60">Not configured — image analysis disabled</span> <span class="text-sm opacity-60">Not configured — image analysis disabled</span>
{% endif %} {% endif %}

View File

@@ -729,6 +729,16 @@ def embedding_dashboard(request):
except Exception as exc: except Exception as exc:
logger.warning("Could not load system models: %s", exc) logger.warning("Could not load system models: %s", exc)
# Reachability of the system-default models (keyed by role for the
# template). A probe failure must never 500 the dashboard.
context["model_health"] = {}
try:
from library.services.model_health import probe_system_models
context["model_health"] = {r["role"]: r for r in probe_system_models()}
except Exception as exc:
logger.warning("Could not probe system model health: %s", exc)
# Get item status counts and node counts from Neo4j # Get item status counts and node counts from Neo4j
if neo4j_available(): if neo4j_available():
context["neo4j_available"] = True context["neo4j_available"] = True