Add a Prometheus custom collector that probes the four system-default models (chat, vision, embedding, reranker) at /metrics scrape time and emits up/down, configured, and probe-latency gauges. This complements the ingest-pipeline counters in the Celery worker, which only move during active ingests and cannot signal model outages on an idle queue. - New `library/health_collector.py` registers a custom collector with a 55s in-process cache to avoid hammering GPU endpoints on rapid scrapes or across multiple gunicorn workers. - New `library/services/model_health.py` centralises the probe logic, resolving system-default models via SystemSettings and dispatching to chat/embedding/rerank endpoints with a short timeout. - Register the collector only in the web process (gunicorn/runserver) via `LibraryConfig.ready`, excluding Celery, pytest, and management commands to prevent duplicate registration and stray probes. - Add unit tests covering the collector cache, metric shape, and per-role probe dispatch.
120 lines
4.0 KiB
Python
120 lines
4.0 KiB
Python
"""
System-default model reachability probes.

Provides a cheap, bounded liveness check for the four system-default models
(embedding, chat, vision, reranker) so the embedding dashboard and the
scrape-time Prometheus collector can surface "model not responding" without
running an ingest.

The probe deliberately hits ``GET {base_url}/models`` as its primary check:
on an OpenAI-compatible router (e.g. the llama-router) this answers instantly
without loading a model, so repeated probes never burn GPU time. This mirrors
the GPU-avoidance principle in ``mcp_server/tools/health.py``.
"""
|
|
|
|
import logging
|
|
import time
|
|
from typing import Optional
|
|
|
|
import requests
|
|
|
|
logger = logging.getLogger(__name__)

# ``api_type`` values treated as OpenAI-compatible, i.e. probed through the
# ``/models`` listing endpoint; every other api_type gets a plain HEAD probe.
_OPENAI_COMPATIBLE = {
    "openai",
    "azure",
    "ollama",
    "llama-cpp",
    "vllm",
}

# One ``(role, getter method name)`` pair per system-default model. The getter
# name is looked up on ``LLMModel`` at probe time; the list order is the order
# in which results are reported (dashboard / metrics order).
ROLE_GETTERS = [
    ("embedding", "get_system_embedding_model"),
    ("chat", "get_system_chat_model"),
    ("vision", "get_system_vision_model"),
    ("reranker", "get_system_reranker_model"),
]
|
|
|
|
|
|
def probe_api(api, timeout: int = 5) -> tuple[bool, str]:
    """Check whether an ``LLMApi`` endpoint is responding.

    OpenAI-compatible api types are probed with ``GET {base_url}/models`` and
    count as up only on HTTP 200. Every other api type is probed with a
    ``HEAD`` on ``base_url`` and counts as up as soon as the host answers,
    whatever the status code.

    Args:
        api: ``LLMApi`` instance (provides base_url, api_key, api_type).
        timeout: Per-request timeout in seconds.

    Returns:
        ``(ok, detail)`` — ok is True if the endpoint answered acceptably;
        detail is a short human-readable status (HTTP code, error, or "ok").
        A missing or blank ``base_url`` yields
        ``(False, "no base_url configured")`` without any network call.
    """
    # Tolerate None / blank URLs: report the endpoint as down instead of
    # raising, so one misconfigured API cannot break the caller's whole run.
    base_url = (api.base_url or "").strip().rstrip("/")
    if not base_url:
        return False, "no base_url configured"

    headers = {}
    if api.api_key:
        headers["Authorization"] = f"Bearer {api.api_key}"

    openai_compatible = api.api_type in _OPENAI_COMPATIBLE
    try:
        if openai_compatible:
            resp = requests.get(
                f"{base_url}/models", headers=headers, timeout=timeout
            )
        else:
            # bedrock / anthropic have no equivalent cheap unauthenticated
            # list; a reachable host is the liveness signal.
            resp = requests.head(base_url, headers=headers, timeout=timeout)
    except requests.Timeout:
        # Must precede RequestException: Timeout is a subclass of it.
        return False, f"timeout after {timeout}s"
    except requests.RequestException as exc:
        return False, type(exc).__name__

    if not openai_compatible:
        return True, f"reachable (HTTP {resp.status_code})"
    if resp.status_code == 200:
        return True, "ok"
    return False, f"HTTP {resp.status_code}"
|
|
|
|
|
|
def probe_system_models(timeout: int = 5) -> list[dict]:
    """Probe every system-default model listed in ``ROLE_GETTERS``.

    Each role's model is resolved by calling the named getter on ``LLMModel``;
    when a model is returned, its API is probed with ``probe_api`` and the
    call is timed.

    Args:
        timeout: Per-request timeout in seconds, forwarded to ``probe_api``.

    Returns:
        One dict per role, in ``ROLE_GETTERS`` order, with keys ``role``,
        ``configured``, ``model_name``, ``api_name``, ``base_url``, ``ok``,
        ``detail`` and ``latency_ms``. When the getter returns None the role
        is reported with ``configured`` False, ``detail`` "not configured"
        and None for every other field (no probe is made).
    """
    # Imported inside the function, as in the original module.
    from llm_manager.models import LLMModel

    report: list[dict] = []
    for role, getter_name in ROLE_GETTERS:
        resolve = getattr(LLMModel, getter_name)
        system_model = resolve()

        if system_model is None:
            entry = {
                "role": role,
                "configured": False,
                "model_name": None,
                "api_name": None,
                "base_url": None,
                "ok": None,
                "detail": "not configured",
                "latency_ms": None,
            }
        else:
            endpoint = system_model.api
            # Monotonic clock: the measured latency cannot be skewed by
            # wall-clock adjustments.
            began = time.monotonic()
            reachable, status = probe_api(endpoint, timeout=timeout)
            elapsed_ms = round((time.monotonic() - began) * 1000, 1)
            entry = {
                "role": role,
                "configured": True,
                "model_name": system_model.name,
                "api_name": endpoint.name,
                "base_url": endpoint.base_url,
                "ok": reachable,
                "detail": status,
                "latency_ms": elapsed_ms,
            }

        report.append(entry)

    return report
|