# mnemosyne/mnemosyne/library/services/model_health.py

"""
System-default model reachability probes.

Provides a cheap, bounded liveness check for the four system-default models
(embedding, chat, vision, reranker) so the embedding dashboard and the
scrape-time Prometheus collector can surface "model not responding" without
running an ingest.

The probe deliberately hits ``GET {base_url}/models`` as its primary check:
on an OpenAI-compatible router (e.g. the llama-router) this answers instantly
without loading a model, so repeated probes never burn GPU time. This mirrors
the GPU-avoidance principle in ``mcp_server/tools/health.py``.
"""

import logging
import time
from typing import Optional

import requests

logger = logging.getLogger(__name__)

# ``api_type`` values treated as OpenAI-compatible, i.e. probed through a
# ``/models`` listing rather than a bare reachability check.
_OPENAI_COMPATIBLE = {
    "azure",
    "llama-cpp",
    "ollama",
    "openai",
    "vllm",
}

# Each role paired with the name of the getter method that returns its
# system-default model. Insertion order is preserved and is the order used by
# the dashboard and the metrics.
ROLE_GETTERS = list(
    {
        "embedding": "get_system_embedding_model",
        "chat": "get_system_chat_model",
        "vision": "get_system_vision_model",
        "reranker": "get_system_reranker_model",
    }.items()
)


def probe_api(api, timeout: int = 5) -> tuple[bool, str]:
    """Check whether an ``LLMApi`` endpoint is responding.

    For an OpenAI-compatible ``api_type`` the probe issues
    ``GET {base_url}/models`` and only an HTTP 200 counts as healthy. For any
    other ``api_type`` it issues ``HEAD {base_url}`` and treats any HTTP
    response at all as "reachable", whatever the status code.

    Args:
        api: ``LLMApi`` instance (provides base_url, api_key, api_type).
        timeout: Per-request timeout in seconds.

    Returns:
        ``(ok, detail)`` — ok is True if the endpoint answered acceptably;
        detail is a short human-readable status (HTTP code, error, or "ok").
        A missing or blank ``base_url`` yields
        ``(False, "no base_url configured")`` without any network access.
    """
    # A None/blank base_url must be reported as a failed probe rather than
    # raising: an exception here would abort the probe of every other role.
    base_url = (api.base_url or "").rstrip("/")
    if not base_url:
        return False, "no base_url configured"

    headers = {}
    if api.api_key:
        headers["Authorization"] = f"Bearer {api.api_key}"

    openai_compatible = api.api_type in _OPENAI_COMPATIBLE

    # Both probe flavours share one try block so that a timeout is described
    # the same way ("timeout after Ns") regardless of which request was sent.
    try:
        if openai_compatible:
            resp = requests.get(
                f"{base_url}/models", headers=headers, timeout=timeout
            )
        else:
            # bedrock / anthropic have no equivalent cheap unauthenticated
            # list; treat a reachable host as the liveness signal via a HEAD
            # on base_url.
            resp = requests.head(base_url, headers=headers, timeout=timeout)
    except requests.Timeout:
        return False, f"timeout after {timeout}s"
    except requests.RequestException as exc:
        return False, type(exc).__name__

    if not openai_compatible:
        return True, f"reachable (HTTP {resp.status_code})"
    if resp.status_code == 200:
        return True, "ok"
    return False, f"HTTP {resp.status_code}"


def probe_system_models(timeout: int = 5) -> list[dict]:
    """Run a reachability probe for every role listed in ``ROLE_GETTERS``.

    Each role's getter is looked up by name on ``LLMModel`` and called. When
    it returns a model, that model's API is probed with ``probe_api`` and the
    wall-clock duration of the probe is recorded.

    Args:
        timeout: Per-request timeout in seconds, forwarded to ``probe_api``.

    Returns:
        One dict per role, in ``ROLE_GETTERS`` order, with keys ``role``,
        ``configured``, ``model_name``, ``api_name``, ``base_url``, ``ok``,
        ``detail``, ``latency_ms``. When a role has no model, ``configured``
        is False, ``detail`` is ``"not configured"``, no probe is made and
        the remaining values (including ``ok``) are None.
    """
    # Imported inside the function, as in the original layout of this module.
    from llm_manager.models import LLMModel

    reports: list[dict] = []
    for role, getter_name in ROLE_GETTERS:
        fetch_model = getattr(LLMModel, getter_name)
        system_model = fetch_model()

        if system_model is not None:
            endpoint = system_model.api
            # Monotonic clock: the measurement is unaffected by wall-clock
            # adjustments made while the probe is in flight.
            began = time.monotonic()
            ok, detail = probe_api(endpoint, timeout=timeout)
            elapsed_ms = round((time.monotonic() - began) * 1000, 1)
            entry = {
                "role": role,
                "configured": True,
                "model_name": system_model.name,
                "api_name": endpoint.name,
                "base_url": endpoint.base_url,
                "ok": ok,
                "detail": detail,
                "latency_ms": elapsed_ms,
            }
        else:
            # Nothing to probe for this role; emit a placeholder row.
            entry = {
                "role": role,
                "configured": False,
                "model_name": None,
                "api_name": None,
                "base_url": None,
                "ok": None,
                "detail": "not configured",
                "latency_ms": None,
            }

        reports.append(entry)

    return reports