# pallas/pallas/registry.py

"""
Agent Registry

Serves GET /.well-known/mcp/server.json for agent discovery.
Reads agent topology from agents.yaml (or the file named by the
PALLAS_AGENTS_CONFIG environment variable) and model capabilities from
fastagent.config.yaml, both resolved against the working directory.

Also exposes standard health and observability endpoints:
    GET /live     — liveness probe (always 200 while process is up)
    GET /ready    — readiness probe (200 when all configured agents reachable)
    GET /metrics  — Prometheus metrics in text exposition format
"""

import asyncio
import logging
import os
from datetime import datetime, timezone
from pathlib import Path

import httpx
import yaml
from prometheus_client import CONTENT_TYPE_LATEST, CollectorRegistry, Gauge, generate_latest
from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import JSONResponse, PlainTextResponse, Response
from starlette.routing import Route

logger = logging.getLogger(__name__)


def _config_root() -> Path:
    """Return the working directory where agents.yaml and fastagent configs live."""
    return Path.cwd()


def _load_deployment_config() -> dict:
    """Load agents.yaml — single source of truth for deployment topology.

    The file name comes from the ``PALLAS_AGENTS_CONFIG`` environment variable
    (default ``agents.yaml``) and is joined onto the working directory; an
    absolute path in the variable therefore replaces the working directory.

    Returns:
        The parsed top-level mapping, or ``{}`` when the file is missing,
        empty, or does not contain a mapping at the top level.
    """
    config_path = _config_root() / os.environ.get("PALLAS_AGENTS_CONFIG", "agents.yaml")
    if not config_path.exists():
        return {}

    # Pin the encoding so parsing does not depend on the host locale.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}

    # Callers use ``.get()`` on the result; a top-level list or scalar would
    # otherwise surface as an AttributeError far from the actual cause.
    if not isinstance(config, dict):
        logger.warning(
            "Ignoring %s: expected a mapping at the top level, got %s",
            config_path,
            type(config).__name__,
        )
        return {}
    return config


def _load_model_capabilities() -> dict:
    """Read model info and capabilities from the active fastagent.config.yaml.

    Returns:
        ``{}`` when the file is missing, is not a mapping, or defines neither
        ``default_model`` nor ``model_capabilities``. Otherwise a dict with
        the keys ``model``, ``vision``, ``context_window`` and
        ``max_output_tokens``.
    """
    config_path = _config_root() / "fastagent.config.yaml"
    if not config_path.exists():
        return {}

    # Pin the encoding so parsing does not depend on the host locale.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    if not isinstance(config, dict):
        return {}

    # A key written with no value parses to None, and ``.get(key, default)``
    # would hand that None back, so fall back with ``or`` instead.
    default_model = str(config.get("default_model") or "")
    capabilities = config.get("model_capabilities") or {}
    if not isinstance(capabilities, dict):
        capabilities = {}

    if not default_model and not capabilities:
        return {}

    # Drop everything up to and including the first "." (the leading segment
    # is treated as a provider prefix); a value without "." is kept whole.
    # NOTE(review): a prefix-less name that itself contains "." would be
    # truncated by this rule — confirm such values never occur.
    model_name = default_model.split(".", 1)[-1]

    return {
        "model": model_name or None,
        "vision": capabilities.get("vision", False),
        "context_window": capabilities.get("context_window", None),
        "max_output_tokens": capabilities.get("max_output_tokens", None),
    }


def _build_registry(config: dict) -> dict:
    """Build the registry JSON from agents.yaml + fastagent.config.yaml."""
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    model_caps = _load_model_capabilities()

    host = config.get("host", "localhost")
    namespace = config.get("namespace", "")
    version = config.get("version", "1.0.0")
    agents = config.get("agents", {})

    entries = []
    for name, agent in agents.items():
        # Build registry name: namespace/slug (e.g. ca.helu.mentor/jarvis)
        slug = name.replace("_", "-")
        registry_name = f"{namespace}/{slug}" if namespace else slug

        server_entry: dict = {
            "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
            "name": registry_name,
            "title": agent.get("title", name.title()),
            "description": agent.get("description", ""),
            "version": version,
            "remotes": [
                {
                    "type": "streamable-http",
                    "url": f"http://{host}:{agent['port']}/mcp",
                }
            ],
        }
        if model_caps:
            server_entry["capabilities"] = model_caps

        entries.append(
            {
                "server": server_entry,
                "_meta": {
                    "io.modelcontextprotocol.registry/official": {
                        "status": "active",
                        "updatedAt": now,
                        "isLatest": True,
                    }
                },
            }
        )

    return {"servers": entries}


# ── Prometheus metrics ────────────────────────────────────────────────────────

# Dedicated collector registry: every gauge in this module is registered on
# it, and it is the registry the /metrics handler serialises.
_metrics_registry = CollectorRegistry()
# Constant "process is up" gauge; set to 1 once at import and never changed
# anywhere in this module.
_pallas_up = Gauge(
    "pallas_up",
    "1 when the Pallas registry is running",
    registry=_metrics_registry,
)
_pallas_up.set(1)


def _init_agent_metrics(config: dict) -> None:
    """Register per-agent info gauges once at startup."""
    agents = config.get("agents", {})
    if not agents:
        return

    agent_info = Gauge(
        "pallas_agent_info",
        "Static info about configured Pallas agents",
        labelnames=["agent", "port"],
        registry=_metrics_registry,
    )
    for name, agent in agents.items():
        agent_info.labels(agent=name, port=str(agent["port"])).set(1)


# ── Route handlers ────────────────────────────────────────────────────────────

# Read the deployment config once, at import time. The handlers below use
# this module-level snapshot; nothing in this module reloads it afterwards.
_deployment_config = _load_deployment_config()
# Register the per-agent info gauges for the agents in that snapshot.
_init_agent_metrics(_deployment_config)


async def server_json(request: Request) -> JSONResponse:
    """Serve the agent discovery document as JSON.

    The document is rebuilt on every request from the deployment config
    loaded at import time.
    """
    registry_doc = _build_registry(_deployment_config)
    return JSONResponse(registry_doc)


async def live(request: Request) -> JSONResponse:
    """Liveness probe: answers with a fixed "alive" payload and no other checks."""
    payload = {"status": "alive"}
    return JSONResponse(payload)


async def ready(request: Request) -> Response:
    """Readiness probe — 200 when all configured agents are reachable.

    Every configured agent is probed concurrently through one shared HTTP
    client with a 2 second timeout.

    Returns:
        200 with ``{"status": "ready"}`` when no agents are configured or
        every probe succeeds; otherwise 503 with ``status`` set to
        ``"unavailable"`` and the failing agent names listed under
        ``missing``.
    """
    agents = _deployment_config.get("agents", {})
    if not agents:
        return JSONResponse({"status": "ready"})

    async with httpx.AsyncClient(timeout=2.0) as client:
        checks = await asyncio.gather(
            *(_probe_agent(client, name, agent["port"]) for name, agent in agents.items()),
            return_exceptions=True,
        )

    # Only an explicit True counts as reachable. With return_exceptions=True a
    # result may also be an exception object, including CancelledError, which
    # is a BaseException and would slip past an ``isinstance(..., Exception)``
    # test and be reported as healthy.
    missing = [name for name, result in zip(agents.keys(), checks) if result is not True]

    if missing:
        return Response(
            content=_json_bytes({"status": "unavailable", "missing": missing}),
            status_code=503,
            media_type="application/json",
        )
    return JSONResponse({"status": "ready"})


async def _probe_agent(client: httpx.AsyncClient, name: str, port: int) -> bool:
    """Return True if the agent's MCP port is accepting connections.

    Issues ``GET http://127.0.0.1:<port>/mcp``. The response itself is not
    inspected: any completed request counts as reachable.

    NOTE(review): the probe always targets 127.0.0.1, whereas the registry
    advertises the configured ``host`` — confirm agents are always co-located
    with this process.

    Args:
        client: Shared HTTP client used to issue the request.
        name: Agent name; used only to identify the agent in the debug log.
        port: Local TCP port of the agent.

    Returns:
        True when the request completes, False when it raises.
    """
    try:
        await client.get(f"http://127.0.0.1:{port}/mcp")
    except Exception as exc:
        # Deliberately best-effort: a failed probe means "not ready", never a
        # crash. Record the reason so an unready agent can be diagnosed.
        logger.debug("Readiness probe failed for agent %s on port %s: %r", name, port, exc)
        return False
    return True


async def metrics(request: Request) -> Response:
    """Expose the module's metrics registry in Prometheus text exposition format."""
    return Response(
        content=generate_latest(_metrics_registry),
        media_type=CONTENT_TYPE_LATEST,
    )


def _json_bytes(obj: dict) -> bytes:
    import json
    return json.dumps(obj).encode()


# ── Starlette app ─────────────────────────────────────────────────────────────

# ASGI application object; run_registry() hands it to uvicorn.
app = Starlette(
    routes=[
        # Agent discovery document.
        Route("/.well-known/mcp/server.json", server_json),
        # Health probes.
        Route("/live", live),
        Route("/ready", ready),
        # Prometheus scrape endpoint.
        Route("/metrics", metrics),
    ],
)


async def run_registry(
    host: str = "0.0.0.0",
    port: int = 24200,
) -> None:
    """Run the registry server.

    Logs a one-line summary and then serves ``app`` with uvicorn until the
    server exits.

    Args:
        host: Interface to bind to.
        port: TCP port to listen on.
    """
    import uvicorn

    deploy_name = _deployment_config.get("name", "pallas")
    # ``agents:`` with no value parses to None; count it as zero agents
    # instead of failing on ``len(None)`` before the server starts.
    agent_count = len(_deployment_config.get("agents") or {})
    # Emitted before uvicorn binds the socket, i.e. just ahead of startup.
    logger.info(
        "Registry started: %s, port %d, %d agent(s)",
        deploy_name,
        port,
        agent_count,
    )

    config = uvicorn.Config(app, host=host, port=port, log_level="warning")
    server = uvicorn.Server(config)
    await server.serve()