Files
pallas/pallas/registry.py
Robert Helewka 0cea5ece3a feat: add /healthz and /metrics endpoints, replace print with logging
- Add /healthz endpoint returning LLM provider validation status
- Add /metrics endpoint serving Prometheus metrics via prometheus_client
- Replace all print() calls in health.py with proper logging module
- Remove _PREFIX variable in favor of structured logger context
2026-04-10 11:22:26 +00:00

239 lines
7.5 KiB
Python

"""
Agent Registry
Serves GET /.well-known/mcp/server.json for agent discovery.
Reads agent topology from agents.yaml and model capabilities from
fastagent.config.yaml in the working directory.
Also exposes standard health and observability endpoints:
GET /live — liveness probe (always 200 while process is up)
GET /ready — readiness probe (200 when all configured agents reachable)
GET /metrics — Prometheus metrics in text exposition format
"""
import asyncio
import logging
import os
from datetime import datetime, timezone
from pathlib import Path
import httpx
import yaml
from prometheus_client import CONTENT_TYPE_LATEST, CollectorRegistry, Gauge, generate_latest
from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import JSONResponse, PlainTextResponse, Response
from starlette.routing import Route
logger = logging.getLogger(__name__)
def _config_root() -> Path:
    """Locate the directory that holds agents.yaml and the fastagent config files.

    This is the current working directory of the process.
    """
    working_dir = Path.cwd()
    return working_dir
def _load_deployment_config() -> dict:
    """Load agents.yaml — single source of truth for deployment topology.

    The file name defaults to ``agents.yaml`` and can be overridden with the
    ``PALLAS_AGENTS_CONFIG`` environment variable; it is joined onto the
    directory returned by ``_config_root()``.

    Returns:
        The parsed top-level mapping, or an empty dict when the file does not
        exist, is empty, or does not contain a mapping at the top level.
    """
    config_path = _config_root() / os.environ.get("PALLAS_AGENTS_CONFIG", "agents.yaml")
    if not config_path.exists():
        return {}
    # Read as UTF-8 explicitly rather than relying on the locale's default
    # text encoding.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    if not isinstance(config, dict):
        # A list or scalar at the top level would break every later
        # ``config.get(...)``; treat it like a missing file but say so.
        logger.warning(
            "Ignoring %s: expected a mapping at the top level, got %s",
            config_path,
            type(config).__name__,
        )
        return {}
    return config
def _load_model_capabilities() -> dict:
    """Read model info and capabilities from the active fastagent.config.yaml.

    Returns:
        An empty dict when the file is missing, is not a mapping, or defines
        neither ``default_model`` nor ``model_capabilities``. Otherwise a dict
        with the keys ``model``, ``vision``, ``context_window`` and
        ``max_output_tokens``.
    """
    config_path = _config_root() / "fastagent.config.yaml"
    if not config_path.exists():
        return {}
    # Read as UTF-8 explicitly rather than relying on the locale's default
    # text encoding.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    if not isinstance(config, dict):
        return {}
    # ``or`` also covers keys that are present but null in the YAML, which
    # ``.get(key, default)`` would pass through as None.
    default_model = config.get("default_model") or ""
    capabilities = config.get("model_capabilities") or {}
    if not default_model and not capabilities:
        return {}
    # Keep only what follows the first "."; a value without a dot is returned
    # unchanged because split() then yields a single element.
    # NOTE(review): presumably this strips a provider prefix — a bare model id
    # that itself contains a dot would be truncated; confirm the expected format.
    model_name = default_model.split(".", 1)[-1]
    return {
        "model": model_name or None,
        "vision": capabilities.get("vision", False),
        "context_window": capabilities.get("context_window", None),
        "max_output_tokens": capabilities.get("max_output_tokens", None),
    }
def _build_registry(config: dict) -> dict:
    """Build the registry JSON from agents.yaml + fastagent.config.yaml.

    Args:
        config: Parsed deployment config. Reads the optional keys ``host``,
            ``namespace``, ``version`` and ``agents``; each agent entry must
            provide ``port`` and may provide ``title`` and ``description``.

    Returns:
        ``{"servers": [...]}`` with one entry per configured agent. The list
        is empty when no agents are configured.

    Raises:
        KeyError: If an agent entry has no ``port``.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    model_caps = _load_model_capabilities()
    host = config.get("host", "localhost")
    namespace = config.get("namespace", "")
    version = config.get("version", "1.0.0")
    # ``agents:`` with no value parses to None in YAML; treat it as "no agents"
    # instead of failing on ``None.items()``.
    agents = config.get("agents") or {}
    entries = []
    for name, agent in agents.items():
        # Build registry name: namespace/slug (e.g. ca.helu.mentor/jarvis)
        slug = name.replace("_", "-")
        registry_name = f"{namespace}/{slug}" if namespace else slug
        server_entry: dict = {
            "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
            "name": registry_name,
            "title": agent.get("title", name.title()),
            "description": agent.get("description", ""),
            "version": version,
            "remotes": [
                {
                    "type": "streamable-http",
                    "url": f"http://{host}:{agent['port']}/mcp",
                }
            ],
        }
        if model_caps:
            # The same capabilities dict is attached to every agent entry.
            server_entry["capabilities"] = model_caps
        entries.append(
            {
                "server": server_entry,
                "_meta": {
                    "io.modelcontextprotocol.registry/official": {
                        "status": "active",
                        "updatedAt": now,
                        "isLatest": True,
                    }
                },
            }
        )
    return {"servers": entries}
# ── Prometheus metrics ────────────────────────────────────────────────────────
# Dedicated collector registry; the /metrics handler renders this registry.
_metrics_registry = CollectorRegistry()
# Constant gauge, set to 1 once when the module is imported.
_pallas_up = Gauge(
    name="pallas_up",
    documentation="1 when the Pallas registry is running",
    registry=_metrics_registry,
)
_pallas_up.set(1)
def _init_agent_metrics(config: dict) -> None:
    """Publish one ``pallas_agent_info`` sample per configured agent.

    Meant to be called a single time at startup. When *config* lists no
    agents the gauge is not created at all.
    """
    configured = config.get("agents", {})
    if configured:
        info_gauge = Gauge(
            "pallas_agent_info",
            "Static info about configured Pallas agents",
            labelnames=["agent", "port"],
            registry=_metrics_registry,
        )
        for agent_name, spec in configured.items():
            # Label values are strings, so the port is stringified.
            info_gauge.labels(agent=agent_name, port=str(spec["port"])).set(1)
# ── Route handlers ────────────────────────────────────────────────────────────
# Load the deployment topology once, at import time; the handlers below read
# this module-level dict instead of re-reading agents.yaml per request.
_deployment_config = _load_deployment_config()
# Register the per-agent info gauge for the agents found in that config.
_init_agent_metrics(_deployment_config)
async def server_json(request: Request) -> JSONResponse:
    """Serve the registry document built from the loaded deployment config."""
    registry_doc = _build_registry(_deployment_config)
    return JSONResponse(registry_doc)
async def live(request: Request) -> JSONResponse:
    """Liveness probe: answer ``alive`` unconditionally, without checking agents."""
    payload = {"status": "alive"}
    return JSONResponse(payload)
async def ready(request: Request) -> Response:
    """Readiness probe — 200 when all configured agents are reachable.

    Every configured agent is probed concurrently through ``_probe_agent``
    using one shared HTTP client with a 2-second timeout.

    Returns:
        ``{"status": "ready"}`` with status 200 when no agents are configured
        or every probe succeeds; otherwise status 503 with
        ``{"status": "unavailable", "missing": [...]}`` listing the agents
        whose probe did not succeed.

    Raises:
        KeyError: If an agent entry has no ``port``.
    """
    agents = _deployment_config.get("agents", {})
    if not agents:
        return JSONResponse({"status": "ready"})
    async with httpx.AsyncClient(timeout=2.0) as client:
        checks = await asyncio.gather(
            *(_probe_agent(client, name, agent["port"]) for name, agent in agents.items()),
            return_exceptions=True,
        )
    # Fail closed: with return_exceptions=True a result slot can hold any
    # raised object, including a BaseException that is not an Exception
    # (e.g. CancelledError), so only an explicit True counts as reachable.
    missing = [name for name, result in zip(agents, checks) if result is not True]
    if missing:
        return JSONResponse(
            {"status": "unavailable", "missing": missing},
            status_code=503,
        )
    return JSONResponse({"status": "ready"})
async def _probe_agent(client: httpx.AsyncClient, name: str, port: int) -> bool:
    """Check whether a GET to the agent's local ``/mcp`` URL completes.

    Any response counts as reachable — the status code is not inspected.
    Any exception raised by the request yields ``False``. *name* is accepted
    but not used here.
    """
    try:
        await client.get(f"http://127.0.0.1:{port}/mcp")
    except Exception:
        reachable = False
    else:
        reachable = True
    return reachable
async def metrics(request: Request) -> Response:
    """Render the module's collector registry in Prometheus text exposition format."""
    return Response(
        content=generate_latest(_metrics_registry),
        media_type=CONTENT_TYPE_LATEST,
    )
def _json_bytes(obj: dict) -> bytes:
    """Serialize *obj* to JSON text and return it encoded as UTF-8 bytes."""
    import json

    serialized = json.dumps(obj)
    return serialized.encode("utf-8")
# ── Starlette app ─────────────────────────────────────────────────────────────
# ASGI application: the discovery document plus the health and metrics endpoints.
app = Starlette(
    routes=[
        Route("/.well-known/mcp/server.json", endpoint=server_json),
        Route("/live", endpoint=live),
        Route("/ready", endpoint=ready),
        Route("/metrics", endpoint=metrics),
    ],
)
async def run_registry(
    host: str = "0.0.0.0",
    port: int = 24200,
) -> None:
    """Run the registry server.

    Args:
        host: Interface address handed to uvicorn.
        port: TCP port handed to uvicorn.

    Returns once ``uvicorn.Server.serve()`` completes.
    """
    import uvicorn

    deploy_name = _deployment_config.get("name", "pallas")
    # ``agents:`` with no value parses to None in YAML; count that as zero
    # agents instead of letting ``len(None)`` raise before the server starts.
    agent_count = len(_deployment_config.get("agents") or {})
    logger.info(
        "Registry started: %s, port %d, %d agent(s)",
        deploy_name,
        port,
        agent_count,
    )
    config = uvicorn.Config(app, host=host, port=port, log_level="warning")
    server = uvicorn.Server(config)
    await server.serve()