- Add /healthz endpoint returning LLM provider validation status
- Add /metrics endpoint serving Prometheus metrics via prometheus_client
- Replace all print() calls in health.py with proper logging module
- Remove _PREFIX variable in favor of structured logger context
239 lines
7.5 KiB
Python
239 lines
7.5 KiB
Python
"""
|
|
Agent Registry

Serves GET /.well-known/mcp/server.json for agent discovery.
Reads agent topology from agents.yaml and model capabilities from
fastagent.config.yaml in the working directory.

Also exposes standard health and observability endpoints:
  GET /live    — liveness probe (always 200 while process is up)
  GET /ready   — readiness probe (200 when all configured agents are reachable)
  GET /metrics — Prometheus metrics in text exposition format
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import yaml
|
|
from prometheus_client import CONTENT_TYPE_LATEST, CollectorRegistry, Gauge, generate_latest
|
|
from starlette.applications import Starlette
|
|
from starlette.requests import Request
|
|
from starlette.responses import JSONResponse, PlainTextResponse, Response
|
|
from starlette.routing import Route
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _config_root() -> Path:
|
|
"""Return the working directory where agents.yaml and fastagent configs live."""
|
|
return Path.cwd()
|
|
|
|
|
|
def _load_deployment_config() -> dict:
    """Load agents.yaml — single source of truth for deployment topology.

    The file name comes from the ``PALLAS_AGENTS_CONFIG`` environment variable
    (default ``agents.yaml``) and is joined onto ``_config_root()``; an
    absolute path in the variable therefore replaces the root entirely.

    Returns:
        The parsed top-level mapping, or ``{}`` when the file is missing,
        empty, or does not hold a mapping at its top level.
    """
    config_path = _config_root() / os.environ.get("PALLAS_AGENTS_CONFIG", "agents.yaml")
    if not config_path.exists():
        return {}

    # Decode explicitly as UTF-8 so parsing does not depend on the host locale.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}

    if not isinstance(config, dict):
        # A top-level list or scalar would break every ``.get`` lookup that
        # consumers of this config perform, so treat it like an empty config.
        logger.warning(
            "Ignoring %s: expected a mapping at the top level, got %s",
            config_path,
            type(config).__name__,
        )
        return {}

    return config
|
|
|
|
|
|
def _load_model_capabilities() -> dict:
    """Read model info and capabilities from the active fastagent.config.yaml.

    Returns:
        ``{}`` when the file is missing or sets neither ``default_model`` nor
        ``model_capabilities``. Otherwise a dict with the keys ``model``,
        ``vision``, ``context_window`` and ``max_output_tokens``.
    """
    config_path = _config_root() / "fastagent.config.yaml"
    if not config_path.exists():
        return {}

    # Decode explicitly as UTF-8 so parsing does not depend on the host locale.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}

    # ``or`` (rather than a ``.get`` default) also covers keys that are present
    # but left empty in the YAML: those parse as None, which would otherwise
    # fail on the ``in`` test / ``.get`` calls below.
    default_model = config.get("default_model") or ""
    capabilities = config.get("model_capabilities") or {}

    if not default_model and not capabilities:
        return {}

    # Keep only what follows the first "."; a name without a "." is unchanged
    # because split() then returns the whole string as its single element.
    model_name = default_model.split(".", 1)[-1]

    return {
        "model": model_name or None,
        "vision": capabilities.get("vision", False),
        "context_window": capabilities.get("context_window"),
        "max_output_tokens": capabilities.get("max_output_tokens"),
    }
|
|
|
|
|
|
def _build_registry(config: dict) -> dict:
    """Build the registry JSON from agents.yaml + fastagent.config.yaml.

    Args:
        config: Parsed deployment config. The optional keys ``host``,
            ``namespace``, ``version`` and ``agents`` are read; each agent
            entry must provide ``port`` and may provide ``title`` and
            ``description``.

    Returns:
        ``{"servers": [...]}`` with one entry per configured agent. Each entry
        carries the server description under ``"server"`` and registry
        metadata under ``"_meta"``.

    Raises:
        KeyError: If an agent entry has no ``port``.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    model_caps = _load_model_capabilities()

    # ``or`` fallbacks also cover keys that exist but are empty in the YAML
    # (parsed as None), which would otherwise crash on ``.items()`` or leak
    # "None" into the generated URLs.
    host = config.get("host") or "localhost"
    namespace = config.get("namespace", "")
    version = config.get("version") or "1.0.0"
    agents = config.get("agents") or {}

    entries = []
    for name, agent in agents.items():
        # Build registry name: namespace/slug (e.g. ca.helu.mentor/jarvis)
        slug = name.replace("_", "-")
        registry_name = f"{namespace}/{slug}" if namespace else slug

        server_entry: dict = {
            "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
            "name": registry_name,
            "title": agent.get("title", name.title()),
            "description": agent.get("description", ""),
            "version": version,
            "remotes": [
                {
                    "type": "streamable-http",
                    "url": f"http://{host}:{agent['port']}/mcp",
                }
            ],
        }
        # Capabilities are shared by all agents and only attached when the
        # fastagent config actually declares a model or capabilities.
        if model_caps:
            server_entry["capabilities"] = model_caps

        entries.append(
            {
                "server": server_entry,
                "_meta": {
                    "io.modelcontextprotocol.registry/official": {
                        "status": "active",
                        "updatedAt": now,
                        "isLatest": True,
                    }
                },
            }
        )

    return {"servers": entries}
|
|
|
|
|
|
# ── Prometheus metrics ────────────────────────────────────────────────────────
|
|
|
|
# Dedicated collector registry for this module's metrics (kept separate from
# prometheus_client's process-wide default registry).
_metrics_registry = CollectorRegistry()

# Constant gauge, pinned to 1 as soon as the module is imported.
_pallas_up = Gauge(
    name="pallas_up",
    documentation="1 when the Pallas registry is running",
    registry=_metrics_registry,
)
_pallas_up.set(1)
|
|
|
|
|
|
def _init_agent_metrics(config: dict) -> None:
    """Publish one ``pallas_agent_info`` sample per configured agent.

    Creates the ``pallas_agent_info`` gauge (labels ``agent`` and ``port``) in
    the module's metrics registry and sets it to 1 for every entry under
    ``config["agents"]``. Does nothing when no agents are configured.
    Intended to be called once at startup.

    Args:
        config: Parsed deployment config; every agent entry must have ``port``.
    """
    configured = config.get("agents", {})
    if configured:
        info_gauge = Gauge(
            "pallas_agent_info",
            "Static info about configured Pallas agents",
            labelnames=["agent", "port"],
            registry=_metrics_registry,
        )
        for agent_name, agent_cfg in configured.items():
            # Label values are strings, so the port is stringified explicitly.
            port_label = str(agent_cfg["port"])
            info_gauge.labels(agent=agent_name, port=port_label).set(1)
|
|
|
|
|
|
# ── Route handlers ────────────────────────────────────────────────────────────
|
|
|
|
# Deployment topology is read once, at import time, and shared by every
# handler below; the per-agent info gauges are registered from that snapshot.
_deployment_config = _load_deployment_config()
_init_agent_metrics(_deployment_config)
|
|
|
|
|
|
async def server_json(request: Request) -> JSONResponse:
    """Serve the agent registry document built from the loaded deployment config."""
    registry_doc = _build_registry(_deployment_config)
    return JSONResponse(registry_doc)
|
|
|
|
|
|
async def live(request: Request) -> JSONResponse:
    """Liveness probe: unconditionally answers 200 with ``{"status": "alive"}``."""
    payload = {"status": "alive"}
    return JSONResponse(payload)
|
|
|
|
|
|
async def ready(request: Request) -> Response:
    """Readiness probe — 200 when all configured agents are reachable.

    Every configured agent is probed concurrently through one shared HTTP
    client with a 2-second timeout.

    Returns:
        200 ``{"status": "ready"}`` when no agents are configured or every
        probe succeeded; otherwise 503
        ``{"status": "unavailable", "missing": [<agent names>]}``.
    """
    agents = _deployment_config.get("agents") or {}
    if not agents:
        return JSONResponse({"status": "ready"})

    async with httpx.AsyncClient(timeout=2.0) as client:
        checks = await asyncio.gather(
            *(_probe_agent(client, name, agent["port"]) for name, agent in agents.items()),
            return_exceptions=True,
        )

    # gather() returns results in argument order, so they line up with the
    # dict's keys. Anything other than an explicit True counts as unreachable:
    # False, an Exception, or a BaseException such as CancelledError (which an
    # ``isinstance(result, Exception)`` test would let through as healthy).
    missing = [name for name, result in zip(agents, checks) if result is not True]

    if missing:
        return JSONResponse(
            {"status": "unavailable", "missing": missing},
            status_code=503,
        )
    return JSONResponse({"status": "ready"})
|
|
|
|
|
|
async def _probe_agent(client: httpx.AsyncClient, name: str, port: int) -> bool:
    """Return True if the agent's MCP port is accepting connections.

    Sends a GET to ``http://127.0.0.1:<port>/mcp``. The response itself is not
    inspected, so any HTTP answer counts as reachable; only a failed request
    yields False.

    Args:
        client: Shared HTTP client used to issue the request.
        name: Agent name, used to identify the agent in the failure log.
        port: Local port the agent is expected to listen on.
    """
    try:
        await client.get(f"http://127.0.0.1:{port}/mcp")
    except Exception:
        # Deliberately broad: this is a best-effort probe and any failure just
        # means "not ready". Record the cause instead of dropping it silently.
        logger.debug(
            "Readiness probe failed for agent %r on port %s",
            name,
            port,
            exc_info=True,
        )
        return False
    return True
|
|
|
|
|
|
async def metrics(request: Request) -> Response:
    """Render the module's metrics registry in Prometheus text exposition format."""
    exposition = generate_latest(_metrics_registry)
    return Response(
        content=exposition,
        media_type=CONTENT_TYPE_LATEST,
    )
|
|
|
|
|
|
def _json_bytes(obj: dict) -> bytes:
|
|
import json
|
|
return json.dumps(obj).encode()
|
|
|
|
|
|
# ── Starlette app ─────────────────────────────────────────────────────────────
|
|
|
|
# ASGI application: one route per public endpoint, in (path, handler) pairs.
app = Starlette(
    routes=[
        Route(path, endpoint)
        for path, endpoint in (
            ("/.well-known/mcp/server.json", server_json),
            ("/live", live),
            ("/ready", ready),
            ("/metrics", metrics),
        )
    ],
)
|
|
|
|
|
|
async def run_registry(
    host: str = "0.0.0.0",
    port: int = 24200,
) -> None:
    """Run the registry server.

    Logs a startup line with the deployment name, port and number of
    configured agents, then serves ``app`` with uvicorn until the server
    exits.

    Args:
        host: Interface to bind to.
        port: TCP port to listen on.
    """
    import uvicorn

    deploy_name = _deployment_config.get("name", "pallas")
    # ``or {}`` covers an ``agents:`` key that is present but empty in the
    # YAML (parsed as None), on which len() would raise TypeError.
    agent_count = len(_deployment_config.get("agents") or {})
    logger.info(
        "Registry started: %s, port %d, %d agent(s)",
        deploy_name,
        port,
        agent_count,
    )

    config = uvicorn.Config(app, host=host, port=port, log_level="warning")
    server = uvicorn.Server(config)
    await server.serve()
|