feat: add per-agent loop safeguards for tool-call turns

Introduce three optional per-agent config fields to bound tool-call loop execution: `max_iterations` (default 15), `streaming_timeout` (default 120s), and `turn_timeout` (default 300s wall-clock).

- Plumb limits from agent config through `_build_agents_table` and `_start_agent` into `MultimodalAgentMCPServer` via `request_limits`
- Apply `max_iterations` and `streaming_timeout` to `RequestParams`
- Wrap turn dispatch in `asyncio.wait_for` to enforce `turn_timeout`, logging a warning and re-raising on timeout
- Document the new fields in README
2026-05-27 05:41:08 -04:00
parent ca7d714a31
commit 440f7fb60c
4 changed files with 57 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -75,6 +75,28 @@ agents:
    description: "Web search and knowledge graph"
 ```
 ### Loop safeguards

 Three optional fields bound how long an agent's tool-call loop can run:

 | Field | Type | Default | Purpose |
 |---|---|---|---|
 | `max_iterations` | int | 15 | Maximum tool calls in a single agent turn |
 | `streaming_timeout` | float | 120 | Max idle seconds between streaming events |
 | `turn_timeout` | float | 300 | Hard wall-clock limit for a full turn (seconds) |

 All three are optional. Agents that omit them use the defaults shown above. When a turn exceeds `turn_timeout`, it is cancelled, a warning is logged, and the timeout error is raised to the caller.

 ```yaml
 agents:
  research:
    module: agents.research
    port: 8250
    max_iterations: 10      # this agent only needs a few search calls
    streaming_timeout: 60   # fail fast on a slow search MCP
    turn_timeout: 120       # research turns should not take more than 2 min
 ```
 ---
 ## `fastagent.config.yaml` extensions
--- a/pallas/multimodal_server.py
+++ b/pallas/multimodal_server.py
@@ -19,6 +19,7 @@ fast-agent instance whose ``message_history`` is seeded from the caller's
 memory, no restart amnesia.
 """
 import asyncio
 import time
 from typing import Any
@@ -125,8 +126,9 @@ def _history_payload_to_multipart(
 class MultimodalAgentMCPServer(AgentMCPServer):
    """AgentMCPServer with optional image + history support on send_message."""
-    def __init__(self, *args, **kwargs) -> None:
+    def __init__(self, *args, request_limits: dict | None = None, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self._request_limits = request_limits or {}
        self._register_health_routes()
    def _register_health_routes(self) -> None:
@@ -210,6 +212,8 @@ class MultimodalAgentMCPServer(AgentMCPServer):
            request_params = RequestParams(
                tool_execution_handler=EnrichedMCPToolProgressManager(report_progress),
                emit_loop_progress=True,
                max_iterations=self._request_limits.get("max_iterations", 15),
                streaming_timeout=self._request_limits.get("streaming_timeout", 120.0),
            )
            instance = await self._acquire_instance(ctx)
            agent = instance.app[agent_name]
@@ -271,11 +275,26 @@ class MultimodalAgentMCPServer(AgentMCPServer):
                    )
                    return response
-                if agent_context and ctx:
+                turn_timeout = self._request_limits.get("turn_timeout", 300.0)
-                    return await self.with_bridged_context(
+
-                        agent_context, ctx, execute_send
+                async def _dispatch() -> str:
                    if agent_context and ctx:
                        return await self.with_bridged_context(
                            agent_context, ctx, execute_send
                        )
                    return await execute_send()
                try:
                    return await asyncio.wait_for(_dispatch(), timeout=turn_timeout)
                except asyncio.TimeoutError:
                    logger.warning(
                        f"Agent '{agent_name}' turn exceeded {turn_timeout}s wall-clock limit",
                        name="turn_timeout",
                        agent=agent_name,
                        turn_timeout=turn_timeout,
                        conversation_id=conversation_id,
                    )
-                return await execute_send()
+                    raise
            except BaseException:
                metrics_outcome = "error"
                raise
--- a/pallas/server.py
+++ b/pallas/server.py
@@ -62,6 +62,9 @@ def _build_agents_table(config: dict) -> dict[str, dict]:
            "port": agent["port"],
            "model": agent.get("model"),
            "model_capabilities": agent.get("model_capabilities"),
            "max_iterations": agent.get("max_iterations"),
            "streaming_timeout": agent.get("streaming_timeout"),
            "turn_timeout": agent.get("turn_timeout"),
        }
        for name, agent in config["agents"].items()
    }
@@ -259,6 +262,12 @@ async def _start_agent(name: str, agents: dict[str, dict]) -> None:
        #     breaking the "Pallas is ephemeral" contract.
        # With "request" the Pallas process holds no per-conversation state
        # and the LLM sees exactly what Daedalus asks it to see.
        request_limits = {
            k: entry[k]
            for k in ("max_iterations", "streaming_timeout", "turn_timeout")
            if entry.get(k) is not None
        }
        server = MultimodalAgentMCPServer(
            primary_instance=primary_instance,
            create_instance=fast_instance._server_instance_factory,
@@ -267,6 +276,7 @@ async def _start_agent(name: str, agents: dict[str, dict]) -> None:
            server_name=f"{fast_instance.name}-MCP-Server",
            host="0.0.0.0",
            get_registry_version=fast_instance._get_registry_version,
            request_limits=request_limits,
        )
        downstream_servers = _resolve_downstream_servers(fast_instance)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pallas-mcp"
-version = "0.2.2"
+version = "0.3.0"
 description = "FastAgent MCP Bridge — generic runtime for serving FastAgent agents over StreamableHTTP"
 requires-python = ">=3.13.5"
 dependencies = [