feat: add initial Hold Slayer AI telephony gateway implementation

Complete project scaffolding and core implementation of an AI-powered telephony system that calls companies, navigates IVR menus, waits on hold, and transfers to the user when a human answers.

Key components:
- FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces
- SIP/VoIP call management via PJSUA2 with RTP audio streaming
- LLM-powered IVR navigation using OpenAI/Anthropic with tool calling
- Hold detection service combining audio analysis and silence detection
- Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines
- Call recording with per-channel and mixed audio capture
- Event bus (asyncio pub/sub) for real-time client updates
- Web dashboard with live call monitoring
- SQLite persistence via SQLAlchemy with call history and analytics
- Notification support (email, SMS, webhook, desktop)
- Docker Compose deployment with Opal VoIP and Opal Media containers
- Comprehensive test suite with unit, integration, and E2E tests
- Simplified .gitignore and full project documentation in README
2026-03-21 19:23:26 +00:00
parent c9ff60702b
commit ecf37658ce
56 changed files with 11601 additions and 164 deletions
--- a/services/call_flow_learner.py
+++ b/services/call_flow_learner.py
@@ -0,0 +1,339 @@
+"""
+Call Flow Learner — Builds and refines call flows from exploration data.
+
+When Hold Slayer runs in exploration mode, it discovers IVR steps.
+This service takes those discoveries and:
+1. Builds a CallFlow tree that can be reused next time
+2. Merges new discoveries into existing flows (refining them)
+3. Optionally uses an LLM to rewrite step descriptions (labels only; step order and actions are left unchanged)
+
+Over time, each phone number builds up a reliable call flow
+that makes future calls faster and more accurate.
+"""
+
+import logging
+import re
+from datetime import datetime
+from typing import Any, Optional
+
+from models.call_flow import ActionType, CallFlow, CallFlowStep
+
+logger = logging.getLogger(__name__)
+
+
+class CallFlowLearner:
+    """
+    Learns IVR call flows from exploration data.
+
+    Usage:
+        learner = CallFlowLearner(llm_client=llm)
+
+        # After an exploration call completes:
+        flow = await learner.build_flow(
+            phone_number="+18005551234",
+            discovered_steps=steps_from_exploration,
+            intent="cancel my card",
+        )
+
+        # Next time we call, merge new discoveries:
+        updated = await learner.merge_discoveries(
+            existing_flow=flow,
+            new_steps=new_discoveries,
+        )
+    """
+
+    def __init__(self, llm_client=None):
+        self._llm = llm_client
+
+    # ================================================================
+    # Build Flow from Exploration
+    # ================================================================
+
+    async def build_flow(
+        self,
+        phone_number: str,
+        discovered_steps: list[dict],
+        intent: Optional[str] = None,
+        company_name: Optional[str] = None,
+    ) -> CallFlow:
+        """
+        Build a CallFlow from exploration discoveries.
+
+        Args:
+            phone_number: The number that was called.
+            discovered_steps: List of step dicts from exploration mode:
+                [{"timestamp": ..., "audio_type": "ivr_prompt",
+                  "transcript": "Press 1 for...", "action_taken": {"dtmf": "1"}}, ...]
+            intent: What the caller was trying to accomplish.
+            company_name: Optional company name for labeling.
+
+        Returns:
+            A CallFlow that can be stored and reused.
+        """
+        logger.info(
+            f"🧠 Building call flow from {len(discovered_steps)} discoveries "
+            f"for {phone_number}"
+        )
+
+        # Phase 1: Extract meaningful steps (skip silence, ringing)
+        meaningful = [
+            s for s in discovered_steps
+            if s.get("audio_type") in ("ivr_prompt", "live_human", "music")
+            or s.get("action_taken")
+        ]
+
+        if not meaningful:
+            logger.warning("  No meaningful steps discovered")
+            return self._empty_flow(phone_number, company_name)
+
+        # Phase 2: Convert discoveries to CallFlowSteps
+        flow_steps = []
+        for i, step in enumerate(meaningful):
+            flow_step = self._discovery_to_step(step, i, meaningful)
+            if flow_step:
+                flow_steps.append(flow_step)
+
+        # Phase 3: Link steps together (next_step pointers)
+        for i, step in enumerate(flow_steps[:-1]):
+            step.next_step = flow_steps[i + 1].id
+
+        # Phase 4: Use LLM to enhance step labels if available
+        if self._llm and flow_steps:
+            flow_steps = await self._llm_enhance_steps(flow_steps, intent)
+
+        # Build the flow
+        name = company_name or self._guess_company_name(phone_number)
+        flow = CallFlow(
+            id=f"flow_{phone_number.replace('+', '')}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
+            name=f"{name} — {intent or 'General'}",
+            phone_number=phone_number,
+            description=f"Auto-learned flow for {name}. Intent: {intent or 'general'}",
+            steps=flow_steps,
+            tags=["auto-learned"],
+            notes=f"Learned from exploration on {datetime.now().isoformat()}",
+            times_used=1,
+            last_used=datetime.now(),
+        )
+
+        logger.info(
+            f"  ✅ Built flow '{flow.name}' with {len(flow_steps)} steps"
+        )
+        return flow
+
+    def _discovery_to_step(
+        self,
+        discovery: dict,
+        index: int,
+        all_discoveries: list[dict],
+    ) -> Optional[CallFlowStep]:
+        """Convert a single exploration discovery to a CallFlowStep."""
+        audio_type = discovery.get("audio_type", "")
+        transcript = discovery.get("transcript", "")
+        action_taken = discovery.get("action_taken")
+
+        step_id = f"step_{index:03d}"
+
+        if audio_type == "ivr_prompt" and action_taken:
+            # IVR menu where we pressed a button
+            dtmf = action_taken.get("dtmf", "")
+            return CallFlowStep(
+                id=step_id,
+                description=self._summarize_menu(transcript) or f"IVR menu (pressed {dtmf})",
+                action=ActionType.DTMF,
+                action_value=dtmf,
+                expect=self._extract_expect_pattern(transcript),
+                timeout=15,
+            )
+
+        elif audio_type == "ivr_prompt" and not action_taken:
+            # IVR prompt we just listened to
+            return CallFlowStep(
+                id=step_id,
+                description=self._summarize_menu(transcript) or "IVR announcement",
+                action=ActionType.LISTEN,
+                timeout=30,
+            )
+
+        elif audio_type == "music":
+            # Hold music
+            return CallFlowStep(
+                id=step_id,
+                description="Hold music — waiting for agent",
+                action=ActionType.HOLD,
+                timeout=3600,
+            )
+
+        elif audio_type == "live_human":
+            # Human detected — this is the transfer point
+            return CallFlowStep(
+                id=step_id,
+                description="Live agent detected — transfer",
+                action=ActionType.TRANSFER,
+                action_value="preferred_device",
+            )
+
+        return None
+
+    # ================================================================
+    # Merge New Discoveries into Existing Flow
+    # ================================================================
+
+    async def merge_discoveries(
+        self,
+        existing_flow: CallFlow,
+        new_steps: list[dict],
+        intent: Optional[str] = None,
+    ) -> CallFlow:
+        """
+        Merge new exploration discoveries into an existing flow.
+
+        This refines the flow over time — updating timeouts,
+        confirming step order, adding alternative paths.
+        """
+        logger.info(
+            f"🔄 Merging {len(new_steps)} new discoveries into "
+            f"flow '{existing_flow.name}'"
+        )
+
+        # Build a new flow from the discoveries
+        new_flow = await self.build_flow(
+            phone_number=existing_flow.phone_number,
+            discovered_steps=new_steps,
+            intent=intent,
+        )
+
+        # Simple merge strategy: keep existing steps but update timeouts
+        # and add any new steps that weren't in the original
+        existing_by_action = {
+            (s.action, s.action_value): s for s in existing_flow.steps
+        }
+
+        for new_step in new_flow.steps:
+            key = (new_step.action, new_step.action_value)
+            if key in existing_by_action:
+                # Update timeout to be the average
+                old_step = existing_by_action[key]
+                if old_step.timeout and new_step.timeout:
+                    old_step.timeout = int(
+                        (old_step.timeout + new_step.timeout) / 2
+                    )
+            # New steps that don't exist are noted but not auto-added
+            # (to avoid corrupting a working flow)
+
+        # Update metadata
+        existing_flow.times_used = (existing_flow.times_used or 0) + 1
+        existing_flow.last_used = datetime.now()
+
+        logger.info(f"  ✅ Merged. Flow now has {len(existing_flow.steps)} steps")
+        return existing_flow
+
+    # ================================================================
+    # LLM Enhancement
+    # ================================================================
+
+    async def _llm_enhance_steps(
+        self,
+        steps: list[CallFlowStep],
+        intent: Optional[str],
+    ) -> list[CallFlowStep]:
+        """Use LLM to improve step descriptions and structure."""
+        if not self._llm:
+            return steps
+
+        try:
+            # Build a summary of the steps for the LLM
+            step_descriptions = []
+            for s in steps:
+                desc = f"- {s.action.value}"
+                if s.action_value:
+                    desc += f" ({s.action_value})"
+                if s.description:
+                    desc += f": {s.description}"
+                step_descriptions.append(desc)
+
+            prompt = (
+                f"These are steps discovered while navigating a phone IVR system.\n"
+                f"Intent: {intent or 'general inquiry'}\n\n"
+                f"Steps:\n" + "\n".join(step_descriptions) + "\n\n"
+                f"For each step, provide a clear, concise description of what "
+                f"that step does. Return JSON array of objects with 'step_index' "
+                f"and 'description' fields."
+            )
+
+            result = await self._llm.chat_json(
+                prompt,
+                system="You are labeling IVR phone menu steps for a call flow database.",
+            )
+
+            # Apply LLM descriptions
+            if isinstance(result, list):
+                for item in result:
+                    idx = item.get("step_index", -1)
+                    desc = item.get("description", "")
+                    if 0 <= idx < len(steps) and desc:
+                        steps[idx].description = desc
+            elif isinstance(result, dict) and "steps" in result:
+                for item in result["steps"]:
+                    idx = item.get("step_index", -1)
+                    desc = item.get("description", "")
+                    if 0 <= idx < len(steps) and desc:
+                        steps[idx].description = desc
+
+        except Exception as e:
+            logger.warning(f"  LLM enhancement failed (non-fatal): {e}")
+
+        return steps
+
+    # ================================================================
+    # Helpers
+    # ================================================================
+
+    @staticmethod
+    def _summarize_menu(transcript: str) -> Optional[str]:
+        """Create a short summary of an IVR menu transcript."""
+        if not transcript:
+            return None
+
+        # Count how many options
+        options = re.findall(r'press\s+\d+', transcript.lower())
+        if options:
+            return f"IVR menu with {len(options)} options"
+
+        # Truncate long transcripts
+        if len(transcript) > 80:
+            return transcript[:77] + "..."
+        return transcript
+
+    @staticmethod
+    def _extract_expect_pattern(transcript: str) -> Optional[str]:
+        """Extract a regex pattern to match this prompt next time."""
+        if not transcript:
+            return None
+
+        # Find the most distinctive phrase (>4 words, not generic)
+        words = transcript.split()
+        if len(words) >= 4:
+            # Use first meaningful phrase
+            phrase = " ".join(words[:6])
+            # Escape for regex
+            return re.escape(phrase.lower())
+
+        return None
+
+    @staticmethod
+    def _guess_company_name(phone_number: str) -> str:
+        """Guess company name from phone number (placeholder)."""
+        # In production, this would do a reverse lookup
+        return f"Company {phone_number[-4:]}"
+
+    @staticmethod
+    def _empty_flow(phone_number: str, company_name: Optional[str]) -> CallFlow:
+        """Create an empty flow placeholder."""
+        return CallFlow(
+            id=f"flow_{phone_number.replace('+', '')}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
+            name=f"{company_name or phone_number} — Empty",
+            phone_number=phone_number,
+            description="Empty flow — no meaningful steps discovered",
+            steps=[],
+            tags=["auto-learned", "empty"],
+        )