feat: add initial Hold Slayer AI telephony gateway implementation
Complete project scaffolding and core implementation of an AI-powered telephony system that calls companies, navigates IVR menus, waits on hold, and transfers to the user when a human answers.

Key components:
- FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces
- SIP/VoIP call management via PJSUA2 with RTP audio streaming
- LLM-powered IVR navigation using OpenAI/Anthropic with tool calling
- Hold detection service combining audio analysis and silence detection
- Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines
- Call recording with per-channel and mixed audio capture
- Event bus (asyncio pub/sub) for real-time client updates
- Web dashboard with live call monitoring
- SQLite persistence via SQLAlchemy with call history and analytics
- Notification support (email, SMS, webhook, desktop)
- Docker Compose deployment with Opal VoIP and Opal Media containers
- Comprehensive test suite with unit, integration, and E2E tests
- Simplified .gitignore and full project documentation in README
This commit is contained in:
339
services/call_flow_learner.py
Normal file
339
services/call_flow_learner.py
Normal file
@@ -0,0 +1,339 @@
|
||||
"""
|
||||
Call Flow Learner — Builds and refines call flows from exploration data.

When Hold Slayer runs in exploration mode, it discovers IVR steps.
This service takes those discoveries and:
  1. Builds a CallFlow tree that can be reused next time
  2. Merges new discoveries into existing flows (refining them)
  3. Uses LLM to label steps and infer menu structure

Over time, each phone number builds up a reliable call flow
that makes future calls faster and more accurate.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
|
||||
from models.call_flow import ActionType, CallFlow, CallFlowStep
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CallFlowLearner:
    """
    Learns IVR call flows from exploration data.

    Usage:
        learner = CallFlowLearner(llm_client=llm)

        # After an exploration call completes:
        flow = await learner.build_flow(
            phone_number="+18005551234",
            discovered_steps=steps_from_exploration,
            intent="cancel my card",
        )

        # Next time we call, merge new discoveries:
        updated = await learner.merge_discoveries(
            existing_flow=flow,
            new_steps=new_discoveries,
        )
    """

    # Audio classifications that are kept even when no action was taken.
    _MEANINGFUL_AUDIO_TYPES = ("ivr_prompt", "live_human", "music")

    def __init__(self, llm_client=None):
        """
        Args:
            llm_client: Optional client exposing an awaitable
                ``chat_json(prompt, system=...)``. When omitted, steps keep
                their heuristic descriptions.
        """
        self._llm = llm_client

    # ================================================================
    # Build Flow from Exploration
    # ================================================================

    async def build_flow(
        self,
        phone_number: str,
        discovered_steps: list[dict],
        intent: Optional[str] = None,
        company_name: Optional[str] = None,
    ) -> CallFlow:
        """
        Build a CallFlow from exploration discoveries.

        Args:
            phone_number: The number that was called.
            discovered_steps: List of step dicts from exploration mode:
                [{"timestamp": ..., "audio_type": "ivr_prompt",
                  "transcript": "Press 1 for...", "action_taken": {"dtmf": "1"}}, ...]
            intent: What the caller was trying to accomplish.
            company_name: Optional company name for labeling.

        Returns:
            A CallFlow that can be stored and reused. When no discovery
            converts into a step, the placeholder from ``_empty_flow`` is
            returned instead of a flow with an empty step list.
        """
        logger.info(
            f"🧠 Building call flow from {len(discovered_steps)} discoveries "
            f"for {phone_number}"
        )

        # Phases 1 + 2: filter out noise and convert to CallFlowSteps.
        flow_steps = self._convert_discoveries(discovered_steps)
        if not flow_steps:
            logger.warning(" No meaningful steps discovered")
            return self._empty_flow(phone_number, company_name)

        # Phase 3: Link steps together (next_step pointers)
        for current, following in zip(flow_steps, flow_steps[1:]):
            current.next_step = following.id

        # Phase 4: Use LLM to enhance step labels if available
        if self._llm:
            flow_steps = await self._llm_enhance_steps(flow_steps, intent)

        # One timestamp for id, notes and last_used so they always agree.
        now = datetime.now()
        name = company_name or self._guess_company_name(phone_number)
        flow = CallFlow(
            id=self._make_flow_id(phone_number, now),
            name=f"{name} — {intent or 'General'}",
            phone_number=phone_number,
            description=f"Auto-learned flow for {name}. Intent: {intent or 'general'}",
            steps=flow_steps,
            tags=["auto-learned"],
            notes=f"Learned from exploration on {now.isoformat()}",
            times_used=1,
            last_used=now,
        )

        logger.info(
            f" ✅ Built flow '{flow.name}' with {len(flow_steps)} steps"
        )
        return flow

    def _convert_discoveries(self, discovered_steps: list[dict]) -> list[CallFlowStep]:
        """Filter raw discoveries and convert the survivors to CallFlowSteps.

        A discovery is kept when its audio type is one of
        ``_MEANINGFUL_AUDIO_TYPES`` or when an action was taken on it.
        Discoveries that ``_discovery_to_step`` cannot map are dropped.
        """
        meaningful = [
            discovery
            for discovery in discovered_steps
            if discovery.get("audio_type") in self._MEANINGFUL_AUDIO_TYPES
            or discovery.get("action_taken")
        ]
        converted = (
            self._discovery_to_step(discovery, index, meaningful)
            for index, discovery in enumerate(meaningful)
        )
        return [step for step in converted if step is not None]

    def _discovery_to_step(
        self,
        discovery: dict,
        index: int,
        all_discoveries: list[dict],
    ) -> Optional[CallFlowStep]:
        """Convert a single exploration discovery to a CallFlowStep.

        Args:
            discovery: One exploration record (see ``build_flow``).
            index: Position of the record; used to derive the step id.
            all_discoveries: The full filtered list. Unused here, kept for
                interface compatibility.

        Returns:
            The matching step, or None for an unrecognised audio type.
        """
        audio_type = discovery.get("audio_type", "")
        transcript = discovery.get("transcript") or ""
        step_id = f"step_{index:03d}"

        if audio_type == "ivr_prompt":
            summary = self._summarize_menu(transcript)
            dtmf = self._extract_dtmf(discovery.get("action_taken"))
            if dtmf:
                # IVR menu where we pressed a button
                return CallFlowStep(
                    id=step_id,
                    description=summary or f"IVR menu (pressed {dtmf})",
                    action=ActionType.DTMF,
                    action_value=dtmf,
                    expect=self._extract_expect_pattern(transcript),
                    timeout=15,
                )
            # IVR prompt we just listened to. Also used when the recorded
            # action carries no digits: a DTMF step with nothing to press
            # could never be replayed.
            return CallFlowStep(
                id=step_id,
                description=summary or "IVR announcement",
                action=ActionType.LISTEN,
                timeout=30,
            )

        if audio_type == "music":
            # Hold music
            return CallFlowStep(
                id=step_id,
                description="Hold music — waiting for agent",
                action=ActionType.HOLD,
                timeout=3600,
            )

        if audio_type == "live_human":
            # Human detected — this is the transfer point
            return CallFlowStep(
                id=step_id,
                description="Live agent detected — transfer",
                action=ActionType.TRANSFER,
                action_value="preferred_device",
            )

        return None

    @staticmethod
    def _extract_dtmf(action_taken: Any) -> str:
        """Return the DTMF digits recorded in *action_taken*, or "" if none.

        Tolerates a missing action, a non-dict action and non-string
        digits (e.g. the integer 0), so malformed exploration records do
        not crash flow building.
        """
        if not isinstance(action_taken, dict):
            return ""
        digits = action_taken.get("dtmf")
        return "" if digits is None else str(digits)

    # ================================================================
    # Merge New Discoveries into Existing Flow
    # ================================================================

    async def merge_discoveries(
        self,
        existing_flow: CallFlow,
        new_steps: list[dict],
        intent: Optional[str] = None,
    ) -> CallFlow:
        """
        Merge new exploration discoveries into an existing flow.

        Current strategy is deliberately conservative: existing steps are
        kept, and the timeout of each existing step that matches a new
        step (same action and action value) is replaced by the average of
        the two. New steps without a counterpart are not added, to avoid
        corrupting a working flow.

        Args:
            existing_flow: Flow to refine; it is updated in place.
            new_steps: Raw discovery dicts from the latest exploration.
            intent: Accepted for interface compatibility; currently unused
                because merging never relabels steps.

        Returns:
            The same ``existing_flow`` object, with refreshed timeouts,
            ``times_used`` incremented and ``last_used`` set to now.
        """
        logger.info(
            f"🔄 Merging {len(new_steps)} new discoveries into "
            f"flow '{existing_flow.name}'"
        )

        # Convert directly rather than via build_flow(): only action,
        # action_value and timeout are needed, so building a throwaway
        # CallFlow (and paying for an LLM labeling call) is wasted work.
        candidate_steps = self._convert_discoveries(new_steps)
        unmatched = self._average_matching_timeouts(
            existing_flow.steps, candidate_steps
        )
        if unmatched:
            logger.info(
                f" {unmatched} new steps had no match in the existing flow "
                f"and were not added"
            )

        # Update metadata
        existing_flow.times_used = (existing_flow.times_used or 0) + 1
        existing_flow.last_used = datetime.now()

        logger.info(f" ✅ Merged. Flow now has {len(existing_flow.steps)} steps")
        return existing_flow

    @staticmethod
    def _average_matching_timeouts(
        existing_steps: list[CallFlowStep],
        new_steps: list[CallFlowStep],
    ) -> int:
        """Average timeouts of matching steps in place; return the unmatched count.

        Steps match on ``(action, action_value)``. Steps that share a key
        are paired one-to-one in order of appearance, so each existing
        step is updated at most once per merge. A single lookup dict would
        collapse duplicates onto the last one and average it repeatedly.
        """
        pending: dict[tuple, list[CallFlowStep]] = {}
        for step in existing_steps:
            pending.setdefault((step.action, step.action_value), []).append(step)

        unmatched = 0
        for new_step in new_steps:
            bucket = pending.get((new_step.action, new_step.action_value))
            if not bucket:
                unmatched += 1
                continue
            old_step = bucket.pop(0)
            if old_step.timeout and new_step.timeout:
                old_step.timeout = int((old_step.timeout + new_step.timeout) / 2)
        return unmatched

    # ================================================================
    # LLM Enhancement
    # ================================================================

    async def _llm_enhance_steps(
        self,
        steps: list[CallFlowStep],
        intent: Optional[str],
    ) -> list[CallFlowStep]:
        """Use LLM to improve step descriptions.

        Best effort: any failure while talking to the LLM is logged and
        the steps are returned with whatever descriptions they have.

        Args:
            steps: Steps to relabel; descriptions are updated in place.
            intent: Caller intent, passed to the LLM as context.

        Returns:
            The same list that was passed in.
        """
        if not self._llm or not steps:
            return steps

        try:
            result = await self._llm.chat_json(
                self._build_labeling_prompt(steps, intent),
                system="You are labeling IVR phone menu steps for a call flow database.",
            )
            self._apply_llm_descriptions(steps, result)
        except Exception as e:
            # Labeling is cosmetic; never let it break flow building.
            logger.warning(f" LLM enhancement failed (non-fatal): {e}")

        return steps

    @staticmethod
    def _build_labeling_prompt(steps: list[CallFlowStep], intent: Optional[str]) -> str:
        """Render the numbered step list and instructions sent to the LLM.

        Each step is prefixed with its 0-based index so the 'step_index'
        values in the reply refer to something the model has seen.
        """
        lines = []
        for index, step in enumerate(steps):
            line = f"{index}. {step.action.value}"
            if step.action_value:
                line += f" ({step.action_value})"
            if step.description:
                line += f": {step.description}"
            lines.append(line)

        header = (
            "These are steps discovered while navigating a phone IVR system.\n"
            f"Intent: {intent or 'general inquiry'}\n\n"
            "Steps:\n"
        )
        footer = (
            "\n\n"
            "For each step, provide a clear, concise description of what "
            "that step does. Return JSON array of objects with 'step_index' "
            "(the 0-based number shown before each step) "
            "and 'description' fields."
        )
        return header + "\n".join(lines) + footer

    @staticmethod
    def _apply_llm_descriptions(steps: list[CallFlowStep], result: Any) -> int:
        """Copy valid descriptions from an LLM reply onto *steps*.

        Accepts either a bare list of items or a dict with a "steps" list.
        Items that are not dicts, have a non-integer or out-of-range
        'step_index', or an empty or non-string 'description' are skipped,
        so one bad item cannot discard the rest.

        Returns:
            Number of descriptions applied.
        """
        if isinstance(result, dict):
            result = result.get("steps")
        if not isinstance(result, list):
            return 0

        applied = 0
        for item in result:
            if not isinstance(item, dict):
                continue
            idx = item.get("step_index")
            desc = item.get("description")
            # bool is an int subclass; True/False are not step indices.
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            if not isinstance(desc, str) or not desc.strip():
                continue
            if 0 <= idx < len(steps):
                steps[idx].description = desc.strip()
                applied += 1
        return applied

    # ================================================================
    # Helpers
    # ================================================================

    @staticmethod
    def _summarize_menu(transcript: str) -> Optional[str]:
        """Create a short summary of an IVR menu transcript.

        Returns None for an empty transcript, an option count when the
        text contains "press <digits>" phrases, and otherwise the
        transcript itself, truncated to 80 characters.
        """
        if not transcript:
            return None

        # Count how many options
        options = re.findall(r"press\s+\d+", transcript.lower())
        if options:
            return f"IVR menu with {len(options)} options"

        # Truncate long transcripts
        if len(transcript) > 80:
            return transcript[:77] + "..."
        return transcript

    @staticmethod
    def _extract_expect_pattern(transcript: str) -> Optional[str]:
        """Extract a regex pattern to match this prompt next time.

        Returns the regex-escaped, lower-cased first six words of the
        transcript, or None when it has fewer than four words.
        """
        if not transcript:
            return None

        words = transcript.split()
        if len(words) < 4:
            # Too short to be distinctive.
            return None

        # Use the opening phrase, escaped so punctuation is matched literally.
        phrase = " ".join(words[:6])
        return re.escape(phrase.lower())

    @staticmethod
    def _guess_company_name(phone_number: str) -> str:
        """Guess company name from phone number (placeholder)."""
        # In production, this would do a reverse lookup
        return f"Company {phone_number[-4:]}"

    @staticmethod
    def _make_flow_id(phone_number: str, when: datetime) -> str:
        """Build a flow id from the number (without '+') and a timestamp."""
        return f"flow_{phone_number.replace('+', '')}_{when.strftime('%Y%m%d%H%M%S')}"

    @staticmethod
    def _empty_flow(phone_number: str, company_name: Optional[str]) -> CallFlow:
        """Create an empty flow placeholder."""
        return CallFlow(
            id=CallFlowLearner._make_flow_id(phone_number, datetime.now()),
            name=f"{company_name or phone_number} — Empty",
            phone_number=phone_number,
            description="Empty flow — no meaningful steps discovered",
            steps=[],
            tags=["auto-learned", "empty"],
        )
|
||||
Reference in New Issue
Block a user