feat: add initial Hold Slayer AI telephony gateway implementation

Complete project scaffolding and core implementation of an AI-powered
telephony system that calls companies, navigates IVR menus, waits on
hold, and transfers to the user when a human answers.

Key components:
- FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces
- SIP/VoIP call management via PJSUA2 with RTP audio streaming
- LLM-powered IVR navigation using OpenAI/Anthropic with tool calling
- Hold detection service combining audio analysis and silence detection
- Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines
- Call recording with per-channel and mixed audio capture
- Event bus (asyncio pub/sub) for real-time client updates
- Web dashboard with live call monitoring
- SQLite persistence via SQLAlchemy with call history and analytics
- Notification support (email, SMS, webhook, desktop)
- Docker Compose deployment with Opal VoIP and Opal Media containers
- Comprehensive test suite with unit, integration, and E2E tests
- Simplified .gitignore and full project documentation in README
This commit is contained in:
2026-03-21 19:23:26 +00:00
parent c9ff60702b
commit ecf37658ce
56 changed files with 11601 additions and 164 deletions

717
services/hold_slayer.py Normal file
View File

@@ -0,0 +1,717 @@
"""
Hold Slayer Service — The main event.
Navigate IVR trees, wait on hold, detect when a human picks up,
and transfer you in. This is the state machine that orchestrates
the entire hold-slaying process.
Two modes:
1. run_with_flow(): Follow a stored call flow tree (fast, reliable)
2. run_exploration(): No stored flow — listen, transcribe, and figure it out
"""
import asyncio
import logging
import re
import time
from typing import Optional
from config import Settings
from core.call_manager import CallManager
from core.sip_engine import SIPEngine
from models.call import ActiveCall, AudioClassification, CallStatus, ClassificationResult
from models.call_flow import ActionType, CallFlow, CallFlowStep
from models.events import EventType, GatewayEvent
from services.audio_classifier import AudioClassifier
from services.transcription import TranscriptionService
logger = logging.getLogger(__name__)
# LLM client is optional — imported at use time
_llm_client = None  # None = not yet attempted; False = attempt failed (never retry)
def _get_llm():
    """Return the shared LLM client, or None when it is unavailable.

    The client is built lazily on the first call. A failed construction
    (missing optional dependency, bad config) stores a False sentinel so
    subsequent calls return None immediately without retrying.
    """
    global _llm_client
    if _llm_client is None:
        try:
            from config import get_settings
            from services.llm_client import LLMClient
            cfg = get_settings()
            _llm_client = LLMClient(
                base_url=cfg.llm.base_url,
                model=cfg.llm.model,
                api_key=cfg.llm.api_key,
                timeout=cfg.llm.timeout,
            )
        except Exception as e:
            logger.debug(f"LLM client not available: {e}")
            _llm_client = False  # Sentinel: don't retry
    return None if _llm_client is False else _llm_client
class HoldSlayerService:
    """
    The Hold Slayer.
    Navigates IVR menus, waits on hold, detects live humans,
    and transfers the call to your device.
    """
    def __init__(
        self,
        gateway,  # AIPSTNGateway (avoid circular import)
        call_manager: CallManager,
        sip_engine: SIPEngine,
        classifier: AudioClassifier,
        transcription: TranscriptionService,
        settings: Settings,
    ):
        # Orchestrator that owns transfer_call(); untyped to avoid a
        # circular import with the gateway module.
        self.gateway = gateway
        # Call registry: status updates, transcripts, classifications, events.
        self.call_manager = call_manager
        # SIP/RTP layer: audio streams and DTMF sending.
        self.sip_engine = sip_engine
        # Audio classifier (music / IVR prompt / live human / silence / ringing).
        self.classifier = classifier
        # Speech-to-text service used to verify prompts and humans.
        self.transcription = transcription
        # App settings; hold_slayer.* values are read throughout this class.
        self.settings = settings
async def run(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    call_flow_id: Optional[str] = None,
) -> bool:
    """
    Main entry point. Run the Hold Slayer on a call.

    Args:
        call: The active call to work on
        sip_leg_id: SIP leg ID for the PSTN call
        call_flow_id: Optional stored call flow to follow

    Returns:
        True if successfully transferred to user, False otherwise

    Raises:
        asyncio.CancelledError: re-raised on task cancellation so the
            awaiting task sees a cancelled state instead of a bogus
            False result.
    """
    logger.info(f"🗡️ Hold Slayer activated for {call.remote_number}")
    logger.info(f" Intent: {call.intent}")
    logger.info(f" Call Flow: {call_flow_id or 'exploration mode'}")
    try:
        # The PSTN leg must be answered before we can navigate anything.
        await self._wait_for_connection(call, timeout=60)
        if call_flow_id:
            # Load the stored call flow from the database
            flow = await self._load_call_flow(call_flow_id)
            if flow:
                return await self.run_with_flow(call, sip_leg_id, flow)
            logger.warning(f"Call flow '{call_flow_id}' not found, switching to exploration")
        # No flow or flow not found — explore
        return await self.run_exploration(call, sip_leg_id)
    except asyncio.CancelledError:
        # Fix: previously returned False here, which swallowed the
        # cancellation and completed the task normally. Re-raise so
        # asyncio cancellation propagates correctly.
        logger.info(f"Hold Slayer cancelled for {call.id}")
        raise
    except Exception as e:
        logger.error(f"Hold Slayer error: {e}", exc_info=True)
        await self.call_manager.update_status(call.id, CallStatus.FAILED)
        return False
# ================================================================
# Mode 1: Follow a Stored Call Flow
# ================================================================
async def run_with_flow(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    flow: CallFlow,
) -> bool:
    """
    Navigate using a stored call flow tree.
    Falls back to exploration for unknown steps.

    Walks the flow step by step, executing each step's action
    (HOLD / DTMF / WAIT / LISTEN / SPEAK / TRANSFER) and following
    next_step / fallback_step links. Returns True when a TRANSFER step
    completes, False when the flow dead-ends or a hold times out.
    """
    logger.info(f"📋 Following call flow: {flow.name}")
    steps = flow.steps_by_id()
    # Start at the first step in the flow, if there is one.
    current_step_id = flow.steps[0].id if flow.steps else None
    while current_step_id:
        step = steps.get(current_step_id)
        if not step:
            # Broken link in the flow graph — give up on the flow.
            logger.error(f"Step '{current_step_id}' not found in flow")
            break
        call.current_step_id = current_step_id
        logger.info(f"📍 Step: {step.description}")
        await self.call_manager.event_bus.publish(GatewayEvent(
            type=EventType.IVR_STEP,
            call_id=call.id,
            data={"step_id": step.id, "description": step.description, "action": step.action.value},
            message=f"📍 IVR Step: {step.description}",
        ))
        # === Execute the step based on its action type ===
        if step.action == ActionType.HOLD:
            # HOLD MODE: Audio classifier takes over
            await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)
            logger.info(f"⏳ On hold. Activating hold detection...")
            human_detected = await self._wait_for_human(
                call, sip_leg_id, timeout=step.timeout
            )
            if human_detected:
                current_step_id = step.next_step
            else:
                # Hold timed out — abandon the flow (returns False below).
                logger.warning("⏰ Hold timeout reached!")
                break
        elif step.action == ActionType.DTMF:
            # Wait for the expected prompt, then send DTMF
            await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)
            if step.expect:
                heard = await self._wait_for_prompt(
                    call, sip_leg_id, step.expect, step.timeout
                )
                if not heard and step.fallback_step:
                    # NOTE: with no fallback_step configured we fall
                    # through and press the digits anyway.
                    logger.info(f"⚠️ Didn't hear expected prompt, falling back")
                    current_step_id = step.fallback_step
                    continue
            # Send the DTMF digits
            if step.action_value:
                await self.sip_engine.send_dtmf(sip_leg_id, step.action_value)
                logger.info(f"📱 Pressed: {step.action_value}")
                await self.call_manager.event_bus.publish(GatewayEvent(
                    type=EventType.IVR_DTMF_SENT,
                    call_id=call.id,
                    data={"digits": step.action_value, "step": step.id},
                    message=f"📱 DTMF sent: {step.action_value}",
                ))
            # Small delay after DTMF for the IVR to process
            await asyncio.sleep(2.0)
            current_step_id = step.next_step
        elif step.action == ActionType.WAIT:
            # Just wait for a prompt
            if step.expect:
                await self._wait_for_prompt(
                    call, sip_leg_id, step.expect, step.timeout
                )
            else:
                await asyncio.sleep(step.timeout)
            current_step_id = step.next_step
        elif step.action == ActionType.LISTEN:
            # Listen and decide — regex first, LLM fallback
            await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)
            transcript = await self._listen_for_menu(
                call, sip_leg_id, step.timeout
            )
            # Phase 1: Try regex-based keyword matching (fast, no API call)
            decision = self._decide_menu_option(
                transcript, call.intent or "", step.expect
            )
            # Phase 2: LLM fallback if regex couldn't decide
            if not decision and transcript:
                llm = _get_llm()
                if llm:
                    try:
                        logger.info("🤖 Regex inconclusive, asking LLM...")
                        llm_result = await llm.analyze_ivr_menu(
                            transcript=transcript,
                            intent=call.intent or "",
                            previous_selections=list(call.dtmf_history) if hasattr(call, 'dtmf_history') else None,
                        )
                        decision = llm_result.get("digit")
                        if decision:
                            confidence = llm_result.get("confidence", 0)
                            reason = llm_result.get("reason", "")
                            logger.info(
                                f"🤖 LLM decided: press {decision} "
                                f"(confidence={confidence}, reason='{reason}')"
                            )
                    except Exception as e:
                        # LLM is best-effort — fall through to the
                        # press-0 default below.
                        logger.warning(f"🤖 LLM fallback failed: {e}")
            if decision:
                await self.sip_engine.send_dtmf(sip_leg_id, decision)
                logger.info(f"🧠 Decided: press {decision} (heard: '{transcript[:60]}...')")
            else:
                # Default: press 0 for agent
                await self.sip_engine.send_dtmf(sip_leg_id, "0")
                logger.info(f"🧠 No clear match, pressing 0 for agent")
            await asyncio.sleep(2.0)
            current_step_id = step.next_step
        elif step.action == ActionType.SPEAK:
            # Say something into the call (TTS)
            # TODO: Implement TTS integration
            logger.info(f"🗣️ Would say: '{step.action_value}' (TTS not yet implemented)")
            await asyncio.sleep(3.0)
            current_step_id = step.next_step
        elif step.action == ActionType.TRANSFER:
            # We did it! Transfer to user's device
            await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
            logger.info(f"🚨 TRANSFERRING TO {step.action_value}")
            # Target precedence: step's explicit target, then the call's
            # device, then the configured default.
            device_target = step.action_value or call.device or self.settings.hold_slayer.default_transfer_device
            await self.gateway.transfer_call(call.id, device_target)
            return True
        else:
            logger.warning(f"Unknown action type: {step.action}")
            current_step_id = step.next_step
    # Flow exhausted (or broken) without reaching a TRANSFER step.
    return False
# ================================================================
# Mode 2: Exploration (No Stored Flow)
# ================================================================
async def run_exploration(
    self,
    call: ActiveCall,
    sip_leg_id: str,
) -> bool:
    """
    No stored flow — explore the IVR blind.
    Records what it discovers so we can build a flow for next time.

    Loops until max_hold_time: captures ~3 s of audio, classifies it,
    and reacts — transfers on a live human (or a hold→human transition),
    keeps waiting through hold music, and navigates IVR prompts via
    keyword matching (pressing 0 as a last resort). Returns True on a
    successful transfer, False on timeout.

    NOTE(review): discovered_steps is collected but only logged here —
    presumably intended to be persisted as a CallFlow later; confirm.
    """
    logger.info(f"🔍 Exploration mode: discovering IVR for {call.remote_number}")
    await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)
    discovered_steps: list[dict] = []
    max_time = self.settings.hold_slayer.max_hold_time
    start_time = time.time()
    while time.time() - start_time < max_time:
        # Check if call is still active
        current_call = self.call_manager.get_call(call.id)
        if not current_call or current_call.status in (
            CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
        ):
            break
        # Get audio and classify
        audio_chunk = b""
        try:
            async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
                audio_chunk += chunk
                if len(audio_chunk) >= 16000 * 2 * 3:  # 3 seconds
                    break
        except Exception as e:
            logger.error(f"Audio stream error: {e}")
            await asyncio.sleep(1.0)
            continue
        if not audio_chunk:
            await asyncio.sleep(1.0)
            continue
        # Classify the audio
        classification = self.classifier.classify_chunk(audio_chunk)
        self.classifier.update_history(classification.audio_type)
        await self.call_manager.add_classification(call.id, classification)
        # Transcribe if it sounds like speech
        transcript = ""
        if classification.audio_type in (
            AudioClassification.IVR_PROMPT,
            AudioClassification.LIVE_HUMAN,
        ):
            transcript = await self.transcription.transcribe(
                audio_chunk,
                prompt="Phone IVR menu, customer service, press 1 for..."
            )
            if transcript:
                await self.call_manager.add_transcript(call.id, transcript)
        # Record discovery (action_taken is filled in below if we act)
        discovered_steps.append({
            "timestamp": time.time(),
            "audio_type": classification.audio_type.value,
            "confidence": classification.confidence,
            "transcript": transcript,
            "action_taken": None,
        })
        # === Decision Logic ===
        if classification.audio_type == AudioClassification.LIVE_HUMAN:
            # HUMAN DETECTED! Transfer!
            logger.info("🚨 LIVE HUMAN DETECTED!")
            await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
            device = call.device or self.settings.hold_slayer.default_transfer_device
            await self.gateway.transfer_call(call.id, device)
            logger.info(f"📋 Discovered {len(discovered_steps)} IVR steps")
            return True
        elif classification.audio_type == AudioClassification.MUSIC:
            # On hold — just keep monitoring
            if current_call.status != CallStatus.ON_HOLD:
                await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)
            # Check for hold→human transition
            if self.classifier.detect_hold_to_human_transition():
                logger.info("🚨 Hold-to-human transition detected!")
                await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
                device = call.device or self.settings.hold_slayer.default_transfer_device
                await self.gateway.transfer_call(call.id, device)
                return True
        elif classification.audio_type == AudioClassification.IVR_PROMPT and transcript:
            # IVR menu — try to navigate
            decision = self._decide_menu_option(
                transcript, call.intent or "", None
            )
            if decision:
                await self.sip_engine.send_dtmf(sip_leg_id, decision)
                discovered_steps[-1]["action_taken"] = {"dtmf": decision}
                logger.info(f"🧠 Exploration: pressed {decision}")
                await asyncio.sleep(2.0)
            else:
                # Try pressing 0 for agent
                await self.sip_engine.send_dtmf(sip_leg_id, "0")
                discovered_steps[-1]["action_taken"] = {"dtmf": "0", "reason": "default_agent"}
                logger.info("🧠 Exploration: pressed 0 (trying for agent)")
                await asyncio.sleep(2.0)
        elif classification.audio_type == AudioClassification.SILENCE:
            # Silence — wait a bit
            await asyncio.sleep(2.0)
        elif classification.audio_type == AudioClassification.RINGING:
            # Still ringing
            await asyncio.sleep(1.0)
    logger.warning(f"Hold Slayer timed out after {max_time}s")
    return False
# ================================================================
# Core Detection Methods
# ================================================================
async def _wait_for_human(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    timeout: int = 7200,
) -> bool:
    """
    Wait on hold until a live human is detected.
    Continuously classifies audio and watches for the
    music → speech transition.

    Args:
        call: The call being monitored.
        sip_leg_id: SIP leg to pull audio from.
        timeout: Max seconds to stay on hold (default 2 hours).

    Returns:
        True when a human is confirmed — either LIVE_HUMAN audio with a
        transcript of 3+ words, or the classifier's hold→human
        transition detector fires. False when the call ends first or
        the timeout expires.
    """
    check_interval = self.settings.hold_slayer.hold_check_interval
    start_time = time.time()
    while time.time() - start_time < timeout:
        # Check if call is still active
        current_call = self.call_manager.get_call(call.id)
        if not current_call or current_call.status in (
            CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
        ):
            return False
        # Get audio chunk — byte target matches check_interval seconds
        # of 16000 samples/s × 2 bytes (assumes 16 kHz 16-bit audio).
        audio_chunk = b""
        try:
            async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
                audio_chunk += chunk
                if len(audio_chunk) >= int(16000 * 2 * check_interval):
                    break
        except Exception:
            # Stream hiccup — back off one interval and retry.
            await asyncio.sleep(check_interval)
            continue
        if not audio_chunk:
            await asyncio.sleep(check_interval)
            continue
        # Classify
        result = self.classifier.classify_chunk(audio_chunk)
        self.classifier.update_history(result.audio_type)
        await self.call_manager.add_classification(call.id, result)
        # Check for human
        if result.audio_type == AudioClassification.LIVE_HUMAN:
            # Verify with transcription
            transcript = await self.transcription.transcribe(audio_chunk)
            if transcript:
                await self.call_manager.add_transcript(call.id, transcript)
                # If we got meaningful speech, it's probably a real person
                if len(transcript.split()) >= 3:
                    logger.info(f"🚨 Human confirmed! Said: '{transcript[:100]}'")
                    return True
        # Check for the music→speech transition pattern
        if self.classifier.detect_hold_to_human_transition():
            logger.info("🚨 Hold-to-human transition detected!")
            return True
        # Log progress periodically.
        # NOTE(review): fires when the loop happens to land exactly on a
        # 60 s boundary, so it may log twice or skip a minute depending
        # on loop timing — confirm whether that's acceptable.
        elapsed = int(time.time() - start_time)
        if elapsed > 0 and elapsed % 60 == 0:
            logger.info(
                f"⏳ Still on hold... {elapsed}s "
                f"(audio: {result.audio_type.value}, {result.confidence:.0%})"
            )
    return False
async def _wait_for_prompt(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    expected_pattern: str,
    timeout: int = 30,
) -> bool:
    """
    Wait for an expected IVR prompt.

    Repeatedly captures ~3 seconds of call audio, transcribes anything
    that sounds like speech, and returns True as soon as the transcript
    matches ``expected_pattern`` (case-insensitive regex; falls back to
    a plain substring test when the pattern is not valid regex).
    Returns False once ``timeout`` seconds pass without a match.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        buffer = b""
        try:
            async for frame in self.sip_engine.get_audio_stream(sip_leg_id):
                buffer += frame
                if len(buffer) >= 16000 * 2 * 3:  # 3 seconds
                    break
        except Exception:
            await asyncio.sleep(1.0)
            continue
        if not buffer:
            await asyncio.sleep(1.0)
            continue
        # Only transcribe speech-like audio.
        kind = self.classifier.classify_chunk(buffer).audio_type
        if kind not in (
            AudioClassification.IVR_PROMPT,
            AudioClassification.LIVE_HUMAN,
        ):
            continue
        text = await self.transcription.transcribe(buffer)
        if not text:
            continue
        await self.call_manager.add_transcript(call.id, text)
        # Regex first; invalid patterns degrade to a substring check.
        try:
            matched = re.search(expected_pattern, text, re.IGNORECASE) is not None
        except re.error:
            matched = expected_pattern.lower() in text.lower()
        if matched:
            logger.info(f"✅ Heard expected: '{text[:80]}'")
            return True
    logger.warning(f"⚠️ Didn't hear expected prompt within {timeout}s")
    return False
async def _listen_for_menu(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    timeout: int = 30,
) -> str:
    """Listen for an IVR menu and return the full transcript.

    Captures ~5-second audio windows until the menu prompt ends
    (silence after speech was heard), the stream dries up, or
    ``timeout`` seconds elapse. The joined transcript is also recorded
    on the call before being returned (empty string when nothing heard).
    """
    heard: list[str] = []
    deadline = time.time() + timeout
    while time.time() < deadline:
        buf = b""
        try:
            async for frame in self.sip_engine.get_audio_stream(sip_leg_id):
                buf += frame
                if len(buf) >= 16000 * 2 * 5:  # 5 seconds
                    break
        except Exception:
            await asyncio.sleep(1.0)
            continue
        if not buf:
            break
        kind = self.classifier.classify_chunk(buf).audio_type
        # Silence after we already captured speech → the prompt is over.
        if kind == AudioClassification.SILENCE and heard:
            break
        if kind in (
            AudioClassification.IVR_PROMPT,
            AudioClassification.LIVE_HUMAN,
        ):
            piece = await self.transcription.transcribe(buf)
            if piece:
                heard.append(piece)
    full = " ".join(heard)
    if full:
        await self.call_manager.add_transcript(call.id, full)
    return full
async def _wait_for_connection(self, call: ActiveCall, timeout: int = 60) -> None:
    """Poll until the call is answered.

    Returns once the call reaches CONNECTED (or is already navigating
    the IVR). Raises RuntimeError if the call vanishes or terminates,
    TimeoutError if it isn't connected within ``timeout`` seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        snapshot = self.call_manager.get_call(call.id)
        if not snapshot:
            raise RuntimeError(f"Call {call.id} disappeared")
        status = snapshot.status
        if status in (CallStatus.CONNECTED, CallStatus.NAVIGATING_IVR):
            return
        if status in (CallStatus.FAILED, CallStatus.CANCELLED):
            raise RuntimeError(f"Call {call.id} failed: {status}")
        await asyncio.sleep(0.5)  # poll twice a second
    raise TimeoutError(f"Call {call.id} not connected within {timeout}s")
# ================================================================
# Menu Navigation Logic
# ================================================================
def _decide_menu_option(
self,
transcript: str,
intent: str,
expected_options: Optional[str],
) -> Optional[str]:
"""
Decide which menu option to select based on transcript and intent.
Simple keyword-based matching. This is where an LLM integration
would massively improve navigation accuracy.
Returns:
DTMF digit(s) to press, or None if can't decide
"""
transcript_lower = transcript.lower()
intent_lower = intent.lower()
# Common IVR patterns: "press 1 for X, press 2 for Y"
# Extract options
options = re.findall(
r'(?:press|dial|say)\s+(\d+)\s+(?:for|to)\s+(.+?)(?:\.|,|press|dial|$)',
transcript_lower,
)
if not options:
# Try alternate patterns: "for X, press 1"
options = re.findall(
r'for\s+(.+?),?\s*(?:press|dial)\s+(\d+)',
transcript_lower,
)
# Swap order to be (digit, description)
options = [(digit, desc) for desc, digit in options]
if not options:
return None
# Score each option against the intent
best_match = None
best_score = 0
# Keywords that map intents to IVR options
intent_keywords = {
"cancel": ["cancel", "close", "end", "terminate"],
"dispute": ["dispute", "charge", "billing", "transaction", "statement"],
"balance": ["balance", "account", "summary"],
"agent": ["agent", "representative", "operator", "speak", "person", "human"],
"payment": ["payment", "pay", "bill"],
"card": ["card", "credit", "debit"],
"fraud": ["fraud", "unauthorized", "stolen", "lost"],
"transfer": ["transfer", "move", "send"],
}
for digit, description in options:
score = 0
# Direct keyword match in description
for keyword_group, keywords in intent_keywords.items():
if any(kw in intent_lower for kw in keywords):
if any(kw in description for kw in keywords):
score += 10
# Fuzzy: any word overlap between intent and description
intent_words = set(intent_lower.split())
desc_words = set(description.split())
overlap = intent_words & desc_words
score += len(overlap) * 3
# "Speak to agent" is usually what we want if nothing else matches
if any(w in description for w in ["agent", "representative", "operator", "person"]):
score += 5
if score > best_score:
best_score = score
best_match = digit
if best_match and best_score >= 3:
return best_match
# Default: look for "agent" or "representative" option
for digit, description in options:
if any(w in description for w in ["agent", "representative", "operator"]):
return digit
return None
async def _load_call_flow(self, flow_id: str) -> Optional[CallFlow]:
    """Load a stored call flow from the database.

    Args:
        flow_id: Primary key of the stored flow.

    Returns:
        The hydrated CallFlow, or None when the row does not exist or
        the lookup fails. Failures are logged, never raised — callers
        fall back to exploration mode.
    """
    # Imported lazily so the module doesn't pull in the db layer at
    # import time.
    from db.database import get_session_factory, StoredCallFlow
    from sqlalchemy import select
    try:
        factory = get_session_factory()
        async with factory() as session:
            result = await session.execute(
                select(StoredCallFlow).where(StoredCallFlow.id == flow_id)
            )
            row = result.scalar_one_or_none()
            if row:
                # Fix: dropped the redundant local import of
                # CallFlowStep — it is already imported at module scope.
                return CallFlow(
                    id=row.id,
                    name=row.name,
                    phone_number=row.phone_number,
                    description=row.description or "",
                    steps=[CallFlowStep(**s) for s in row.steps],
                    tags=row.tags or [],
                    notes=row.notes,
                    avg_hold_time=row.avg_hold_time,
                    success_rate=row.success_rate,
                    last_used=row.last_used,
                    times_used=row.times_used or 0,
                )
    except Exception as e:
        logger.error(f"Failed to load call flow '{flow_id}': {e}")
    return None