"""
Hold Slayer Service — The main event.

Navigate IVR trees, wait on hold, detect when a human picks up,
and transfer you in. This is the state machine that orchestrates
the entire hold-slaying process.

Two modes:
1. run_with_flow(): Follow a stored call flow tree (fast, reliable)
2. run_exploration(): No stored flow — listen, transcribe, and figure it out
"""

import asyncio
import logging
import re
import time
from typing import Optional

from config import Settings
from core.call_manager import CallManager
from core.sip_engine import SIPEngine
from models.call import ActiveCall, AudioClassification, CallStatus, ClassificationResult
from models.call_flow import ActionType, CallFlow, CallFlowStep
from models.events import EventType, GatewayEvent
from services.audio_classifier import AudioClassifier
from services.transcription import TranscriptionService

logger = logging.getLogger(__name__)

# LLM client is optional — imported at use time.
# None = not tried yet, False = tried and unavailable (do not retry).
_llm_client = None

# Byte budget for one second of call audio. The original inline comments
# equate 16000 * 2 * 3 bytes with "3 seconds"; presumably 16 kHz, 16-bit
# mono PCM — TODO confirm against SIPEngine.get_audio_stream().
_AUDIO_BYTES_PER_SECOND = 16000 * 2

# "press 1 for X" style options. The terminator is a lookahead so that the
# "press"/"dial" that ends one option is still available to start the next
# one; re.findall only returns non-overlapping matches.
_PRESS_FOR_RE = re.compile(
    r'(?:press|dial|say)\s+(\d+)\s+(?:for|to)\s+(.+?)(?=\.|,|press|dial|$)'
)

# "for X, press 1" style options (description first, digit second).
_FOR_PRESS_RE = re.compile(r'for\s+(.+?),?\s*(?:press|dial)\s+(\d+)')

# Keywords that map intents to IVR options. A group contributes to an
# option's score when one of its words appears in the caller's intent AND
# one of its words appears in the option description.
_INTENT_KEYWORDS = {
    "cancel": ["cancel", "close", "end", "terminate"],
    "dispute": ["dispute", "charge", "billing", "transaction", "statement"],
    "balance": ["balance", "account", "summary"],
    "agent": ["agent", "representative", "operator", "speak", "person", "human"],
    "payment": ["payment", "pay", "bill"],
    "card": ["card", "credit", "debit"],
    "fraud": ["fraud", "unauthorized", "stolen", "lost"],
    "transfer": ["transfer", "move", "send"],
}


def _get_llm():
    """
    Lazy-load the LLM client (optional dependency).

    The client is built on first use from ``config.get_settings().llm``.
    Any failure (missing module, bad settings, constructor error) is logged
    at debug level and remembered, so later calls do not retry.

    Returns:
        The cached LLM client, or None if it could not be created.
    """
    global _llm_client
    if _llm_client is None:
        try:
            from config import get_settings
            from services.llm_client import LLMClient

            settings = get_settings()
            _llm_client = LLMClient(
                base_url=settings.llm.base_url,
                model=settings.llm.model,
                api_key=settings.llm.api_key,
                timeout=settings.llm.timeout,
            )
        except Exception as e:
            logger.debug(f"LLM client not available: {e}")
            _llm_client = False  # Sentinel: don't retry
    return _llm_client if _llm_client is not False else None


class HoldSlayerService:
    """
    The Hold Slayer.

    Navigates IVR menus, waits on hold, detects live humans,
    and transfers the call to your device.
    """

    def __init__(
        self,
        gateway,  # AIPSTNGateway (avoid circular import)
        call_manager: CallManager,
        sip_engine: SIPEngine,
        classifier: AudioClassifier,
        transcription: TranscriptionService,
        settings: Settings,
    ):
        """Store the collaborators; no I/O happens here."""
        self.gateway = gateway
        self.call_manager = call_manager
        self.sip_engine = sip_engine
        self.classifier = classifier
        self.transcription = transcription
        self.settings = settings

    async def run(
        self,
        call: ActiveCall,
        sip_leg_id: str,
        call_flow_id: Optional[str] = None,
    ) -> bool:
        """
        Main entry point. Run the Hold Slayer on a call.

        Waits (up to 60s) for the call to connect, then follows the stored
        flow if one is given and can be loaded, otherwise explores.

        Args:
            call: The active call to work on
            sip_leg_id: SIP leg ID for the PSTN call
            call_flow_id: Optional stored call flow to follow

        Returns:
            True if successfully transferred to user, False otherwise.
            Cancellation and unexpected errors are caught here and also
            reported as False; on an unexpected error the call status is
            set to FAILED.
        """
        logger.info(f"πŸ—‘οΈ Hold Slayer activated for {call.remote_number}")
        logger.info(f" Intent: {call.intent}")
        logger.info(f" Call Flow: {call_flow_id or 'exploration mode'}")

        try:
            # Wait for call to be connected
            await self._wait_for_connection(call, timeout=60)

            if call_flow_id:
                # Load the stored call flow from the database
                flow = await self._load_call_flow(call_flow_id)
                if flow:
                    return await self.run_with_flow(call, sip_leg_id, flow)
                else:
                    logger.warning(f"Call flow '{call_flow_id}' not found, switching to exploration")

            # No flow or flow not found — explore
            return await self.run_exploration(call, sip_leg_id)

        except asyncio.CancelledError:
            logger.info(f"Hold Slayer cancelled for {call.id}")
            return False
        except Exception as e:
            logger.error(f"Hold Slayer error: {e}", exc_info=True)
            await self.call_manager.update_status(call.id, CallStatus.FAILED)
            return False

    # ================================================================
    # Mode 1: Follow a Stored Call Flow
    # ================================================================

    async def run_with_flow(
        self,
        call: ActiveCall,
        sip_leg_id: str,
        flow: CallFlow,
    ) -> bool:
        """
        Navigate using a stored call flow tree.

        Starts at the first step of the flow and follows ``next_step`` /
        ``fallback_step`` links until a TRANSFER step is executed, a HOLD
        step times out, a step id cannot be resolved, or the chain ends.

        Args:
            call: The active call; ``current_step_id`` is updated as we go
            sip_leg_id: SIP leg ID used for audio and DTMF
            flow: The stored call flow to follow

        Returns:
            True if a TRANSFER step was executed, False otherwise.
        """
        logger.info(f"πŸ“‹ Following call flow: {flow.name}")
        steps = flow.steps_by_id()
        current_step_id = flow.steps[0].id if flow.steps else None

        while current_step_id:
            step = steps.get(current_step_id)
            if not step:
                logger.error(f"Step '{current_step_id}' not found in flow")
                break

            call.current_step_id = current_step_id
            logger.info(f"πŸ“ Step: {step.description}")

            await self.call_manager.event_bus.publish(GatewayEvent(
                type=EventType.IVR_STEP,
                call_id=call.id,
                data={"step_id": step.id, "description": step.description, "action": step.action.value},
                message=f"πŸ“ IVR Step: {step.description}",
            ))

            # === Execute the step based on its action type ===

            if step.action == ActionType.HOLD:
                # HOLD MODE: Audio classifier takes over
                await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)
                logger.info(f"⏳ On hold. Activating hold detection...")

                human_detected = await self._wait_for_human(
                    call, sip_leg_id, timeout=step.timeout
                )

                if human_detected:
                    current_step_id = step.next_step
                else:
                    logger.warning("⏰ Hold timeout reached!")
                    break

            elif step.action == ActionType.DTMF:
                # Wait for the expected prompt, then send DTMF
                await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)

                if step.expect:
                    heard = await self._wait_for_prompt(
                        call, sip_leg_id, step.expect, step.timeout
                    )
                    if not heard and step.fallback_step:
                        logger.info(f"⚠️ Didn't hear expected prompt, falling back")
                        current_step_id = step.fallback_step
                        continue

                # Send the DTMF digits
                if step.action_value:
                    await self.sip_engine.send_dtmf(sip_leg_id, step.action_value)
                    logger.info(f"πŸ“± Pressed: {step.action_value}")

                    await self.call_manager.event_bus.publish(GatewayEvent(
                        type=EventType.IVR_DTMF_SENT,
                        call_id=call.id,
                        data={"digits": step.action_value, "step": step.id},
                        message=f"πŸ“± DTMF sent: {step.action_value}",
                    ))

                # Small delay after DTMF for the IVR to process
                await asyncio.sleep(2.0)
                current_step_id = step.next_step

            elif step.action == ActionType.WAIT:
                # Just wait for a prompt (or, with no prompt, for the timeout)
                if step.expect:
                    await self._wait_for_prompt(
                        call, sip_leg_id, step.expect, step.timeout
                    )
                else:
                    await asyncio.sleep(step.timeout)
                current_step_id = step.next_step

            elif step.action == ActionType.LISTEN:
                # Listen and decide — regex first, LLM fallback
                await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)

                transcript = await self._listen_for_menu(
                    call, sip_leg_id, step.timeout
                )

                # Phase 1: Try regex-based keyword matching (fast, no API call)
                decision = self._decide_menu_option(
                    transcript, call.intent or "", step.expect
                )

                # Phase 2: LLM fallback if regex couldn't decide
                if not decision and transcript:
                    llm = _get_llm()
                    if llm:
                        try:
                            logger.info("πŸ€– Regex inconclusive, asking LLM...")
                            llm_result = await llm.analyze_ivr_menu(
                                transcript=transcript,
                                intent=call.intent or "",
                                previous_selections=list(call.dtmf_history) if hasattr(call, 'dtmf_history') else None,
                            )
                            decision = llm_result.get("digit")
                            if decision:
                                confidence = llm_result.get("confidence", 0)
                                reason = llm_result.get("reason", "")
                                logger.info(
                                    f"πŸ€– LLM decided: press {decision} "
                                    f"(confidence={confidence}, reason='{reason}')"
                                )
                        except Exception as e:
                            # Best effort: an LLM failure must not abort the call
                            logger.warning(f"πŸ€– LLM fallback failed: {e}")

                if decision:
                    await self.sip_engine.send_dtmf(sip_leg_id, decision)
                    logger.info(f"🧠 Decided: press {decision} (heard: '{transcript[:60]}...')")
                else:
                    # Default: press 0 for agent
                    await self.sip_engine.send_dtmf(sip_leg_id, "0")
                    logger.info(f"🧠 No clear match, pressing 0 for agent")

                await asyncio.sleep(2.0)
                current_step_id = step.next_step

            elif step.action == ActionType.SPEAK:
                # Say something into the call (TTS)
                # TODO: Implement TTS integration
                logger.info(f"πŸ—£οΈ Would say: '{step.action_value}' (TTS not yet implemented)")
                await asyncio.sleep(3.0)
                current_step_id = step.next_step

            elif step.action == ActionType.TRANSFER:
                # We did it! Transfer to user's device
                await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
                # Resolve the target before logging so the log names the
                # device actually used, not a possibly-empty action_value.
                device_target = step.action_value or call.device or self.settings.hold_slayer.default_transfer_device
                logger.info(f"🚨 TRANSFERRING TO {device_target}")
                await self.gateway.transfer_call(call.id, device_target)
                return True

            else:
                logger.warning(f"Unknown action type: {step.action}")
                current_step_id = step.next_step

        return False

    # ================================================================
    # Mode 2: Exploration (No Stored Flow)
    # ================================================================

    async def run_exploration(
        self,
        call: ActiveCall,
        sip_leg_id: str,
    ) -> bool:
        """
        No stored flow — explore the IVR blind.

        Repeatedly reads ~3 seconds of audio, classifies it, transcribes
        speech-like audio, and reacts: transfer on a live human (or on a
        hold-to-human transition while music plays), press a digit on an
        IVR prompt, and wait on silence or ringing. Each observation is
        appended to an in-memory list of discovered steps; this method only
        logs its length, it does not persist it.

        Args:
            call: The active call to explore
            sip_leg_id: SIP leg ID used for audio and DTMF

        Returns:
            True if the call was transferred, False if the call ended or
            ``settings.hold_slayer.max_hold_time`` seconds elapsed first.
        """
        logger.info(f"πŸ” Exploration mode: discovering IVR for {call.remote_number}")
        await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)

        discovered_steps: list[dict] = []
        max_time = self.settings.hold_slayer.max_hold_time
        start_time = time.time()

        while time.time() - start_time < max_time:
            # Check if call is still active
            current_call = self.call_manager.get_call(call.id)
            if not current_call or current_call.status in (
                CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
            ):
                # Return directly: falling through to the timeout warning
                # below would misreport an ended call as a timeout.
                logger.info(f"Call {call.id} ended during exploration")
                return False

            # Get audio and classify
            try:
                audio_chunk = await self._read_audio(sip_leg_id, 3)  # 3 seconds
            except Exception as e:
                logger.error(f"Audio stream error: {e}")
                await asyncio.sleep(1.0)
                continue

            if not audio_chunk:
                await asyncio.sleep(1.0)
                continue

            # Classify the audio
            classification = self.classifier.classify_chunk(audio_chunk)
            self.classifier.update_history(classification.audio_type)
            await self.call_manager.add_classification(call.id, classification)

            # Transcribe if it sounds like speech
            transcript = ""
            if classification.audio_type in (
                AudioClassification.IVR_PROMPT,
                AudioClassification.LIVE_HUMAN,
            ):
                transcript = await self.transcription.transcribe(
                    audio_chunk,
                    prompt="Phone IVR menu, customer service, press 1 for..."
                )
                if transcript:
                    await self.call_manager.add_transcript(call.id, transcript)

            # Record discovery
            discovered_steps.append({
                "timestamp": time.time(),
                "audio_type": classification.audio_type.value,
                "confidence": classification.confidence,
                "transcript": transcript,
                "action_taken": None,
            })

            # === Decision Logic ===

            if classification.audio_type == AudioClassification.LIVE_HUMAN:
                # HUMAN DETECTED! Transfer!
                logger.info("🚨 LIVE HUMAN DETECTED!")
                await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)

                device = call.device or self.settings.hold_slayer.default_transfer_device
                await self.gateway.transfer_call(call.id, device)

                logger.info(f"πŸ“‹ Discovered {len(discovered_steps)} IVR steps")
                return True

            elif classification.audio_type == AudioClassification.MUSIC:
                # On hold — just keep monitoring
                if current_call.status != CallStatus.ON_HOLD:
                    await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)

                # Check for hold→human transition
                if self.classifier.detect_hold_to_human_transition():
                    logger.info("🚨 Hold-to-human transition detected!")
                    await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)

                    device = call.device or self.settings.hold_slayer.default_transfer_device
                    await self.gateway.transfer_call(call.id, device)
                    return True

            elif classification.audio_type == AudioClassification.IVR_PROMPT and transcript:
                # IVR menu — try to navigate
                decision = self._decide_menu_option(
                    transcript, call.intent or "", None
                )
                if decision:
                    await self.sip_engine.send_dtmf(sip_leg_id, decision)
                    discovered_steps[-1]["action_taken"] = {"dtmf": decision}
                    logger.info(f"🧠 Exploration: pressed {decision}")
                    await asyncio.sleep(2.0)
                else:
                    # Try pressing 0 for agent
                    await self.sip_engine.send_dtmf(sip_leg_id, "0")
                    discovered_steps[-1]["action_taken"] = {"dtmf": "0", "reason": "default_agent"}
                    logger.info("🧠 Exploration: pressed 0 (trying for agent)")
                    await asyncio.sleep(2.0)

            elif classification.audio_type == AudioClassification.SILENCE:
                # Silence — wait a bit
                await asyncio.sleep(2.0)

            elif classification.audio_type == AudioClassification.RINGING:
                # Still ringing
                await asyncio.sleep(1.0)

        logger.warning(f"Hold Slayer timed out after {max_time}s")
        return False

    # ================================================================
    # Core Detection Methods
    # ================================================================

    async def _read_audio(self, sip_leg_id: str, seconds: float) -> bytes:
        """
        Collect roughly ``seconds`` worth of audio from the SIP leg.

        Reads chunks from ``sip_engine.get_audio_stream`` until at least
        ``int(_AUDIO_BYTES_PER_SECOND * seconds)`` bytes have arrived or the
        stream ends, whichever comes first.

        Args:
            sip_leg_id: SIP leg to read from
            seconds: Amount of audio to gather, in seconds

        Returns:
            The concatenated audio; may be empty or shorter than requested
            if the stream ended early.

        Raises:
            Whatever the audio stream raises — callers decide how to react.
        """
        min_bytes = int(_AUDIO_BYTES_PER_SECOND * seconds)
        parts: list[bytes] = []
        received = 0
        async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
            parts.append(chunk)
            received += len(chunk)
            if received >= min_bytes:
                break
        # Join once instead of repeated `bytes +=`, which copies every time.
        return b"".join(parts)

    async def _wait_for_human(
        self,
        call: ActiveCall,
        sip_leg_id: str,
        timeout: int = 7200,
    ) -> bool:
        """
        Wait on hold until a live human is detected.

        Continuously classifies audio and watches for the
        music → speech transition.

        Args:
            call: The call being held
            sip_leg_id: SIP leg to read audio from
            timeout: Maximum time to wait, in seconds

        Returns:
            True if a human was detected (a LIVE_HUMAN chunk whose
            transcript has at least 3 words, or a hold-to-human transition
            reported by the classifier); False if the call ended or the
            timeout elapsed.
        """
        check_interval = self.settings.hold_slayer.hold_check_interval
        start_time = time.time()
        next_progress_log = 60  # seconds of hold time at which to log next

        while time.time() - start_time < timeout:
            # Check if call is still active
            current_call = self.call_manager.get_call(call.id)
            if not current_call or current_call.status in (
                CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
            ):
                return False

            # Get audio chunk
            try:
                audio_chunk = await self._read_audio(sip_leg_id, check_interval)
            except Exception:
                # Deliberate best effort: a stream hiccup should not end the hold
                await asyncio.sleep(check_interval)
                continue

            if not audio_chunk:
                await asyncio.sleep(check_interval)
                continue

            # Classify
            result = self.classifier.classify_chunk(audio_chunk)
            self.classifier.update_history(result.audio_type)
            await self.call_manager.add_classification(call.id, result)

            # Check for human
            if result.audio_type == AudioClassification.LIVE_HUMAN:
                # Verify with transcription
                transcript = await self.transcription.transcribe(audio_chunk)
                if transcript:
                    await self.call_manager.add_transcript(call.id, transcript)
                    # If we got meaningful speech, it's probably a real person
                    if len(transcript.split()) >= 3:
                        logger.info(f"🚨 Human confirmed! Said: '{transcript[:100]}'")
                        return True

            # Check for the music→speech transition pattern
            if self.classifier.detect_hold_to_human_transition():
                logger.info("🚨 Hold-to-human transition detected!")
                return True

            # Log progress about once a minute. Tracking the next boundary
            # (instead of testing `elapsed % 60 == 0`) means a log is not
            # skipped when no iteration lands exactly on a multiple of 60.
            elapsed = int(time.time() - start_time)
            if elapsed >= next_progress_log:
                logger.info(
                    f"⏳ Still on hold... {elapsed}s "
                    f"(audio: {result.audio_type.value}, {result.confidence:.0%})"
                )
                next_progress_log = (elapsed // 60 + 1) * 60

        return False

    async def _wait_for_prompt(
        self,
        call: ActiveCall,
        sip_leg_id: str,
        expected_pattern: str,
        timeout: int = 30,
    ) -> bool:
        """
        Wait for an expected IVR prompt.

        Listens in ~3 second chunks, transcribes speech-like audio, and
        checks whether the transcript matches the expected pattern.

        Args:
            call: The call being navigated (transcripts are recorded on it)
            sip_leg_id: SIP leg to read audio from
            expected_pattern: Case-insensitive regex; if it is not a valid
                regex it is treated as a case-insensitive substring
            timeout: Maximum time to wait, in seconds

        Returns:
            True if a matching transcript was heard, False on timeout.
        """
        # Compile once; an invalid regex falls back to keyword search.
        try:
            compiled = re.compile(expected_pattern, re.IGNORECASE)
        except re.error:
            compiled = None
        needle = expected_pattern.lower()

        start_time = time.time()

        while time.time() - start_time < timeout:
            try:
                audio_chunk = await self._read_audio(sip_leg_id, 3)  # 3 seconds
            except Exception:
                # Deliberate best effort: retry after a short pause
                await asyncio.sleep(1.0)
                continue

            if not audio_chunk:
                await asyncio.sleep(1.0)
                continue

            # Classify first
            result = self.classifier.classify_chunk(audio_chunk)
            if result.audio_type not in (
                AudioClassification.IVR_PROMPT,
                AudioClassification.LIVE_HUMAN,
            ):
                continue

            # Transcribe
            transcript = await self.transcription.transcribe(audio_chunk)
            if not transcript:
                continue

            await self.call_manager.add_transcript(call.id, transcript)

            # Check if it matches expected pattern
            if compiled is not None:
                matched = compiled.search(transcript) is not None
            else:
                matched = needle in transcript.lower()

            if matched:
                logger.info(f"βœ… Heard expected: '{transcript[:80]}'")
                return True

        logger.warning(f"⚠️ Didn't hear expected prompt within {timeout}s")
        return False

    async def _listen_for_menu(
        self,
        call: ActiveCall,
        sip_leg_id: str,
        timeout: int = 30,
    ) -> str:
        """
        Listen for an IVR menu and return the full transcript.

        Reads ~5 second chunks and transcribes those classified as an IVR
        prompt or live human. Stops at the timeout, when the stream yields
        no audio, or when silence follows already-captured speech.

        Args:
            call: The call being navigated (the transcript is recorded on it)
            sip_leg_id: SIP leg to read audio from
            timeout: Maximum time to listen, in seconds

        Returns:
            The transcribed pieces joined by spaces; "" if nothing was heard.
        """
        transcript_parts: list[str] = []
        start_time = time.time()

        while time.time() - start_time < timeout:
            try:
                audio_chunk = await self._read_audio(sip_leg_id, 5)  # 5 seconds
            except Exception:
                # Deliberate best effort: retry after a short pause
                await asyncio.sleep(1.0)
                continue

            if not audio_chunk:
                break

            result = self.classifier.classify_chunk(audio_chunk)

            # If we're getting silence after speech, the menu prompt is done
            if result.audio_type == AudioClassification.SILENCE and transcript_parts:
                break

            if result.audio_type in (
                AudioClassification.IVR_PROMPT,
                AudioClassification.LIVE_HUMAN,
            ):
                text = await self.transcription.transcribe(audio_chunk)
                if text:
                    transcript_parts.append(text)

        full_transcript = " ".join(transcript_parts)
        if full_transcript:
            await self.call_manager.add_transcript(call.id, full_transcript)

        return full_transcript

    async def _wait_for_connection(self, call: ActiveCall, timeout: int = 60) -> None:
        """
        Wait for the call to be connected (answered).

        Polls the call manager every 0.5 seconds.

        Args:
            call: The call to wait for
            timeout: Maximum time to wait, in seconds

        Raises:
            RuntimeError: If the call is no longer known to the call
                manager, or its status becomes FAILED or CANCELLED.
            TimeoutError: If the call is not CONNECTED / NAVIGATING_IVR
                within ``timeout`` seconds.
        """
        start = time.time()
        while time.time() - start < timeout:
            current = self.call_manager.get_call(call.id)
            if not current:
                raise RuntimeError(f"Call {call.id} disappeared")
            if current.status in (CallStatus.CONNECTED, CallStatus.NAVIGATING_IVR):
                return
            if current.status in (CallStatus.FAILED, CallStatus.CANCELLED):
                raise RuntimeError(f"Call {call.id} failed: {current.status}")
            await asyncio.sleep(0.5)
        raise TimeoutError(f"Call {call.id} not connected within {timeout}s")

    # ================================================================
    # Menu Navigation Logic
    # ================================================================

    def _decide_menu_option(
        self,
        transcript: str,
        intent: str,
        expected_options: Optional[str],
    ) -> Optional[str]:
        """
        Decide which menu option to select based on transcript and intent.

        Simple keyword-based matching. This is where an LLM integration
        would massively improve navigation accuracy.

        Options are extracted from "press 1 for X" phrasing (or, failing
        that, "for X, press 1"). Each option is scored against the intent:
        +10 per keyword group shared by intent and description, +3 per
        word the two have in common, +5 if the description mentions an
        agent-like word. The best option wins if it scores at least 3;
        otherwise the first option mentioning an agent is used.

        Args:
            transcript: What the IVR said
            intent: What the caller wants to do
            expected_options: Currently unused; kept for interface
                compatibility

        Returns:
            DTMF digit(s) to press, or None if can't decide
        """
        transcript_lower = transcript.lower()
        intent_lower = intent.lower()

        # Common IVR patterns: "press 1 for X, press 2 for Y"
        options = _PRESS_FOR_RE.findall(transcript_lower)

        if not options:
            # Try alternate patterns: "for X, press 1"
            # Swap order to be (digit, description)
            options = [
                (digit, desc)
                for desc, digit in _FOR_PRESS_RE.findall(transcript_lower)
            ]

        if not options:
            return None

        # Score each option against the intent
        best_match = None
        best_score = 0
        intent_words = set(intent_lower.split())

        for digit, description in options:
            score = 0

            # Direct keyword match in description
            for keywords in _INTENT_KEYWORDS.values():
                if any(kw in intent_lower for kw in keywords) and any(
                    kw in description for kw in keywords
                ):
                    score += 10

            # Fuzzy: any word overlap between intent and description
            overlap = intent_words & set(description.split())
            score += len(overlap) * 3

            # "Speak to agent" is usually what we want if nothing else matches
            if any(w in description for w in ["agent", "representative", "operator", "person"]):
                score += 5

            if score > best_score:
                best_score = score
                best_match = digit

        if best_match and best_score >= 3:
            return best_match

        # Default: look for "agent" or "representative" option
        for digit, description in options:
            if any(w in description for w in ["agent", "representative", "operator"]):
                return digit

        return None

    async def _load_call_flow(self, flow_id: str) -> Optional[CallFlow]:
        """
        Load a stored call flow from the database.

        Args:
            flow_id: Primary key of the stored flow

        Returns:
            The reconstructed CallFlow, or None if no row matches or the
            lookup/conversion fails (the failure is logged).
        """
        # Imported here rather than at module level, as in the original.
        from db.database import get_session_factory, StoredCallFlow
        from sqlalchemy import select

        try:
            factory = get_session_factory()
            async with factory() as session:
                result = await session.execute(
                    select(StoredCallFlow).where(StoredCallFlow.id == flow_id)
                )
                row = result.scalar_one_or_none()
                if row:
                    return CallFlow(
                        id=row.id,
                        name=row.name,
                        phone_number=row.phone_number,
                        description=row.description or "",
                        steps=[CallFlowStep(**s) for s in row.steps],
                        tags=row.tags or [],
                        notes=row.notes,
                        avg_hold_time=row.avg_hold_time,
                        success_rate=row.success_rate,
                        last_used=row.last_used,
                        times_used=row.times_used or 0,
                    )
        except Exception as e:
            logger.error(f"Failed to load call flow '{flow_id}': {e}")
        return None