feat: add initial Hold Slayer AI telephony gateway implementation
Complete project scaffolding and core implementation of an AI-powered telephony system that calls companies, navigates IVR menus, waits on hold, and transfers to the user when a human answers. Key components: - FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces - SIP/VoIP call management via PJSUA2 with RTP audio streaming - LLM-powered IVR navigation using OpenAI/Anthropic with tool calling - Hold detection service combining audio analysis and silence detection - Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines - Call recording with per-channel and mixed audio capture - Event bus (asyncio pub/sub) for real-time client updates - Web dashboard with live call monitoring - SQLite persistence via SQLAlchemy with call history and analytics - Notification support (email, SMS, webhook, desktop) - Docker Compose deployment with Opal VoIP and Opal Media containers - Comprehensive test suite with unit, integration, and E2E tests - Simplified .gitignore and full project documentation in README
This commit is contained in:
1
services/__init__.py
Normal file
1
services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""AI services — hold detection, transcription, classification, and more."""
|
||||
444
services/audio_classifier.py
Normal file
444
services/audio_classifier.py
Normal file
@@ -0,0 +1,444 @@
|
||||
"""
|
||||
Audio Classifier — Spectral analysis for hold music, speech, and silence detection.
|
||||
|
||||
This is the brain of the Hold Slayer. It analyzes audio in real-time to determine:
|
||||
- Is this hold music?
|
||||
- Is this an IVR prompt (automated voice)?
|
||||
- Is this a live human?
|
||||
- Is this silence?
|
||||
- Is this a ring-back tone?
|
||||
|
||||
Uses spectral analysis (librosa/numpy) to classify audio without needing
|
||||
a trained ML model — just signal processing and heuristics.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from config import ClassifierSettings
|
||||
from models.call import AudioClassification, ClassificationResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Audio constants
|
||||
SAMPLE_RATE = 16000 # 16kHz mono
|
||||
FRAME_SIZE = SAMPLE_RATE * 2 # 16-bit samples = 2 bytes per sample
|
||||
|
||||
|
||||
class AudioClassifier:
    """
    Real-time audio classifier using spectral analysis.

    Classification strategy:
    - Silence: Low RMS energy
    - Music: High spectral flatness + sustained tonal content + rhythm
    - IVR prompt: Speech-like spectral envelope but repetitive/synthetic
    - Live human: Speech-like spectral envelope + natural variation
    - Ringing: Very tonal, specific frequencies (~440Hz, ~480Hz for NA ring)
    - DTMF: Dual-tone detection at known DTMF frequencies
    """

    def __init__(self, settings: ClassifierSettings):
        self.settings = settings
        # NOTE(review): _window_buffer/_window_samples are not consumed by any
        # method in this class — presumably reserved for future windowed
        # analysis. Kept for interface compatibility; confirm before removing.
        self._window_buffer: list[bytes] = []
        self._window_samples = int(settings.window_seconds * SAMPLE_RATE)
        # Rolling log of past classifications, newest last (see update_history).
        self._classification_history: list[AudioClassification] = []

    def classify_chunk(self, audio_data: bytes) -> ClassificationResult:
        """
        Classify a chunk of audio data.

        Args:
            audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)

        Returns:
            ClassificationResult with type and confidence
        """
        # Convert bytes to numpy array
        samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)

        # An empty chunk carries no information — report silence with full
        # confidence rather than crashing downstream feature extractors.
        if len(samples) == 0:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.SILENCE,
                confidence=1.0,
            )

        # Normalize to [-1.0, 1.0]
        samples = samples / 32768.0

        # Run all detectors
        rms = self._compute_rms(samples)
        spectral_flatness = self._compute_spectral_flatness(samples)
        zcr = self._compute_zero_crossing_rate(samples)
        dominant_freq = self._compute_dominant_frequency(samples)
        spectral_centroid = self._compute_spectral_centroid(samples)
        is_tonal = self._detect_tonality(samples)

        # Build feature dict for debugging
        features = {
            "rms": float(rms),
            "spectral_flatness": float(spectral_flatness),
            "zcr": float(zcr),
            "dominant_freq": float(dominant_freq),
            "spectral_centroid": float(spectral_centroid),
            "is_tonal": is_tonal,
        }

        # === Classification Logic ===

        # 1. Silence detection
        if rms < 0.01:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.SILENCE,
                confidence=min(1.0, (0.01 - rms) / 0.01 + 0.5),
                details=features,
            )

        # 2. DTMF detection (very specific dual-tone pattern)
        dtmf_result = self._detect_dtmf(samples)
        if dtmf_result:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.DTMF,
                confidence=0.95,
                details={**features, "dtmf_digit": dtmf_result},
            )

        # 3. Ring-back tone detection (440+480Hz in NA, periodic on/off)
        if is_tonal and 400 < dominant_freq < 520 and rms > 0.02:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.RINGING,
                confidence=0.8,
                details=features,
            )

        # 4. Music vs Speech discrimination
        # Music: higher spectral flatness, more tonal, wider spectral spread
        # Speech: lower spectral flatness, concentrated energy, variable ZCR
        music_score = self._compute_music_score(
            spectral_flatness, is_tonal, spectral_centroid, zcr, rms
        )
        speech_score = self._compute_speech_score(
            spectral_flatness, zcr, spectral_centroid, rms
        )

        # 5. If it's speech-like, is it live or automated?
        if speech_score > music_score:
            # Use history to distinguish live human from IVR
            # IVR: repetitive patterns, synthetic prosody
            # Human: natural variation, conversational rhythm
            if self._looks_like_live_human(speech_score, zcr, rms):
                return ClassificationResult(
                    timestamp=time.time(),
                    audio_type=AudioClassification.LIVE_HUMAN,
                    confidence=speech_score,
                    details=features,
                )
            else:
                return ClassificationResult(
                    timestamp=time.time(),
                    audio_type=AudioClassification.IVR_PROMPT,
                    confidence=speech_score * 0.8,
                    details=features,
                )

        # 6. Music (hold music)
        if music_score >= self.settings.music_threshold:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.MUSIC,
                confidence=music_score,
                details=features,
            )

        # 7. Unknown / low confidence
        return ClassificationResult(
            timestamp=time.time(),
            audio_type=AudioClassification.UNKNOWN,
            confidence=max(music_score, speech_score),
            details=features,
        )

    # ================================================================
    # Feature Extraction
    # ================================================================

    @staticmethod
    def _compute_rms(samples: np.ndarray) -> float:
        """Root Mean Square — overall energy level."""
        return float(np.sqrt(np.mean(samples ** 2)))

    @staticmethod
    def _compute_spectral_flatness(samples: np.ndarray) -> float:
        """
        Spectral flatness (Wiener entropy).

        Close to 1.0 = noise-like (white noise)
        Close to 0.0 = tonal (pure tone, music)
        Speech is typically 0.1-0.4, music 0.05-0.3
        """
        fft = np.abs(np.fft.rfft(samples))
        fft = fft[fft > 0]  # Avoid log(0)

        if len(fft) == 0:
            return 0.0

        # Flatness = geometric mean / arithmetic mean of the magnitude spectrum.
        geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
        arithmetic_mean = np.mean(fft)

        if arithmetic_mean == 0:
            return 0.0

        return float(geometric_mean / arithmetic_mean)

    @staticmethod
    def _compute_zero_crossing_rate(samples: np.ndarray) -> float:
        """
        Zero-crossing rate — how often the signal crosses zero.

        Higher for unvoiced speech and noise.
        Lower for voiced speech and tonal music.
        """
        crossings = np.sum(np.abs(np.diff(np.sign(samples)))) / 2
        return float(crossings / len(samples))

    @staticmethod
    def _compute_dominant_frequency(samples: np.ndarray) -> float:
        """Find the dominant frequency in the signal."""
        fft = np.abs(np.fft.rfft(samples))
        freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)

        # Ignore DC and very low frequencies
        mask = freqs > 50
        if not np.any(mask):
            return 0.0

        fft_masked = fft[mask]
        freqs_masked = freqs[mask]

        return float(freqs_masked[np.argmax(fft_masked)])

    @staticmethod
    def _compute_spectral_centroid(samples: np.ndarray) -> float:
        """
        Spectral centroid — "center of mass" of the spectrum.

        Higher for bright/treble sounds, lower for bass-heavy sounds.
        Speech typically 500-4000Hz, music varies widely.
        """
        fft = np.abs(np.fft.rfft(samples))
        freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)

        total_energy = np.sum(fft)
        if total_energy == 0:
            return 0.0

        return float(np.sum(freqs * fft) / total_energy)

    @staticmethod
    def _detect_tonality(samples: np.ndarray) -> bool:
        """
        Check if the signal is strongly tonal (has clear pitch).
        Uses autocorrelation.
        """
        # Autocorrelation (keep only non-negative lags)
        correlation = np.correlate(samples, samples, mode="full")
        correlation = correlation[len(correlation) // 2:]

        # Normalize by the zero-lag value
        if correlation[0] == 0:
            return False
        correlation = correlation / correlation[0]

        # Look for a strong peak (indicating periodicity) within the lag
        # range corresponding to pitches between 50Hz and 1000Hz.
        min_lag = int(SAMPLE_RATE / 1000)  # ~16 samples (1000Hz max)
        max_lag = int(SAMPLE_RATE / 50)    # ~320 samples (50Hz min)

        search_region = correlation[min_lag:max_lag]
        if len(search_region) == 0:
            return False

        peak_value = np.max(search_region)
        return bool(peak_value > 0.5)

    def _detect_dtmf(self, samples: np.ndarray) -> Optional[str]:
        """
        Detect DTMF tones via per-frequency DFT-bin power (Goertzel-equivalent).

        DTMF frequencies:
        697, 770, 852, 941 Hz (row)
        1209, 1336, 1477, 1633 Hz (column)

        Returns:
            The detected digit ("0"-"9", "*", "#", "A"-"D") or None.
        """
        dtmf_freqs_low = [697, 770, 852, 941]
        dtmf_freqs_high = [1209, 1336, 1477, 1633]
        dtmf_map = {
            (697, 1209): "1", (697, 1336): "2", (697, 1477): "3", (697, 1633): "A",
            (770, 1209): "4", (770, 1336): "5", (770, 1477): "6", (770, 1633): "B",
            (852, 1209): "7", (852, 1336): "8", (852, 1477): "9", (852, 1633): "C",
            (941, 1209): "*", (941, 1336): "0", (941, 1477): "#", (941, 1633): "D",
        }

        n = len(samples)
        indices = np.arange(n)

        def goertzel_power(freq: int) -> float:
            # Goertzel's final s1² + s2² − coeff·s1·s2 equals |X_k|², the
            # squared magnitude of DFT bin k. Computing the bin directly with
            # one vectorized dot product gives the same value without the
            # per-sample Python loop (8 freqs × chunk length previously).
            k = int(0.5 + n * freq / SAMPLE_RATE)
            w = 2 * np.pi * k / n
            bin_value = np.dot(samples, np.exp(-1j * w * indices))
            return float(np.abs(bin_value) ** 2)

        # Find strongest low and high frequencies
        low_powers = [(f, goertzel_power(f)) for f in dtmf_freqs_low]
        high_powers = [(f, goertzel_power(f)) for f in dtmf_freqs_high]

        best_low = max(low_powers, key=lambda x: x[1])
        best_high = max(high_powers, key=lambda x: x[1])

        # Threshold: both frequencies must be significantly present
        total_power = np.sum(samples ** 2)
        if total_power == 0:
            return None

        threshold = total_power * 0.1
        if best_low[1] > threshold and best_high[1] > threshold:
            key = (best_low[0], best_high[0])
            return dtmf_map.get(key)

        return None

    # ================================================================
    # Higher-Level Classification
    # ================================================================

    def _compute_music_score(
        self,
        spectral_flatness: float,
        is_tonal: bool,
        spectral_centroid: float,
        zcr: float,
        rms: float,
    ) -> float:
        """Compute a music likelihood score (0.0 - 1.0)."""
        score = 0.0

        # Music tends to be tonal
        if is_tonal:
            score += 0.3

        # Music has moderate spectral flatness (more than pure tone, less than noise)
        if 0.05 < spectral_flatness < 0.4:
            score += 0.2

        # Music has sustained energy
        if rms > 0.03:
            score += 0.15

        # Music has wider spectral content than speech
        if spectral_centroid > 1500:
            score += 0.15

        # Music tends to have lower ZCR than noise
        if zcr < 0.15:
            score += 0.2

        return min(1.0, score)

    def _compute_speech_score(
        self,
        spectral_flatness: float,
        zcr: float,
        spectral_centroid: float,
        rms: float,
    ) -> float:
        """Compute a speech likelihood score (0.0 - 1.0)."""
        score = 0.0

        # Speech has moderate spectral flatness
        if 0.1 < spectral_flatness < 0.5:
            score += 0.25

        # Speech centroid typically 500-4000 Hz
        if 500 < spectral_centroid < 4000:
            score += 0.25

        # Speech has moderate ZCR
        if 0.02 < zcr < 0.2:
            score += 0.25

        # Speech has moderate energy
        if 0.01 < rms < 0.5:
            score += 0.25

        return min(1.0, score)

    def _looks_like_live_human(
        self,
        speech_score: float,
        zcr: float,
        rms: float,
    ) -> bool:
        """
        Distinguish live human from IVR/TTS.

        Heuristics:
        - IVR prompts are followed by silence (waiting for input)
        - Live humans have more natural variation in energy and pitch
        - After hold music → speech transition, it's likely a human

        This is the hardest classification and benefits most from
        the transcript context (Speaches STT).
        """
        # Look at recent classification history (empty slice if no history yet)
        recent = self._classification_history[-10:]

        # Key signal: if we were just listening to hold music and now
        # hear speech, it's very likely a live human agent.
        # (Membership in an empty slice is simply False, so no guard needed.)
        if AudioClassification.MUSIC in recent[-5:]:
            # Transition from music to speech = agent picked up!
            return True

        # High speech score with good energy = more likely human
        if speech_score > 0.7 and rms > 0.05:
            return True

        # Default: assume IVR until proven otherwise
        return False

    def update_history(self, classification: AudioClassification) -> None:
        """Track classification history for pattern detection."""
        self._classification_history.append(classification)
        # Keep last 100 classifications
        if len(self._classification_history) > 100:
            self._classification_history = self._classification_history[-100:]

    def detect_hold_to_human_transition(self) -> bool:
        """
        Detect the critical moment: hold music → live human.

        Looks for pattern: MUSIC, MUSIC, MUSIC, ..., SPEECH/LIVE_HUMAN
        """
        recent = self._classification_history[-20:]
        if len(recent) < 5:
            return False

        # Count recent music vs speech: music anywhere before the last 3
        # entries, speech within the last 3 entries.
        music_count = sum(1 for c in recent[:-3] if c == AudioClassification.MUSIC)
        speech_count = sum(
            1 for c in recent[-3:]
            if c in (AudioClassification.LIVE_HUMAN, AudioClassification.IVR_PROMPT)
        )

        # If we had a lot of music and now have speech, someone picked up
        return music_count >= 3 and speech_count >= 2
|
||||
324
services/call_analytics.py
Normal file
324
services/call_analytics.py
Normal file
@@ -0,0 +1,324 @@
|
||||
"""
|
||||
Call Analytics Service — Tracks call metrics and generates insights.
|
||||
|
||||
Monitors call patterns, hold times, success rates, and IVR navigation
|
||||
efficiency. Provides data for the dashboard and API.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Optional
|
||||
|
||||
from models.call import ActiveCall, AudioClassification, CallMode, CallStatus
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CallAnalytics:
    """
    In-memory call analytics engine.

    Tracks:
    - Call success/failure rates
    - Hold time statistics (avg, min, max, p95)
    - IVR navigation efficiency
    - Human detection accuracy
    - Per-number/company patterns
    - Time-of-day patterns

    In production, this would be backed by TimescaleDB or similar.
    For now, we keep rolling windows in memory.
    """

    def __init__(self, max_history: int = 10000):
        # Cap on retained records; the oldest are discarded once exceeded.
        self._max_history = max_history
        self._call_records: list[CallRecord] = []
        # Per-number aggregates; defaultdict creates fresh stats on first call.
        self._company_stats: dict[str, CompanyStats] = defaultdict(CompanyStats)

    # ================================================================
    # Record Calls
    # ================================================================

    def record_call(self, call: ActiveCall) -> None:
        """
        Record a completed call for analytics.

        Called when a call ends (from CallManager).
        """
        record = CallRecord(
            call_id=call.id,
            remote_number=call.remote_number,
            mode=call.mode,
            status=call.status,
            intent=call.intent,
            started_at=call.created_at,
            duration_seconds=call.duration,
            hold_time_seconds=call.hold_time,
            classification_history=[
                entry.audio_type.value for entry in call.classification_history
            ],
            transcript_chunks=list(call.transcript_chunks),
            services=list(call.services),
        )
        self._call_records.append(record)

        # Enforce the rolling window by dropping the oldest records.
        excess = len(self._call_records) - self._max_history
        if excess > 0:
            self._call_records = self._call_records[excess:]

        # Fold this record into the per-company aggregate.
        company_key = self._normalize_number(call.remote_number)
        self._company_stats[company_key].update(record)

        logger.debug(
            f"📊 Recorded call {call.id}: "
            f"{call.status.value}, {call.duration}s, hold={call.hold_time}s"
        )

    # ================================================================
    # Aggregate Stats
    # ================================================================

    def get_summary(self, hours: int = 24) -> dict[str, Any]:
        """Get summary statistics for the last N hours."""
        since = datetime.now() - timedelta(hours=hours)
        window = [rec for rec in self._call_records if rec.started_at >= since]

        # Short-circuit: no calls in the window → zeroed summary.
        if not window:
            return {
                "period_hours": hours,
                "total_calls": 0,
                "success_rate": 0.0,
                "avg_hold_time": 0.0,
                "avg_duration": 0.0,
            }

        ok_statuses = (
            CallStatus.COMPLETED, CallStatus.BRIDGED, CallStatus.HUMAN_DETECTED
        )
        total = len(window)
        successful = sum(1 for rec in window if rec.status in ok_statuses)
        failed = sum(1 for rec in window if rec.status == CallStatus.FAILED)

        # Only positive values contribute to the averages below.
        hold_times = [rec.hold_time_seconds for rec in window if rec.hold_time_seconds > 0]
        durations = [rec.duration_seconds for rec in window if rec.duration_seconds > 0]

        slayer_calls = [rec for rec in window if rec.mode == CallMode.HOLD_SLAYER]
        slayer_wins = sum(
            1 for rec in slayer_calls
            if rec.status in (CallStatus.BRIDGED, CallStatus.HUMAN_DETECTED)
        )

        def mean(values: list) -> float:
            # Rounded average, 0.0 when there is nothing to average.
            return round(sum(values) / len(values), 1) if values else 0.0

        return {
            "period_hours": hours,
            "total_calls": total,
            "successful": successful,
            "failed": failed,
            "success_rate": round(successful / total, 3) if total else 0.0,
            "avg_duration": mean(durations),
            "max_duration": max(durations) if durations else 0,
            "hold_time": {
                "avg": mean(hold_times),
                "min": min(hold_times) if hold_times else 0,
                "max": max(hold_times) if hold_times else 0,
                "p95": self._percentile(hold_times, 95) if hold_times else 0,
                "total": sum(hold_times),
            },
            "hold_slayer": {
                "total": len(slayer_calls),
                "success": slayer_wins,
                "success_rate": round(
                    slayer_wins / len(slayer_calls), 3
                ) if slayer_calls else 0.0,
            },
            "by_mode": self._group_by_mode(window),
            "by_hour": self._group_by_hour(window),
        }

    def get_company_stats(self, number: str) -> dict[str, Any]:
        """Get stats for a specific company/number."""
        # .get (not []) so an unknown number doesn't create an empty entry.
        stats = self._company_stats.get(self._normalize_number(number))
        if stats is None:
            return {"number": number, "total_calls": 0}
        return stats.to_dict(number)

    def get_top_numbers(self, limit: int = 10) -> list[dict[str, Any]]:
        """Get the most-called numbers with their stats."""
        ranked = sorted(
            self._company_stats.items(),
            key=lambda item: item[1].total_calls,
            reverse=True,
        )
        return [stats.to_dict(number) for number, stats in ranked[:limit]]

    # ================================================================
    # Hold Time Trends
    # ================================================================

    def get_hold_time_trend(
        self,
        number: Optional[str] = None,
        days: int = 7,
    ) -> list[dict]:
        """
        Get hold time trend data for graphing.

        Returns daily average hold times for the last N days.
        """
        since = datetime.now() - timedelta(days=days)
        selected = [rec for rec in self._call_records if rec.started_at >= since]

        if number:
            wanted = self._normalize_number(number)
            selected = [
                rec for rec in selected
                if self._normalize_number(rec.remote_number) == wanted
            ]

        # Bucket positive hold times by calendar day (YYYY-MM-DD).
        daily: dict[str, list[int]] = defaultdict(list)
        for rec in selected:
            if rec.hold_time_seconds > 0:
                daily[rec.started_at.strftime("%Y-%m-%d")].append(rec.hold_time_seconds)

        # Emit one point per day, oldest first, zero-filled for empty days.
        points = []
        for offset in range(days):
            day = (datetime.now() - timedelta(days=days - 1 - offset)).strftime("%Y-%m-%d")
            values = daily.get(day, [])
            points.append({
                "date": day,
                "avg_hold_time": round(sum(values) / len(values), 1) if values else 0,
                "call_count": len(values),
                "max_hold_time": max(values) if values else 0,
            })

        return points

    # ================================================================
    # Helpers
    # ================================================================

    @staticmethod
    def _normalize_number(number: str) -> str:
        """Normalize phone number for grouping (last 10 digits, no formatting)."""
        digits = "".join(ch for ch in number if ch.isdigit())
        return digits[-10:] if len(digits) >= 10 else digits

    @staticmethod
    def _percentile(values: list, pct: int) -> float:
        """Nearest-rank percentile; 0.0 for an empty list."""
        if not values:
            return 0.0
        ranked = sorted(values)
        pos = min(int(len(ranked) * pct / 100), len(ranked) - 1)
        return float(ranked[pos])

    @staticmethod
    def _group_by_mode(records: list["CallRecord"]) -> dict[str, int]:
        """Group call counts by mode."""
        counts: dict[str, int] = defaultdict(int)
        for rec in records:
            counts[rec.mode.value] += 1
        return dict(counts)

    @staticmethod
    def _group_by_hour(records: list["CallRecord"]) -> dict[int, int]:
        """Group call counts by hour of day (keys sorted ascending)."""
        counts: dict[int, int] = defaultdict(int)
        for rec in records:
            counts[rec.started_at.hour] += 1
        return dict(sorted(counts.items()))

    @property
    def total_calls_recorded(self) -> int:
        # Number of records currently held in the rolling window.
        return len(self._call_records)
|
||||
|
||||
|
||||
# ================================================================
|
||||
# Data Models
|
||||
# ================================================================
|
||||
|
||||
class CallRecord:
    """A single finished call, frozen into a plain record for analytics."""

    def __init__(
        self,
        call_id: str,
        remote_number: str,
        mode: CallMode,
        status: CallStatus,
        intent: Optional[str] = None,
        started_at: Optional[datetime] = None,
        duration_seconds: int = 0,
        hold_time_seconds: int = 0,
        classification_history: Optional[list[str]] = None,
        transcript_chunks: Optional[list[str]] = None,
        services: Optional[list[str]] = None,
    ):
        # Identity and outcome
        self.call_id = call_id
        self.remote_number = remote_number
        self.mode = mode
        self.status = status
        self.intent = intent
        # Timing — default the start to "now" when none was supplied
        self.started_at = started_at or datetime.now()
        self.duration_seconds = duration_seconds
        self.hold_time_seconds = hold_time_seconds
        # Payloads — `or []` yields a fresh list for both None and empty input
        self.classification_history = classification_history or []
        self.transcript_chunks = transcript_chunks or []
        self.services = services or []
|
||||
|
||||
|
||||
class CompanyStats:
    """Aggregated stats for a specific company/phone number."""

    def __init__(self):
        # Counters
        self.total_calls = 0
        self.successful_calls = 0
        self.failed_calls = 0
        # Hold-time accumulators (only positive hold times are recorded)
        self.total_hold_time = 0
        self.hold_times: list[int] = []
        # Duration / recency
        self.total_duration = 0
        self.last_called: Optional[datetime] = None
        # Frequency of each stated intent
        self.intents: dict[str, int] = defaultdict(int)

    def update(self, record: CallRecord) -> None:
        """Fold one finished call into the running aggregates."""
        self.total_calls += 1
        self.total_duration += record.duration_seconds
        self.last_called = record.started_at

        ok_statuses = (CallStatus.COMPLETED, CallStatus.BRIDGED, CallStatus.HUMAN_DETECTED)
        if record.status in ok_statuses:
            self.successful_calls += 1
        elif record.status == CallStatus.FAILED:
            self.failed_calls += 1

        if record.hold_time_seconds > 0:
            self.total_hold_time += record.hold_time_seconds
            self.hold_times.append(record.hold_time_seconds)

        if record.intent:
            self.intents[record.intent] += 1

    def to_dict(self, number: str) -> dict[str, Any]:
        """Serialize the aggregates for the API/dashboard."""
        calls = self.total_calls
        holds = self.hold_times
        return {
            "number": number,
            "total_calls": calls,
            "successful_calls": self.successful_calls,
            "failed_calls": self.failed_calls,
            "success_rate": round(
                self.successful_calls / calls, 3
            ) if calls else 0.0,
            "avg_hold_time": round(
                self.total_hold_time / len(holds), 1
            ) if holds else 0.0,
            "max_hold_time": max(holds) if holds else 0,
            "avg_duration": round(
                self.total_duration / calls, 1
            ) if calls else 0.0,
            "last_called": self.last_called.isoformat() if self.last_called else None,
            # Five most frequent intents, highest count first
            "top_intents": dict(
                sorted(self.intents.items(), key=lambda kv: kv[1], reverse=True)[:5]
            ),
        }
|
||||
339
services/call_flow_learner.py
Normal file
339
services/call_flow_learner.py
Normal file
@@ -0,0 +1,339 @@
|
||||
"""
|
||||
Call Flow Learner — Builds and refines call flows from exploration data.
|
||||
|
||||
When Hold Slayer runs in exploration mode, it discovers IVR steps.
|
||||
This service takes those discoveries and:
|
||||
1. Builds a CallFlow tree that can be reused next time
|
||||
2. Merges new discoveries into existing flows (refining them)
|
||||
3. Uses LLM to label steps and infer menu structure
|
||||
|
||||
Over time, each phone number builds up a reliable call flow
|
||||
that makes future calls faster and more accurate.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
|
||||
from models.call_flow import ActionType, CallFlow, CallFlowStep
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CallFlowLearner:
|
||||
"""
|
||||
Learns IVR call flows from exploration data.
|
||||
|
||||
Usage:
|
||||
learner = CallFlowLearner(llm_client=llm)
|
||||
|
||||
# After an exploration call completes:
|
||||
flow = await learner.build_flow(
|
||||
phone_number="+18005551234",
|
||||
discovered_steps=steps_from_exploration,
|
||||
intent="cancel my card",
|
||||
)
|
||||
|
||||
# Next time we call, merge new discoveries:
|
||||
updated = await learner.merge_discoveries(
|
||||
existing_flow=flow,
|
||||
new_steps=new_discoveries,
|
||||
)
|
||||
"""
|
||||
|
||||
    def __init__(self, llm_client=None):
        # Optional LLM client used by build_flow to enhance step labels;
        # when None, flows are built from heuristics alone.
        self._llm = llm_client
|
||||
|
||||
# ================================================================
|
||||
# Build Flow from Exploration
|
||||
# ================================================================
|
||||
|
||||
async def build_flow(
|
||||
self,
|
||||
phone_number: str,
|
||||
discovered_steps: list[dict],
|
||||
intent: Optional[str] = None,
|
||||
company_name: Optional[str] = None,
|
||||
) -> CallFlow:
|
||||
"""
|
||||
Build a CallFlow from exploration discoveries.
|
||||
|
||||
Args:
|
||||
phone_number: The number that was called.
|
||||
discovered_steps: List of step dicts from exploration mode:
|
||||
[{"timestamp": ..., "audio_type": "ivr_prompt",
|
||||
"transcript": "Press 1 for...", "action_taken": {"dtmf": "1"}}, ...]
|
||||
intent: What the caller was trying to accomplish.
|
||||
company_name: Optional company name for labeling.
|
||||
|
||||
Returns:
|
||||
A CallFlow that can be stored and reused.
|
||||
"""
|
||||
logger.info(
|
||||
f"🧠 Building call flow from {len(discovered_steps)} discoveries "
|
||||
f"for {phone_number}"
|
||||
)
|
||||
|
||||
# Phase 1: Extract meaningful steps (skip silence, ringing)
|
||||
meaningful = [
|
||||
s for s in discovered_steps
|
||||
if s.get("audio_type") in ("ivr_prompt", "live_human", "music")
|
||||
or s.get("action_taken")
|
||||
]
|
||||
|
||||
if not meaningful:
|
||||
logger.warning(" No meaningful steps discovered")
|
||||
return self._empty_flow(phone_number, company_name)
|
||||
|
||||
# Phase 2: Convert discoveries to CallFlowSteps
|
||||
flow_steps = []
|
||||
for i, step in enumerate(meaningful):
|
||||
flow_step = self._discovery_to_step(step, i, meaningful)
|
||||
if flow_step:
|
||||
flow_steps.append(flow_step)
|
||||
|
||||
# Phase 3: Link steps together (next_step pointers)
|
||||
for i, step in enumerate(flow_steps[:-1]):
|
||||
step.next_step = flow_steps[i + 1].id
|
||||
|
||||
# Phase 4: Use LLM to enhance step labels if available
|
||||
if self._llm and flow_steps:
|
||||
flow_steps = await self._llm_enhance_steps(flow_steps, intent)
|
||||
|
||||
# Build the flow
|
||||
name = company_name or self._guess_company_name(phone_number)
|
||||
flow = CallFlow(
|
||||
id=f"flow_{phone_number.replace('+', '')}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
|
||||
name=f"{name} — {intent or 'General'}",
|
||||
phone_number=phone_number,
|
||||
description=f"Auto-learned flow for {name}. Intent: {intent or 'general'}",
|
||||
steps=flow_steps,
|
||||
tags=["auto-learned"],
|
||||
notes=f"Learned from exploration on {datetime.now().isoformat()}",
|
||||
times_used=1,
|
||||
last_used=datetime.now(),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f" ✅ Built flow '{flow.name}' with {len(flow_steps)} steps"
|
||||
)
|
||||
return flow
|
||||
|
||||
def _discovery_to_step(
    self,
    discovery: dict,
    index: int,
    all_discoveries: list[dict],
) -> Optional[CallFlowStep]:
    """Translate one raw exploration discovery into a reusable CallFlowStep.

    Args:
        discovery: A dict recorded during exploration with keys
            'audio_type', 'transcript', and optionally 'action_taken'.
        index: Position of this discovery; used to mint a stable step id.
        all_discoveries: The full discovery list (reserved for context).

    Returns:
        A CallFlowStep, or None when the discovery does not map to an
        actionable step (silence, ringing, unrecognized types).
    """
    kind = discovery.get("audio_type", "")
    heard = discovery.get("transcript", "")
    action = discovery.get("action_taken")
    sid = f"step_{index:03d}"

    if kind == "ivr_prompt":
        if action:
            # A menu we actively navigated by pressing a digit.
            digit = action.get("dtmf", "")
            return CallFlowStep(
                id=sid,
                description=self._summarize_menu(heard) or f"IVR menu (pressed {digit})",
                action=ActionType.DTMF,
                action_value=digit,
                expect=self._extract_expect_pattern(heard),
                timeout=15,
            )
        # A prompt we only listened to — no button was pressed.
        return CallFlowStep(
            id=sid,
            description=self._summarize_menu(heard) or "IVR announcement",
            action=ActionType.LISTEN,
            timeout=30,
        )

    if kind == "music":
        # Hold music: park here until a human shows up.
        return CallFlowStep(
            id=sid,
            description="Hold music — waiting for agent",
            action=ActionType.HOLD,
            timeout=3600,
        )

    if kind == "live_human":
        # A person answered — this is the transfer point.
        return CallFlowStep(
            id=sid,
            description="Live agent detected — transfer",
            action=ActionType.TRANSFER,
            action_value="preferred_device",
        )

    return None
|
||||
|
||||
# ================================================================
|
||||
# Merge New Discoveries into Existing Flow
|
||||
# ================================================================
|
||||
|
||||
async def merge_discoveries(
    self,
    existing_flow: CallFlow,
    new_steps: list[dict],
    intent: Optional[str] = None,
) -> CallFlow:
    """
    Merge new exploration discoveries into an existing flow.

    This refines the flow over time — updating timeouts,
    confirming step order, adding alternative paths.

    Args:
        existing_flow: The stored flow to refine; mutated in place.
        new_steps: Raw discovery dicts from a fresh exploration run.
        intent: Optional caller intent, forwarded to build_flow.

    Returns:
        The same ``existing_flow`` object, with averaged timeouts and
        updated usage metadata.
    """
    logger.info(
        f"🔄 Merging {len(new_steps)} new discoveries into "
        f"flow '{existing_flow.name}'"
    )

    # Build a new flow from the discoveries
    new_flow = await self.build_flow(
        phone_number=existing_flow.phone_number,
        discovered_steps=new_steps,
        intent=intent,
    )

    # Simple merge strategy: keep existing steps but update timeouts
    # and add any new steps that weren't in the original.
    # Steps are matched by (action, action_value) — e.g. "DTMF 2" —
    # not by id, since ids are regenerated each exploration.
    existing_by_action = {
        (s.action, s.action_value): s for s in existing_flow.steps
    }

    for new_step in new_flow.steps:
        key = (new_step.action, new_step.action_value)
        if key in existing_by_action:
            # Update timeout to be the average of old and new observations
            old_step = existing_by_action[key]
            if old_step.timeout and new_step.timeout:
                old_step.timeout = int(
                    (old_step.timeout + new_step.timeout) / 2
                )
    # New steps that don't exist are noted but not auto-added
    # (to avoid corrupting a working flow)

    # Update metadata
    existing_flow.times_used = (existing_flow.times_used or 0) + 1
    existing_flow.last_used = datetime.now()

    logger.info(f" ✅ Merged. Flow now has {len(existing_flow.steps)} steps")
    return existing_flow
|
||||
|
||||
# ================================================================
|
||||
# LLM Enhancement
|
||||
# ================================================================
|
||||
|
||||
async def _llm_enhance_steps(
|
||||
self,
|
||||
steps: list[CallFlowStep],
|
||||
intent: Optional[str],
|
||||
) -> list[CallFlowStep]:
|
||||
"""Use LLM to improve step descriptions and structure."""
|
||||
if not self._llm:
|
||||
return steps
|
||||
|
||||
try:
|
||||
# Build a summary of the steps for the LLM
|
||||
step_descriptions = []
|
||||
for s in steps:
|
||||
desc = f"- {s.action.value}"
|
||||
if s.action_value:
|
||||
desc += f" ({s.action_value})"
|
||||
if s.description:
|
||||
desc += f": {s.description}"
|
||||
step_descriptions.append(desc)
|
||||
|
||||
prompt = (
|
||||
f"These are steps discovered while navigating a phone IVR system.\n"
|
||||
f"Intent: {intent or 'general inquiry'}\n\n"
|
||||
f"Steps:\n" + "\n".join(step_descriptions) + "\n\n"
|
||||
f"For each step, provide a clear, concise description of what "
|
||||
f"that step does. Return JSON array of objects with 'step_index' "
|
||||
f"and 'description' fields."
|
||||
)
|
||||
|
||||
result = await self._llm.chat_json(
|
||||
prompt,
|
||||
system="You are labeling IVR phone menu steps for a call flow database.",
|
||||
)
|
||||
|
||||
# Apply LLM descriptions
|
||||
if isinstance(result, list):
|
||||
for item in result:
|
||||
idx = item.get("step_index", -1)
|
||||
desc = item.get("description", "")
|
||||
if 0 <= idx < len(steps) and desc:
|
||||
steps[idx].description = desc
|
||||
elif isinstance(result, dict) and "steps" in result:
|
||||
for item in result["steps"]:
|
||||
idx = item.get("step_index", -1)
|
||||
desc = item.get("description", "")
|
||||
if 0 <= idx < len(steps) and desc:
|
||||
steps[idx].description = desc
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" LLM enhancement failed (non-fatal): {e}")
|
||||
|
||||
return steps
|
||||
|
||||
# ================================================================
|
||||
# Helpers
|
||||
# ================================================================
|
||||
|
||||
@staticmethod
|
||||
def _summarize_menu(transcript: str) -> Optional[str]:
|
||||
"""Create a short summary of an IVR menu transcript."""
|
||||
if not transcript:
|
||||
return None
|
||||
|
||||
# Count how many options
|
||||
options = re.findall(r'press\s+\d+', transcript.lower())
|
||||
if options:
|
||||
return f"IVR menu with {len(options)} options"
|
||||
|
||||
# Truncate long transcripts
|
||||
if len(transcript) > 80:
|
||||
return transcript[:77] + "..."
|
||||
return transcript
|
||||
|
||||
@staticmethod
|
||||
def _extract_expect_pattern(transcript: str) -> Optional[str]:
|
||||
"""Extract a regex pattern to match this prompt next time."""
|
||||
if not transcript:
|
||||
return None
|
||||
|
||||
# Find the most distinctive phrase (>4 words, not generic)
|
||||
words = transcript.split()
|
||||
if len(words) >= 4:
|
||||
# Use first meaningful phrase
|
||||
phrase = " ".join(words[:6])
|
||||
# Escape for regex
|
||||
return re.escape(phrase.lower())
|
||||
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _guess_company_name(phone_number: str) -> str:
|
||||
"""Guess company name from phone number (placeholder)."""
|
||||
# In production, this would do a reverse lookup
|
||||
return f"Company {phone_number[-4:]}"
|
||||
|
||||
@staticmethod
def _empty_flow(phone_number: str, company_name: Optional[str]) -> CallFlow:
    """Build a placeholder CallFlow for calls where exploration found nothing."""
    # Same id scheme as build_flow: strip '+' and append a timestamp
    flow_id = (
        f"flow_{phone_number.replace('+', '')}"
        f"_{datetime.now().strftime('%Y%m%d%H%M%S')}"
    )
    return CallFlow(
        id=flow_id,
        name=f"{company_name or phone_number} — Empty",
        phone_number=phone_number,
        description="Empty flow — no meaningful steps discovered",
        steps=[],
        tags=["auto-learned", "empty"],
    )
|
||||
717
services/hold_slayer.py
Normal file
717
services/hold_slayer.py
Normal file
@@ -0,0 +1,717 @@
|
||||
"""
|
||||
Hold Slayer Service — The main event.
|
||||
|
||||
Navigate IVR trees, wait on hold, detect when a human picks up,
|
||||
and transfer you in. This is the state machine that orchestrates
|
||||
the entire hold-slaying process.
|
||||
|
||||
Two modes:
|
||||
1. run_with_flow(): Follow a stored call flow tree (fast, reliable)
|
||||
2. run_exploration(): No stored flow — listen, transcribe, and figure it out
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from config import Settings
|
||||
from core.call_manager import CallManager
|
||||
from core.sip_engine import SIPEngine
|
||||
from models.call import ActiveCall, AudioClassification, CallStatus, ClassificationResult
|
||||
from models.call_flow import ActionType, CallFlow, CallFlowStep
|
||||
from models.events import EventType, GatewayEvent
|
||||
from services.audio_classifier import AudioClassifier
|
||||
from services.transcription import TranscriptionService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# LLM client is optional — imported at use time
|
||||
_llm_client = None
|
||||
|
||||
|
||||
def _get_llm():
    """Return the shared LLM client, constructing it on first use.

    The client is optional: if configuration or the services.llm_client
    module is unavailable, the failure is cached (sentinel ``False``) so
    construction is never retried, and callers receive None.
    """
    global _llm_client
    if _llm_client is None:
        try:
            from config import get_settings
            from services.llm_client import LLMClient

            cfg = get_settings()
            _llm_client = LLMClient(
                base_url=cfg.llm.base_url,
                model=cfg.llm.model,
                api_key=cfg.llm.api_key,
                timeout=cfg.llm.timeout,
            )
        except Exception as e:
            logger.debug(f"LLM client not available: {e}")
            _llm_client = False  # Sentinel: don't retry
    return None if _llm_client is False else _llm_client
|
||||
|
||||
|
||||
class HoldSlayerService:
|
||||
"""
|
||||
The Hold Slayer.
|
||||
|
||||
Navigates IVR menus, waits on hold, detects live humans,
|
||||
and transfers the call to your device.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    gateway,  # AIPSTNGateway (untyped to avoid circular import)
    call_manager: CallManager,
    sip_engine: SIPEngine,
    classifier: AudioClassifier,
    transcription: TranscriptionService,
    settings: Settings,
):
    """Wire up the collaborators the Hold Slayer drives.

    Args:
        gateway: Owning AIPSTNGateway; used here for transfer_call().
        call_manager: Tracks call state, transcripts/classifications,
            and publishes events via its event_bus.
        sip_engine: Sends DTMF digits and exposes the raw audio stream.
        classifier: Classifies audio chunks (music / speech / silence / ...).
        transcription: Speech-to-text for IVR prompts and live humans.
        settings: App settings (hold timeouts, default transfer device, ...).
    """
    self.gateway = gateway
    self.call_manager = call_manager
    self.sip_engine = sip_engine
    self.classifier = classifier
    self.transcription = transcription
    self.settings = settings
|
||||
|
||||
async def run(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    call_flow_id: Optional[str] = None,
) -> bool:
    """
    Main entry point. Run the Hold Slayer on a call.

    Waits for the PSTN leg to connect, then either follows a stored
    call flow (falling back to exploration if the flow id is unknown)
    or explores the IVR from scratch.

    Args:
        call: The active call to work on
        sip_leg_id: SIP leg ID for the PSTN call
        call_flow_id: Optional stored call flow to follow

    Returns:
        True if successfully transferred to user, False otherwise
    """
    logger.info(f"🗡️ Hold Slayer activated for {call.remote_number}")
    logger.info(f" Intent: {call.intent}")
    logger.info(f" Call Flow: {call_flow_id or 'exploration mode'}")

    try:
        # Wait for call to be connected
        await self._wait_for_connection(call, timeout=60)

        if call_flow_id:
            # Load the stored call flow from the database
            flow = await self._load_call_flow(call_flow_id)
            if flow:
                return await self.run_with_flow(call, sip_leg_id, flow)
            else:
                logger.warning(f"Call flow '{call_flow_id}' not found, switching to exploration")

        # No flow or flow not found — explore
        return await self.run_exploration(call, sip_leg_id)

    except asyncio.CancelledError:
        # Cancellation is a normal shutdown path — report "not transferred"
        logger.info(f"Hold Slayer cancelled for {call.id}")
        return False
    except Exception as e:
        # Any other failure marks the call FAILED; callers only see False
        logger.error(f"Hold Slayer error: {e}", exc_info=True)
        await self.call_manager.update_status(call.id, CallStatus.FAILED)
        return False
|
||||
|
||||
# ================================================================
|
||||
# Mode 1: Follow a Stored Call Flow
|
||||
# ================================================================
|
||||
|
||||
async def run_with_flow(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    flow: CallFlow,
) -> bool:
    """
    Navigate using a stored call flow tree.
    Falls back to exploration for unknown steps.

    Walks the flow step-by-step, following next_step pointers;
    returns True only when a TRANSFER step hands the call to the
    user's device. Returns False on hold timeout, a broken step
    pointer, or running off the end of the flow.
    """
    logger.info(f"📋 Following call flow: {flow.name}")
    steps = flow.steps_by_id()
    current_step_id = flow.steps[0].id if flow.steps else None

    while current_step_id:
        step = steps.get(current_step_id)
        if not step:
            # Dangling next_step pointer — flow data is inconsistent
            logger.error(f"Step '{current_step_id}' not found in flow")
            break

        call.current_step_id = current_step_id
        logger.info(f"📍 Step: {step.description}")

        await self.call_manager.event_bus.publish(GatewayEvent(
            type=EventType.IVR_STEP,
            call_id=call.id,
            data={"step_id": step.id, "description": step.description, "action": step.action.value},
            message=f"📍 IVR Step: {step.description}",
        ))

        # === Execute the step based on its action type ===

        if step.action == ActionType.HOLD:
            # HOLD MODE: Audio classifier takes over
            await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)
            logger.info(f"⏳ On hold. Activating hold detection...")

            human_detected = await self._wait_for_human(
                call, sip_leg_id, timeout=step.timeout
            )

            if human_detected:
                current_step_id = step.next_step
            else:
                logger.warning("⏰ Hold timeout reached!")
                break

        elif step.action == ActionType.DTMF:
            # Wait for the expected prompt, then send DTMF
            await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)

            if step.expect:
                heard = await self._wait_for_prompt(
                    call, sip_leg_id, step.expect, step.timeout
                )
                if not heard and step.fallback_step:
                    logger.info(f"⚠️ Didn't hear expected prompt, falling back")
                    current_step_id = step.fallback_step
                    continue

            # Send the DTMF digits
            if step.action_value:
                await self.sip_engine.send_dtmf(sip_leg_id, step.action_value)
                logger.info(f"📱 Pressed: {step.action_value}")

                await self.call_manager.event_bus.publish(GatewayEvent(
                    type=EventType.IVR_DTMF_SENT,
                    call_id=call.id,
                    data={"digits": step.action_value, "step": step.id},
                    message=f"📱 DTMF sent: {step.action_value}",
                ))

            # Small delay after DTMF for the IVR to process
            await asyncio.sleep(2.0)
            current_step_id = step.next_step

        elif step.action == ActionType.WAIT:
            # Just wait for a prompt (or sleep the full timeout if no pattern)
            if step.expect:
                await self._wait_for_prompt(
                    call, sip_leg_id, step.expect, step.timeout
                )
            else:
                await asyncio.sleep(step.timeout)
            current_step_id = step.next_step

        elif step.action == ActionType.LISTEN:
            # Listen and decide — regex first, LLM fallback
            await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)

            transcript = await self._listen_for_menu(
                call, sip_leg_id, step.timeout
            )

            # Phase 1: Try regex-based keyword matching (fast, no API call)
            decision = self._decide_menu_option(
                transcript, call.intent or "", step.expect
            )

            # Phase 2: LLM fallback if regex couldn't decide
            if not decision and transcript:
                llm = _get_llm()
                if llm:
                    try:
                        logger.info("🤖 Regex inconclusive, asking LLM...")
                        llm_result = await llm.analyze_ivr_menu(
                            transcript=transcript,
                            intent=call.intent or "",
                            previous_selections=list(call.dtmf_history) if hasattr(call, 'dtmf_history') else None,
                        )
                        decision = llm_result.get("digit")
                        if decision:
                            confidence = llm_result.get("confidence", 0)
                            reason = llm_result.get("reason", "")
                            logger.info(
                                f"🤖 LLM decided: press {decision} "
                                f"(confidence={confidence}, reason='{reason}')"
                            )
                    except Exception as e:
                        # LLM failure is non-fatal; we fall through to the default below
                        logger.warning(f"🤖 LLM fallback failed: {e}")

            if decision:
                await self.sip_engine.send_dtmf(sip_leg_id, decision)
                logger.info(f"🧠 Decided: press {decision} (heard: '{transcript[:60]}...')")
            else:
                # Default: press 0 for agent
                await self.sip_engine.send_dtmf(sip_leg_id, "0")
                logger.info(f"🧠 No clear match, pressing 0 for agent")

            await asyncio.sleep(2.0)
            current_step_id = step.next_step

        elif step.action == ActionType.SPEAK:
            # Say something into the call (TTS)
            # TODO: Implement TTS integration
            logger.info(f"🗣️ Would say: '{step.action_value}' (TTS not yet implemented)")
            await asyncio.sleep(3.0)
            current_step_id = step.next_step

        elif step.action == ActionType.TRANSFER:
            # We did it! Transfer to user's device
            await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
            logger.info(f"🚨 TRANSFERRING TO {step.action_value}")

            device_target = step.action_value or call.device or self.settings.hold_slayer.default_transfer_device
            await self.gateway.transfer_call(call.id, device_target)
            return True

        else:
            # Unrecognized action: skip forward rather than aborting the call
            logger.warning(f"Unknown action type: {step.action}")
            current_step_id = step.next_step

    return False
|
||||
|
||||
# ================================================================
|
||||
# Mode 2: Exploration (No Stored Flow)
|
||||
# ================================================================
|
||||
|
||||
async def run_exploration(
    self,
    call: ActiveCall,
    sip_leg_id: str,
) -> bool:
    """
    No stored flow — explore the IVR blind.
    Records what it discovers so we can build a flow for next time.

    Loops (up to max_hold_time seconds): capture ~3s of audio,
    classify it, transcribe speech, record the discovery, then act:
    transfer on a live human, monitor during hold music, pick a DTMF
    digit for IVR menus, and wait out silence/ringing.

    Returns:
        True if a human was reached and the call was transferred.
    """
    logger.info(f"🔍 Exploration mode: discovering IVR for {call.remote_number}")
    await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)

    discovered_steps: list[dict] = []
    max_time = self.settings.hold_slayer.max_hold_time
    start_time = time.time()

    while time.time() - start_time < max_time:
        # Check if call is still active
        current_call = self.call_manager.get_call(call.id)
        if not current_call or current_call.status in (
            CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
        ):
            break

        # Get audio and classify
        audio_chunk = b""
        try:
            async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
                audio_chunk += chunk
                # 16 kHz * 2 bytes/sample * 3 s — presumably 16-bit mono PCM;
                # NOTE(review): confirm sample format against sip_engine
                if len(audio_chunk) >= 16000 * 2 * 3:  # 3 seconds
                    break
        except Exception as e:
            logger.error(f"Audio stream error: {e}")
            await asyncio.sleep(1.0)
            continue

        if not audio_chunk:
            await asyncio.sleep(1.0)
            continue

        # Classify the audio
        classification = self.classifier.classify_chunk(audio_chunk)
        self.classifier.update_history(classification.audio_type)
        await self.call_manager.add_classification(call.id, classification)

        # Transcribe if it sounds like speech
        transcript = ""
        if classification.audio_type in (
            AudioClassification.IVR_PROMPT,
            AudioClassification.LIVE_HUMAN,
        ):
            transcript = await self.transcription.transcribe(
                audio_chunk,
                prompt="Phone IVR menu, customer service, press 1 for..."
            )
            if transcript:
                await self.call_manager.add_transcript(call.id, transcript)

        # Record discovery (action_taken may be filled in below)
        discovered_steps.append({
            "timestamp": time.time(),
            "audio_type": classification.audio_type.value,
            "confidence": classification.confidence,
            "transcript": transcript,
            "action_taken": None,
        })

        # === Decision Logic ===

        if classification.audio_type == AudioClassification.LIVE_HUMAN:
            # HUMAN DETECTED! Transfer!
            logger.info("🚨 LIVE HUMAN DETECTED!")
            await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)

            device = call.device or self.settings.hold_slayer.default_transfer_device
            await self.gateway.transfer_call(call.id, device)

            logger.info(f"📋 Discovered {len(discovered_steps)} IVR steps")
            return True

        elif classification.audio_type == AudioClassification.MUSIC:
            # On hold — just keep monitoring
            if current_call.status != CallStatus.ON_HOLD:
                await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)

            # Check for hold→human transition
            if self.classifier.detect_hold_to_human_transition():
                logger.info("🚨 Hold-to-human transition detected!")
                await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)

                device = call.device or self.settings.hold_slayer.default_transfer_device
                await self.gateway.transfer_call(call.id, device)
                return True

        elif classification.audio_type == AudioClassification.IVR_PROMPT and transcript:
            # IVR menu — try to navigate
            decision = self._decide_menu_option(
                transcript, call.intent or "", None
            )
            if decision:
                await self.sip_engine.send_dtmf(sip_leg_id, decision)
                discovered_steps[-1]["action_taken"] = {"dtmf": decision}
                logger.info(f"🧠 Exploration: pressed {decision}")
                await asyncio.sleep(2.0)
            else:
                # Try pressing 0 for agent
                await self.sip_engine.send_dtmf(sip_leg_id, "0")
                discovered_steps[-1]["action_taken"] = {"dtmf": "0", "reason": "default_agent"}
                logger.info("🧠 Exploration: pressed 0 (trying for agent)")
                await asyncio.sleep(2.0)

        elif classification.audio_type == AudioClassification.SILENCE:
            # Silence — wait a bit
            await asyncio.sleep(2.0)

        elif classification.audio_type == AudioClassification.RINGING:
            # Still ringing
            await asyncio.sleep(1.0)

    logger.warning(f"Hold Slayer timed out after {max_time}s")
    return False
|
||||
|
||||
# ================================================================
|
||||
# Core Detection Methods
|
||||
# ================================================================
|
||||
|
||||
async def _wait_for_human(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    timeout: int = 7200,
) -> bool:
    """
    Wait on hold until a live human is detected.

    Continuously classifies audio and watches for the
    music → speech transition.

    Returns:
        True when a human is confirmed (LIVE_HUMAN classification with
        a 3+-word transcript, or the classifier's hold→human transition
        signal); False on timeout or when the call ends.
    """
    check_interval = self.settings.hold_slayer.hold_check_interval
    start_time = time.time()

    while time.time() - start_time < timeout:
        # Check if call is still active
        current_call = self.call_manager.get_call(call.id)
        if not current_call or current_call.status in (
            CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
        ):
            return False

        # Get audio chunk — check_interval seconds of 16 kHz 16-bit audio
        audio_chunk = b""
        try:
            async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
                audio_chunk += chunk
                if len(audio_chunk) >= int(16000 * 2 * check_interval):
                    break
        except Exception:
            # Stream hiccup: back off one interval and retry
            await asyncio.sleep(check_interval)
            continue

        if not audio_chunk:
            await asyncio.sleep(check_interval)
            continue

        # Classify
        result = self.classifier.classify_chunk(audio_chunk)
        self.classifier.update_history(result.audio_type)
        await self.call_manager.add_classification(call.id, result)

        # Check for human
        if result.audio_type == AudioClassification.LIVE_HUMAN:
            # Verify with transcription
            transcript = await self.transcription.transcribe(audio_chunk)
            if transcript:
                await self.call_manager.add_transcript(call.id, transcript)
                # If we got meaningful speech, it's probably a real person
                if len(transcript.split()) >= 3:
                    logger.info(f"🚨 Human confirmed! Said: '{transcript[:100]}'")
                    return True

        # Check for the music→speech transition pattern
        if self.classifier.detect_hold_to_human_transition():
            logger.info("🚨 Hold-to-human transition detected!")
            return True

        # Log progress periodically
        # NOTE(review): this only fires when elapsed lands exactly on a
        # 60s multiple (and may fire several times within that second if
        # iterations are sub-second) — confirm intended.
        elapsed = int(time.time() - start_time)
        if elapsed > 0 and elapsed % 60 == 0:
            logger.info(
                f"⏳ Still on hold... {elapsed}s "
                f"(audio: {result.audio_type.value}, {result.confidence:.0%})"
            )

    return False
|
||||
|
||||
async def _wait_for_prompt(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    expected_pattern: str,
    timeout: int = 30,
) -> bool:
    """
    Wait for an expected IVR prompt.

    Listens, transcribes, and checks if the transcript matches
    the expected pattern (regex or keywords).

    Returns:
        True as soon as a transcript matches; False after `timeout`
        seconds without a match.
    """
    start_time = time.time()

    while time.time() - start_time < timeout:
        audio_chunk = b""
        try:
            async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
                audio_chunk += chunk
                if len(audio_chunk) >= 16000 * 2 * 3:  # 3 seconds
                    break
        except Exception:
            await asyncio.sleep(1.0)
            continue

        if not audio_chunk:
            await asyncio.sleep(1.0)
            continue

        # Classify first — only transcribe speech-like audio
        result = self.classifier.classify_chunk(audio_chunk)
        if result.audio_type not in (
            AudioClassification.IVR_PROMPT,
            AudioClassification.LIVE_HUMAN,
        ):
            continue

        # Transcribe
        transcript = await self.transcription.transcribe(audio_chunk)
        if not transcript:
            continue

        await self.call_manager.add_transcript(call.id, transcript)

        # Check if it matches expected pattern
        try:
            if re.search(expected_pattern, transcript, re.IGNORECASE):
                logger.info(f"✅ Heard expected: '{transcript[:80]}'")
                return True
        except re.error:
            # Treat as keyword search if regex is invalid
            if expected_pattern.lower() in transcript.lower():
                logger.info(f"✅ Heard expected: '{transcript[:80]}'")
                return True

    logger.warning(f"⚠️ Didn't hear expected prompt within {timeout}s")
    return False
|
||||
|
||||
async def _listen_for_menu(
    self,
    call: ActiveCall,
    sip_leg_id: str,
    timeout: int = 30,
) -> str:
    """Listen for an IVR menu and return the full transcript.

    Accumulates 5-second chunks of speech until silence follows
    speech (menu finished), the stream dries up, or `timeout`
    elapses. Returns the concatenated transcript ("" if nothing
    was heard).
    """
    transcript_parts: list[str] = []
    start_time = time.time()

    while time.time() - start_time < timeout:
        audio_chunk = b""
        try:
            async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
                audio_chunk += chunk
                if len(audio_chunk) >= 16000 * 2 * 5:  # 5 seconds
                    break
        except Exception:
            await asyncio.sleep(1.0)
            continue

        if not audio_chunk:
            break

        result = self.classifier.classify_chunk(audio_chunk)

        # If we're getting silence after speech, the menu prompt is done
        if result.audio_type == AudioClassification.SILENCE and transcript_parts:
            break

        if result.audio_type in (
            AudioClassification.IVR_PROMPT,
            AudioClassification.LIVE_HUMAN,
        ):
            text = await self.transcription.transcribe(audio_chunk)
            if text:
                transcript_parts.append(text)

    full_transcript = " ".join(transcript_parts)
    if full_transcript:
        await self.call_manager.add_transcript(call.id, full_transcript)

    return full_transcript
|
||||
|
||||
async def _wait_for_connection(self, call: ActiveCall, timeout: int = 60) -> None:
    """Poll until the call is answered, raising on failure or timeout.

    Raises:
        RuntimeError: if the call vanishes or ends in FAILED/CANCELLED.
        TimeoutError: if it is not connected within `timeout` seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        snapshot = self.call_manager.get_call(call.id)
        if snapshot is None:
            raise RuntimeError(f"Call {call.id} disappeared")
        status = snapshot.status
        if status in (CallStatus.CONNECTED, CallStatus.NAVIGATING_IVR):
            return  # answered — done waiting
        if status in (CallStatus.FAILED, CallStatus.CANCELLED):
            raise RuntimeError(f"Call {call.id} failed: {status}")
        await asyncio.sleep(0.5)  # poll twice a second
    raise TimeoutError(f"Call {call.id} not connected within {timeout}s")
|
||||
|
||||
# ================================================================
|
||||
# Menu Navigation Logic
|
||||
# ================================================================
|
||||
|
||||
def _decide_menu_option(
|
||||
self,
|
||||
transcript: str,
|
||||
intent: str,
|
||||
expected_options: Optional[str],
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Decide which menu option to select based on transcript and intent.
|
||||
|
||||
Simple keyword-based matching. This is where an LLM integration
|
||||
would massively improve navigation accuracy.
|
||||
|
||||
Returns:
|
||||
DTMF digit(s) to press, or None if can't decide
|
||||
"""
|
||||
transcript_lower = transcript.lower()
|
||||
intent_lower = intent.lower()
|
||||
|
||||
# Common IVR patterns: "press 1 for X, press 2 for Y"
|
||||
# Extract options
|
||||
options = re.findall(
|
||||
r'(?:press|dial|say)\s+(\d+)\s+(?:for|to)\s+(.+?)(?:\.|,|press|dial|$)',
|
||||
transcript_lower,
|
||||
)
|
||||
|
||||
if not options:
|
||||
# Try alternate patterns: "for X, press 1"
|
||||
options = re.findall(
|
||||
r'for\s+(.+?),?\s*(?:press|dial)\s+(\d+)',
|
||||
transcript_lower,
|
||||
)
|
||||
# Swap order to be (digit, description)
|
||||
options = [(digit, desc) for desc, digit in options]
|
||||
|
||||
if not options:
|
||||
return None
|
||||
|
||||
# Score each option against the intent
|
||||
best_match = None
|
||||
best_score = 0
|
||||
|
||||
# Keywords that map intents to IVR options
|
||||
intent_keywords = {
|
||||
"cancel": ["cancel", "close", "end", "terminate"],
|
||||
"dispute": ["dispute", "charge", "billing", "transaction", "statement"],
|
||||
"balance": ["balance", "account", "summary"],
|
||||
"agent": ["agent", "representative", "operator", "speak", "person", "human"],
|
||||
"payment": ["payment", "pay", "bill"],
|
||||
"card": ["card", "credit", "debit"],
|
||||
"fraud": ["fraud", "unauthorized", "stolen", "lost"],
|
||||
"transfer": ["transfer", "move", "send"],
|
||||
}
|
||||
|
||||
for digit, description in options:
|
||||
score = 0
|
||||
|
||||
# Direct keyword match in description
|
||||
for keyword_group, keywords in intent_keywords.items():
|
||||
if any(kw in intent_lower for kw in keywords):
|
||||
if any(kw in description for kw in keywords):
|
||||
score += 10
|
||||
|
||||
# Fuzzy: any word overlap between intent and description
|
||||
intent_words = set(intent_lower.split())
|
||||
desc_words = set(description.split())
|
||||
overlap = intent_words & desc_words
|
||||
score += len(overlap) * 3
|
||||
|
||||
# "Speak to agent" is usually what we want if nothing else matches
|
||||
if any(w in description for w in ["agent", "representative", "operator", "person"]):
|
||||
score += 5
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = digit
|
||||
|
||||
if best_match and best_score >= 3:
|
||||
return best_match
|
||||
|
||||
# Default: look for "agent" or "representative" option
|
||||
for digit, description in options:
|
||||
if any(w in description for w in ["agent", "representative", "operator"]):
|
||||
return digit
|
||||
|
||||
return None
|
||||
|
||||
async def _load_call_flow(self, flow_id: str) -> Optional[CallFlow]:
    """Load a stored call flow from the database.

    Returns the hydrated CallFlow, or None if the id is unknown or
    any database error occurs (errors are logged, never raised).
    """
    # Imported locally — presumably to avoid an import cycle or to keep
    # the DB layer optional at module import time; confirm before moving.
    from db.database import get_session_factory, StoredCallFlow
    from sqlalchemy import select

    try:
        factory = get_session_factory()
        async with factory() as session:
            result = await session.execute(
                select(StoredCallFlow).where(StoredCallFlow.id == flow_id)
            )
            row = result.scalar_one_or_none()
            if row:
                from models.call_flow import CallFlowStep
                # Rehydrate JSON step dicts into CallFlowStep models
                return CallFlow(
                    id=row.id,
                    name=row.name,
                    phone_number=row.phone_number,
                    description=row.description or "",
                    steps=[CallFlowStep(**s) for s in row.steps],
                    tags=row.tags or [],
                    notes=row.notes,
                    avg_hold_time=row.avg_hold_time,
                    success_rate=row.success_rate,
                    last_used=row.last_used,
                    times_used=row.times_used or 0,
                )
    except Exception as e:
        # Swallow DB errors: caller falls back to exploration mode
        logger.error(f"Failed to load call flow '{flow_id}': {e}")

    return None
|
||||
391
services/llm_client.py
Normal file
391
services/llm_client.py
Normal file
@@ -0,0 +1,391 @@
|
||||
"""
|
||||
LLM Client — Unified interface for LLM-powered decision making.
|
||||
|
||||
Used by Hold Slayer (IVR navigation fallback), Call Flow Learner,
|
||||
Receptionist, and Smart Routing services.
|
||||
|
||||
Supports OpenAI-compatible APIs (OpenAI, Ollama, LM Studio, etc.)
|
||||
via httpx async client. No SDK dependency — just HTTP.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LLMClient:
    """
    Async LLM client for OpenAI-compatible chat completion APIs.

    Works with:
    - OpenAI API (api.openai.com)
    - Ollama (localhost:11434)
    - LM Studio (localhost:1234)
    - Any OpenAI-compatible endpoint

    Usage:
        client = LLMClient(base_url="http://localhost:11434/v1", model="llama3")
        response = await client.chat("What is 2+2?")
        # or structured:
        result = await client.chat_json(
            "Extract the menu options from this IVR transcript...",
            system="You are a phone menu parser.",
        )
    """

    def __init__(
        self,
        base_url: str = "http://localhost:11434/v1",
        model: str = "llama3",
        api_key: str = "not-needed",
        timeout: float = 30.0,
        max_tokens: int = 1024,
        temperature: float = 0.3,
    ):
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.timeout = timeout
        self.max_tokens = max_tokens
        self.temperature = temperature

        self._client = httpx.AsyncClient(
            base_url=self.base_url,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            timeout=httpx.Timeout(timeout),
        )

        # Stats (exposed via the `stats` property)
        self._total_requests = 0
        self._total_tokens = 0
        self._total_errors = 0
        self._avg_latency_ms = 0.0

    async def close(self):
        """Close the HTTP client."""
        await self._client.aclose()

    # ================================================================
    # Core Chat Methods
    # ================================================================

    async def chat(
        self,
        user_message: str,
        system: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
    ) -> str:
        """
        Send a chat completion request and return the text response.

        Args:
            user_message: The user's message/prompt.
            system: Optional system prompt.
            temperature: Override default temperature.
            max_tokens: Override default max tokens.

        Returns:
            The assistant's response text (empty string on error).
        """
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": user_message})

        # Explicit None checks: `temperature or self.temperature` would
        # silently discard a caller-supplied 0 / 0.0 override.
        return await self._complete(
            messages,
            temperature=self.temperature if temperature is None else temperature,
            max_tokens=self.max_tokens if max_tokens is None else max_tokens,
        )

    async def chat_json(
        self,
        user_message: str,
        system: Optional[str] = None,
        temperature: Optional[float] = None,
    ) -> dict[str, Any]:
        """
        Chat completion that parses the response as JSON.

        The system prompt is augmented to request JSON output.
        Falls back to extracting JSON from markdown code blocks.

        Returns:
            Parsed JSON dict, or {"error": "..."} on parse failure.
        """
        json_system = (system or "") + (
            "\n\nIMPORTANT: Respond with valid JSON only. "
            "No markdown, no explanation, just the JSON object."
        )

        # Lower default temperature for structured output; a caller's
        # explicit 0.0 must not be overridden (hence the None check).
        response_text = await self.chat(
            user_message,
            system=json_system.strip(),
            temperature=0.1 if temperature is None else temperature,
        )

        return self._parse_json_response(response_text)

    async def chat_with_history(
        self,
        messages: list[dict[str, str]],
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
    ) -> str:
        """
        Chat with full message history (multi-turn conversation).

        Args:
            messages: List of {"role": "system|user|assistant", "content": "..."}

        Returns:
            The assistant's response text.
        """
        return await self._complete(
            messages,
            temperature=self.temperature if temperature is None else temperature,
            max_tokens=self.max_tokens if max_tokens is None else max_tokens,
        )

    # ================================================================
    # Hold Slayer Specific Methods
    # ================================================================

    async def analyze_ivr_menu(
        self,
        transcript: str,
        intent: str,
        previous_selections: Optional[list[str]] = None,
    ) -> dict[str, Any]:
        """
        Analyze an IVR menu transcript and decide which option to press.

        This is the LLM fallback when regex-based menu parsing fails.

        Args:
            transcript: The IVR audio transcript.
            intent: What the user wants to accomplish.
            previous_selections: DTMF digits already pressed in this call.

        Returns:
            {"digit": "3", "reason": "Option 3 is for card cancellation",
             "confidence": 0.85}
        """
        system = (
            "You are an expert at navigating phone menus (IVR systems). "
            "Given an IVR transcript and the caller's intent, determine "
            "which menu option (DTMF digit) to press.\n\n"
            "Rules:\n"
            "- If there's a direct match for the intent, choose it.\n"
            "- If no direct match, choose 'speak to representative' or 'agent' option.\n"
            "- If menu says 'press 0 for operator', that's always a safe fallback.\n"
            "- Return the single digit to press.\n"
            "- If you truly can't determine the right option, return digit: null.\n"
        )

        context = f"IVR Transcript:\n{transcript}\n\n"
        context += f"Caller's Intent: {intent}\n"
        if previous_selections:
            context += f"Already pressed: {', '.join(previous_selections)}\n"
        context += "\nWhich digit should be pressed? Return JSON."

        result = await self.chat_json(context, system=system)

        # Normalize: always return the digit under "digit", as a string.
        if "digit" not in result:
            # Try to extract from various response formats
            for key in ["option", "press", "choice", "dtmf"]:
                if key in result:
                    result["digit"] = str(result[key])
                    break
        elif result["digit"] is not None:
            # Some models return the digit as an integer.
            result["digit"] = str(result["digit"])

        return result

    async def detect_human_speech(
        self,
        transcript: str,
        context: str = "",
    ) -> dict[str, Any]:
        """
        Analyze a transcript to determine if a human agent is speaking.

        Used as a secondary check when audio classifier detects speech
        but we need to distinguish between IVR prompts and a live human.

        Returns:
            {"is_human": true, "confidence": 0.9, "reason": "Agent greeting detected"}
        """
        system = (
            "You are analyzing a phone call transcript to determine if "
            "a live human agent is speaking (vs an automated IVR system).\n\n"
            "Human indicators:\n"
            "- Personal greeting ('Hi, my name is...')\n"
            "- Asking for account details\n"
            "- Conversational tone, filler words\n"
            "- Acknowledging hold time ('Thanks for waiting')\n"
            "\nIVR indicators:\n"
            "- 'Press N for...', 'Say...'\n"
            "- Robotic phrasing\n"
            "- Menu options\n"
            "- 'Your call is important to us'\n"
        )

        prompt = f"Transcript:\n{transcript}\n"
        if context:
            prompt += f"\nContext: {context}\n"
        prompt += "\nIs this a live human agent? Return JSON."

        return await self.chat_json(prompt, system=system)

    async def summarize_call(
        self,
        transcript_chunks: list[str],
        intent: str,
        duration_seconds: int,
    ) -> dict[str, Any]:
        """
        Generate a call summary from transcript chunks.

        Used for call history and analytics.

        Returns:
            {"summary": "...", "outcome": "resolved|unresolved|transferred",
             "key_info": [...], "sentiment": "positive|neutral|negative"}
        """
        system = (
            "Summarize this phone call concisely. Include:\n"
            "- What the caller wanted\n"
            "- What happened (IVR navigation, hold time, agent interaction)\n"
            "- The outcome\n"
            "Return as JSON with: summary, outcome, key_info (list), sentiment."
        )

        full_transcript = "\n".join(transcript_chunks)
        prompt = (
            f"Caller's intent: {intent}\n"
            f"Call duration: {duration_seconds} seconds\n\n"
            f"Full transcript:\n{full_transcript}\n\n"
            "Summarize this call."
        )

        return await self.chat_json(prompt, system=system)

    # ================================================================
    # Internal
    # ================================================================

    async def _complete(
        self,
        messages: list[dict[str, str]],
        temperature: float = 0.3,
        max_tokens: int = 1024,
    ) -> str:
        """Execute a chat completion request.

        Returns the assistant text, or "" on any transport/API error
        (errors are counted in stats and logged, never raised).
        """
        self._total_requests += 1
        start = time.monotonic()

        try:
            payload = {
                "model": self.model,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
            }

            response = await self._client.post("/chat/completions", json=payload)
            response.raise_for_status()

            data = response.json()

            # Track token usage
            if "usage" in data:
                self._total_tokens += data["usage"].get("total_tokens", 0)

            # Track latency as an exponential moving average.
            elapsed_ms = (time.monotonic() - start) * 1000
            if self._avg_latency_ms == 0.0:
                # Seed with the first observation instead of decaying up
                # from 0, which would understate latency for many calls.
                self._avg_latency_ms = elapsed_ms
            else:
                self._avg_latency_ms = self._avg_latency_ms * 0.9 + elapsed_ms * 0.1

            # Extract response text
            choices = data.get("choices", [])
            if choices:
                return choices[0].get("message", {}).get("content", "")
            return ""

        except httpx.HTTPStatusError as e:
            self._total_errors += 1
            logger.error(f"LLM API error: {e.response.status_code} {e.response.text[:200]}")
            return ""
        except httpx.TimeoutException:
            self._total_errors += 1
            logger.error(f"LLM API timeout after {self.timeout}s")
            return ""
        except Exception as e:
            self._total_errors += 1
            logger.error(f"LLM client error: {e}")
            return ""

    @staticmethod
    def _parse_json_response(text: str) -> dict[str, Any]:
        """Parse JSON from LLM response, handling common formatting issues.

        Tries, in order: direct parse, markdown code fences, and the
        outermost {...} span. Returns {"error": ..., "raw": ...} when
        nothing parses.
        """
        text = text.strip()

        # Try direct parse
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        # Try extracting from markdown code block
        if "```" in text:
            # Find content between ```json and ``` or ``` and ```
            parts = text.split("```")
            for i, part in enumerate(parts):
                if i % 2 == 1:  # Odd indices are inside code blocks
                    content = part.strip()
                    # Remove optional language tag
                    if content.startswith("json"):
                        content = content.removeprefix("json").strip()
                    try:
                        return json.loads(content)
                    except json.JSONDecodeError:
                        continue

        # Try finding JSON object in the text
        brace_start = text.find("{")
        brace_end = text.rfind("}")
        if brace_start != -1 and brace_end != -1:
            try:
                return json.loads(text[brace_start : brace_end + 1])
            except json.JSONDecodeError:
                pass

        logger.warning(f"Failed to parse JSON from LLM response: {text[:200]}")
        return {"error": "Failed to parse JSON response", "raw": text[:500]}

    # ================================================================
    # Stats
    # ================================================================

    @property
    def stats(self) -> dict:
        """Cumulative request/token/error counters and EMA latency."""
        return {
            "total_requests": self._total_requests,
            "total_tokens": self._total_tokens,
            "total_errors": self._total_errors,
            "avg_latency_ms": round(self._avg_latency_ms, 1),
            "model": self.model,
            "base_url": self.base_url,
        }
|
||||
256
services/notification.py
Normal file
256
services/notification.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
Notification Service — Tell the user what's happening.
|
||||
|
||||
Sends notifications when:
|
||||
- A human picks up (TRANSFER NOW!)
|
||||
- Hold time estimates change
|
||||
- Call fails or times out
|
||||
- IVR navigation milestones
|
||||
|
||||
Supports multiple channels: WebSocket (always), SMS (optional),
|
||||
push notifications (future).
|
||||
"""
|
||||
|
||||
import asyncio
import logging
from datetime import datetime
from enum import Enum
from typing import Any, Optional

from pydantic import BaseModel, Field

from config import Settings
from core.event_bus import EventBus
from models.events import EventType, GatewayEvent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NotificationChannel(str, Enum):
    """Where to send notifications."""

    WEBSOCKET = "websocket"  # dashboard/WebSocket clients (always used)
    SMS = "sms"  # text message — only when an SMS number is configured
    PUSH = "push"  # mobile push — not implemented yet (future)
|
||||
|
||||
|
||||
class NotificationPriority(str, Enum):
    """How urgently to deliver.

    CRITICAL notifications are additionally delivered via SMS when an
    SMS number is configured (see NotificationService._send).
    """

    LOW = "low"  # Status updates, hold time estimates
    NORMAL = "normal"  # IVR navigation milestones
    HIGH = "high"  # Human detected, call failed
    CRITICAL = "critical"  # Transfer happening NOW
|
||||
|
||||
|
||||
class Notification(BaseModel):
    """A notification to send to the user.

    Carries the delivery channel, urgency, display text, and the
    originating event payload.
    """

    channel: NotificationChannel
    priority: NotificationPriority
    title: str
    message: str
    call_id: Optional[str] = None
    # default_factory so each instance gets its own dict / its own
    # creation time. A plain `datetime.now()` default is evaluated once
    # at class-definition time, stamping every notification with the
    # process start time.
    data: dict[str, Any] = Field(default_factory=dict)
    timestamp: datetime = Field(default_factory=datetime.now)
|
||||
|
||||
|
||||
class NotificationService:
    """
    Sends notifications to users about call events.

    Listens to the EventBus and routes events to the
    appropriate notification channels.
    """

    def __init__(self, event_bus: EventBus, settings: Settings):
        self._event_bus = event_bus
        self._settings = settings
        self._task: Optional[asyncio.Task] = None
        self._sms_sender: Optional[Any] = None

        # Track what we've already notified (avoid spam).
        # call_id -> set of dedup keys ("<event type>:<step id>")
        self._notified: dict[str, set[str]] = {}

    async def start(self) -> None:
        """Start listening for events to notify on."""
        self._task = asyncio.create_task(self._listen_loop())
        logger.info("📢 Notification service started")

    async def stop(self) -> None:
        """Stop the notification listener."""
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
        logger.info("📢 Notification service stopped")

    async def _listen_loop(self) -> None:
        """Main event listener loop.

        Per-event errors are logged and swallowed so one bad event
        cannot kill the listener.
        """
        subscription = self._event_bus.subscribe()
        try:
            async for event in subscription:
                try:
                    await self._handle_event(event)
                except Exception as e:
                    logger.error(f"Notification handler error: {e}", exc_info=True)
        except asyncio.CancelledError:
            pass
        finally:
            subscription.close()

    async def _handle_event(self, event: GatewayEvent) -> None:
        """Route an event to the appropriate notification(s).

        Deduplicates per (call, event type, step) and cleans up the
        dedup state when the call ends.
        """
        call_id = event.call_id or ""

        # Skip duplicate notifications for the same call/step
        dedup_key = f"{event.type.value}:{event.data.get('step_id', '')}"
        if call_id and dedup_key in self._notified.get(call_id, set()):
            return

        notification = self._event_to_notification(event)
        if notification:
            # Mark as notified before sending
            if call_id:
                self._notified.setdefault(call_id, set()).add(dedup_key)
            await self._send(notification)

        # Drop dedup tracking once the call is over. This must happen
        # AFTER the CALL_ENDED notification is recorded and sent —
        # deleting the entry inside _event_to_notification (as before)
        # made the subsequent add() raise KeyError and the "Call Ended"
        # notification was never delivered.
        if call_id and event.type == EventType.CALL_ENDED:
            self._notified.pop(call_id, None)

    def _event_to_notification(self, event: GatewayEvent) -> Optional[Notification]:
        """Convert a gateway event to a notification (or None to skip).

        Pure mapping — no side effects on service state.
        """

        if event.type == EventType.HUMAN_DETECTED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.CRITICAL,
                title="🚨 Human Detected!",
                message="A live person picked up — transferring you now!",
                call_id=event.call_id,
                data=event.data,
            )

        elif event.type == EventType.TRANSFER_STARTED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.CRITICAL,
                title="📞 Call Transferred",
                message="Your call has been connected to the agent. Pick up your phone!",
                call_id=event.call_id,
                data=event.data,
            )

        elif event.type == EventType.CALL_FAILED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.HIGH,
                title="❌ Call Failed",
                message=event.message or "The call couldn't be completed.",
                call_id=event.call_id,
                data=event.data,
            )

        elif event.type == EventType.HOLD_DETECTED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.NORMAL,
                title="⏳ On Hold",
                message="You're on hold. We'll notify you when someone picks up.",
                call_id=event.call_id,
                data=event.data,
            )

        elif event.type == EventType.IVR_STEP:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.LOW,
                title="📍 IVR Navigation",
                message=event.message or "Navigating phone menu...",
                call_id=event.call_id,
                data=event.data,
            )

        elif event.type == EventType.IVR_DTMF_SENT:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.LOW,
                title="📱 Button Pressed",
                message=event.message or f"Pressed {event.data.get('digits', '?')}",
                call_id=event.call_id,
                data=event.data,
            )

        elif event.type == EventType.CALL_ENDED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.NORMAL,
                title="📴 Call Ended",
                message=event.message or "The call has ended.",
                call_id=event.call_id,
                data=event.data,
            )

        # Skip other event types (transcription, classification, etc.)
        return None

    async def _send(self, notification: Notification) -> None:
        """Send a notification via the appropriate channel."""
        logger.info(
            f"📢 [{notification.priority.value}] {notification.title}: "
            f"{notification.message}"
        )

        # WebSocket notifications go through the event bus
        # (the WebSocket handler in the API reads from EventBus directly)

        # SMS for critical notifications
        if (
            notification.priority == NotificationPriority.CRITICAL
            and self._settings.notify_sms_number
        ):
            await self._send_sms(notification)

    async def _send_sms(self, notification: Notification) -> None:
        """
        Send an SMS notification.

        Uses a simple HTTP-based SMS gateway. In production,
        this would use Twilio, AWS SNS, or similar.
        """
        phone = self._settings.notify_sms_number
        if not phone:
            return

        try:
            import httpx  # kept for the provider example below

            # Generic webhook-based SMS (configure your provider)
            # This is a placeholder — wire up your preferred SMS provider
            logger.info(f"📱 SMS → {phone}: {notification.title}")

            # Example: Twilio-style API
            # async with httpx.AsyncClient() as client:
            #     await client.post(
            #         "https://api.twilio.com/2010-04-01/Accounts/.../Messages.json",
            #         data={
            #             "To": phone,
            #             "From": self._settings.sip_trunk.did,
            #             "Body": f"{notification.title}\n{notification.message}",
            #         },
            #         auth=(account_sid, auth_token),
            #     )

        except Exception as e:
            logger.error(f"SMS send failed: {e}")
|
||||
230
services/recording.py
Normal file
230
services/recording.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Recording Service — Call recording management.
|
||||
|
||||
Records calls to WAV files via the PJSUA2 media pipeline,
|
||||
manages storage, and provides playback/download access.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RecordingService:
    """
    Manages call recordings.

    Features:
    - Start/stop recording for any active call leg
    - Dual-channel recording (separate caller/agent streams)
    - Mixed recording (both parties in one file)
    - WAV storage with organized directory structure
    - Recording metadata tracking
    """

    def __init__(
        self,
        storage_dir: str = "recordings",
        max_recording_seconds: int = 7200,  # 2 hours
        sample_rate: int = 16000,
    ):
        self._storage_dir = Path(storage_dir)
        self._max_recording_seconds = max_recording_seconds
        self._sample_rate = sample_rate
        # call_id -> in-flight session
        self._active_recordings: dict[str, RecordingSession] = {}
        # Finalized session metadata dicts, append-only
        self._metadata: list[dict] = []

    async def start(self) -> None:
        """Initialize the recording service (creates the storage dir)."""
        self._storage_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"🎙️ Recording service ready (storage: {self._storage_dir})")

    # ================================================================
    # Recording Lifecycle
    # ================================================================

    async def start_recording(
        self,
        call_id: str,
        media_pipeline=None,
        leg_ids: Optional[list[str]] = None,
        dual_channel: bool = False,
    ) -> "RecordingSession":
        """
        Start recording a call.

        Args:
            call_id: The call to record.
            media_pipeline: MediaPipeline instance for PJSUA2 recording.
            leg_ids: Specific SIP leg IDs to record. If None, records all legs.
            dual_channel: If True, record each party to a separate channel.

        Returns:
            RecordingSession with file paths and metadata.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        date_dir = datetime.now().strftime("%Y-%m-%d")
        recording_dir = self._storage_dir / date_dir
        recording_dir.mkdir(parents=True, exist_ok=True)

        if dual_channel:
            filepath_caller = str(recording_dir / f"{call_id}_{timestamp}_caller.wav")
            filepath_agent = str(recording_dir / f"{call_id}_{timestamp}_agent.wav")
            filepath_mixed = str(recording_dir / f"{call_id}_{timestamp}_mixed.wav")
        else:
            filepath_caller = None
            filepath_agent = None
            filepath_mixed = str(recording_dir / f"{call_id}_{timestamp}.wav")

        session = RecordingSession(
            call_id=call_id,
            filepath_mixed=filepath_mixed,
            filepath_caller=filepath_caller,
            filepath_agent=filepath_agent,
            started_at=datetime.now(),
            sample_rate=self._sample_rate,
        )
        # Remember which legs and which pipeline we started recorders on,
        # so stop_recording (and the safety timeout, which has no pipeline
        # argument) can tear them down. Previously leg_ids were never
        # stored, so stop_recording's leg loop was always empty.
        session._leg_ids = list(leg_ids or [])
        session._media_pipeline = media_pipeline

        # Start PJSUA2 recording if media pipeline is available
        if media_pipeline and leg_ids:
            for leg_id in leg_ids:
                if filepath_mixed:
                    media_pipeline.start_recording(leg_id, filepath_mixed)

        self._active_recordings[call_id] = session
        logger.info(f"🔴 Recording started: {call_id} → {filepath_mixed}")

        # Safety timeout
        asyncio.create_task(
            self._recording_timeout(call_id),
            name=f"rec_timeout_{call_id}",
        )

        return session

    async def stop_recording(
        self,
        call_id: str,
        media_pipeline=None,
    ) -> Optional["RecordingSession"]:
        """Stop recording a call and finalize the WAV file.

        Args:
            call_id: The call whose recording should be stopped.
            media_pipeline: Optional override; falls back to the pipeline
                captured at start_recording time.

        Returns:
            The finalized RecordingSession, or None if no recording
            was active for this call.
        """
        session = self._active_recordings.pop(call_id, None)
        if not session:
            logger.warning(f" No active recording for {call_id}")
            return None

        session.stopped_at = datetime.now()
        session.duration_seconds = int(
            (session.stopped_at - session.started_at).total_seconds()
        )

        # Stop PJSUA2 recording — fall back to the pipeline stored at
        # start time so the auto-timeout path also closes the recorders.
        pipeline = media_pipeline or getattr(session, "_media_pipeline", None)
        if pipeline:
            # The pipeline handles flushing and closing the WAV file
            for leg_id in (session._leg_ids or []):
                pipeline.stop_recording(leg_id)

        # Calculate file size
        if session.filepath_mixed and os.path.exists(session.filepath_mixed):
            session.file_size_bytes = os.path.getsize(session.filepath_mixed)

        # Store metadata
        self._metadata.append(session.to_dict())

        logger.info(
            f"⏹ Recording stopped: {call_id} "
            f"({session.duration_seconds}s, "
            f"{session.file_size_bytes or 0} bytes)"
        )
        return session

    async def _recording_timeout(self, call_id: str) -> None:
        """Auto-stop recording after max duration (safety net)."""
        await asyncio.sleep(self._max_recording_seconds)
        if call_id in self._active_recordings:
            logger.warning(f" Recording timeout for {call_id}, auto-stopping")
            await self.stop_recording(call_id)

    # ================================================================
    # Queries
    # ================================================================

    def get_recording(self, call_id: str) -> Optional[dict]:
        """Get the most recent recording metadata for a call."""
        for meta in reversed(self._metadata):
            if meta["call_id"] == call_id:
                return meta
        return None

    def list_recordings(
        self,
        limit: int = 50,
        offset: int = 0,
    ) -> list[dict]:
        """List recording metadata, newest first."""
        sorted_meta = sorted(
            self._metadata,
            key=lambda m: m.get("started_at", ""),
            reverse=True,
        )
        return sorted_meta[offset : offset + limit]

    @property
    def active_recording_count(self) -> int:
        # Number of recordings currently in progress
        return len(self._active_recordings)

    @property
    def total_recordings(self) -> int:
        # Number of finalized recordings tracked this process lifetime
        return len(self._metadata)

    def storage_usage_bytes(self) -> int:
        """Calculate total storage used by recordings (walks the dir)."""
        total = 0
        for root, _dirs, files in os.walk(self._storage_dir):
            for f in files:
                total += os.path.getsize(os.path.join(root, f))
        return total
|
||||
|
||||
|
||||
class RecordingSession:
    """Tracks a single active recording session.

    Holds the output file paths, timing, and size metadata for one
    call's recording; serialized via to_dict() for storage.
    """

    def __init__(
        self,
        call_id: str,
        filepath_mixed: Optional[str] = None,
        filepath_caller: Optional[str] = None,
        filepath_agent: Optional[str] = None,
        started_at: Optional[datetime] = None,
        sample_rate: int = 16000,
    ):
        self.call_id = call_id
        # Output file paths (caller/agent only used for dual-channel)
        self.filepath_mixed = filepath_mixed
        self.filepath_caller = filepath_caller
        self.filepath_agent = filepath_agent
        # Timing / size — filled in when the recording is stopped
        self.started_at = started_at if started_at is not None else datetime.now()
        self.stopped_at: Optional[datetime] = None
        self.duration_seconds: Optional[int] = None
        self.file_size_bytes: Optional[int] = None
        self.sample_rate = sample_rate
        # SIP legs being recorded (populated by the recording service)
        self._leg_ids: list[str] = []

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (datetimes as ISO strings)."""

        def iso(ts: Optional[datetime]) -> Optional[str]:
            return ts.isoformat() if ts else None

        return {
            "call_id": self.call_id,
            "filepath_mixed": self.filepath_mixed,
            "filepath_caller": self.filepath_caller,
            "filepath_agent": self.filepath_agent,
            "started_at": iso(self.started_at),
            "stopped_at": iso(self.stopped_at),
            "duration_seconds": self.duration_seconds,
            "file_size_bytes": self.file_size_bytes,
            "sample_rate": self.sample_rate,
        }
|
||||
161
services/transcription.py
Normal file
161
services/transcription.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
Transcription Service — Speaches STT integration.
|
||||
|
||||
Sends audio to your Speaches instances for real-time speech-to-text.
|
||||
Used by the Hold Slayer to understand IVR prompts and detect menu options.
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from config import SpeachesSettings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TranscriptionService:
|
||||
"""
|
||||
Client for Speaches STT service.
|
||||
|
||||
Speaches exposes an OpenAI-compatible API:
|
||||
POST /v1/audio/transcriptions
|
||||
"""
|
||||
|
||||
def __init__(self, settings: SpeachesSettings):
    """Store the Speaches connection settings; the HTTP client itself
    is created lazily on first use (see _get_client)."""
    self.settings = settings
    # Lazily-created httpx client; recreated by _get_client if closed.
    self._client: Optional[httpx.AsyncClient] = None
|
||||
|
||||
async def _get_client(self) -> httpx.AsyncClient:
    """Return the cached HTTP client, creating a fresh one if it is
    missing or has been closed."""
    client = self._client
    if client is None or client.is_closed:
        client = httpx.AsyncClient(
            base_url=self.settings.url,
            timeout=httpx.Timeout(30.0, connect=5.0),
        )
        self._client = client
    return client
|
||||
|
||||
async def transcribe(
|
||||
self,
|
||||
audio_data: bytes,
|
||||
language: str = "en",
|
||||
prompt: Optional[str] = None,
|
||||
) -> str:
|
||||
"""
|
||||
Transcribe audio data to text.
|
||||
|
||||
Args:
|
||||
audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)
|
||||
language: Language code (default: "en")
|
||||
prompt: Optional context hint for better accuracy
|
||||
(e.g., "IVR menu options, phone banking")
|
||||
|
||||
Returns:
|
||||
Transcribed text
|
||||
"""
|
||||
client = await self._get_client()
|
||||
|
||||
# Convert raw PCM to WAV format for the API
|
||||
wav_data = self._pcm_to_wav(audio_data)
|
||||
|
||||
try:
|
||||
response = await client.post(
|
||||
"/v1/audio/transcriptions",
|
||||
files={"file": ("audio.wav", wav_data, "audio/wav")},
|
||||
data={
|
||||
"model": self.settings.model,
|
||||
"language": language,
|
||||
"response_format": "text",
|
||||
**({"prompt": prompt} if prompt else {}),
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
text = response.text.strip()
|
||||
logger.debug(f"Transcription: '{text}'")
|
||||
return text
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
logger.error(f"Speaches API error: {e.response.status_code} {e.response.text}")
|
||||
return ""
|
||||
except httpx.ConnectError:
|
||||
logger.error(f"Cannot connect to Speaches at {self.settings.url}")
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.error(f"Transcription failed: {e}")
|
||||
return ""
|
||||
|
||||
async def transcribe_stream(
|
||||
self,
|
||||
audio_data: bytes,
|
||||
language: str = "en",
|
||||
):
|
||||
"""
|
||||
Stream transcription — for real-time results.
|
||||
|
||||
Uses Speaches streaming endpoint if available,
|
||||
falls back to chunked transcription.
|
||||
|
||||
Yields:
|
||||
str: Partial transcription chunks
|
||||
"""
|
||||
# For now, do chunked transcription
|
||||
# TODO: Implement WebSocket streaming when Speaches supports it
|
||||
chunk_size = 16000 * 2 * 3 # 3 seconds of 16kHz 16-bit mono
|
||||
|
||||
for i in range(0, len(audio_data), chunk_size):
|
||||
chunk = audio_data[i:i + chunk_size]
|
||||
if len(chunk) > 0:
|
||||
text = await self.transcribe(chunk, language)
|
||||
if text:
|
||||
yield text
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close the HTTP client."""
|
||||
if self._client and not self._client.is_closed:
|
||||
await self._client.aclose()
|
||||
self._client = None
|
||||
|
||||
@staticmethod
|
||||
def _pcm_to_wav(pcm_data: bytes, sample_rate: int = 16000, channels: int = 1, sample_width: int = 2) -> bytes:
|
||||
"""
|
||||
Convert raw PCM data to WAV format.
|
||||
|
||||
Args:
|
||||
pcm_data: Raw PCM audio bytes
|
||||
sample_rate: Sample rate in Hz (default: 16000)
|
||||
channels: Number of channels (default: 1 = mono)
|
||||
sample_width: Bytes per sample (default: 2 = 16-bit)
|
||||
|
||||
Returns:
|
||||
WAV file as bytes
|
||||
"""
|
||||
import struct
|
||||
|
||||
data_size = len(pcm_data)
|
||||
file_size = 36 + data_size # Header is 44 bytes, minus 8 for RIFF header
|
||||
|
||||
wav = io.BytesIO()
|
||||
|
||||
# RIFF header
|
||||
wav.write(b"RIFF")
|
||||
wav.write(struct.pack("<I", file_size))
|
||||
wav.write(b"WAVE")
|
||||
|
||||
# fmt chunk
|
||||
wav.write(b"fmt ")
|
||||
wav.write(struct.pack("<I", 16)) # Chunk size
|
||||
wav.write(struct.pack("<H", 1)) # PCM format
|
||||
wav.write(struct.pack("<H", channels))
|
||||
wav.write(struct.pack("<I", sample_rate))
|
||||
wav.write(struct.pack("<I", sample_rate * channels * sample_width)) # Byte rate
|
||||
wav.write(struct.pack("<H", channels * sample_width)) # Block align
|
||||
wav.write(struct.pack("<H", sample_width * 8)) # Bits per sample
|
||||
|
||||
# data chunk
|
||||
wav.write(b"data")
|
||||
wav.write(struct.pack("<I", data_size))
|
||||
wav.write(pcm_data)
|
||||
|
||||
return wav.getvalue()
|
||||
Reference in New Issue
Block a user