feat: add initial Hold Slayer AI telephony gateway implementation

Complete project scaffolding and core implementation of an AI-powered
telephony system that calls companies, navigates IVR menus, waits on
hold, and transfers to the user when a human answers.

Key components:
- FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces
- SIP/VoIP call management via PJSUA2 with RTP audio streaming
- LLM-powered IVR navigation using OpenAI/Anthropic with tool calling
- Hold detection service combining audio analysis and silence detection
- Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines
- Call recording with per-channel and mixed audio capture
- Event bus (asyncio pub/sub) for real-time client updates
- Web dashboard with live call monitoring
- SQLite persistence via SQLAlchemy with call history and analytics
- Notification support (email, SMS, webhook, desktop)
- Docker Compose deployment with Opal VoIP and Opal Media containers
- Comprehensive test suite with unit, integration, and E2E tests
- Simplified .gitignore and full project documentation in README
This commit is contained in:
2026-03-21 19:23:26 +00:00
parent c9ff60702b
commit ecf37658ce
56 changed files with 11601 additions and 164 deletions

1
services/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""AI services — hold detection, transcription, classification, and more."""

View File

@@ -0,0 +1,444 @@
"""
Audio Classifier — Spectral analysis for hold music, speech, and silence detection.
This is the brain of the Hold Slayer. It analyzes audio in real-time to determine:
- Is this hold music?
- Is this an IVR prompt (automated voice)?
- Is this a live human?
- Is this silence?
- Is this a ring-back tone?
Uses spectral analysis (librosa/numpy) to classify audio without needing
a trained ML model — just signal processing and heuristics.
"""
import logging
import time
from typing import Optional
import numpy as np
from config import ClassifierSettings
from models.call import AudioClassification, ClassificationResult
logger = logging.getLogger(__name__)
# Audio constants
SAMPLE_RATE = 16000 # 16kHz mono
FRAME_SIZE = SAMPLE_RATE * 2 # 16-bit samples = 2 bytes per sample
class AudioClassifier:
    """
    Real-time audio classifier using spectral analysis.

    Classification strategy:
    - Silence: Low RMS energy
    - Music: High spectral flatness + sustained tonal content + rhythm
    - IVR prompt: Speech-like spectral envelope but repetitive/synthetic
    - Live human: Speech-like spectral envelope + natural variation
    - Ringing: Very tonal, specific frequencies (~440Hz, ~480Hz for NA ring)
    - DTMF: Dual-tone detection at known DTMF frequencies
    """

    def __init__(self, settings: "ClassifierSettings"):
        self.settings = settings
        # NOTE(review): the window buffer is reserved for cross-chunk
        # windowed analysis but is not consumed by classify_chunk yet.
        self._window_buffer: list[bytes] = []
        self._window_samples = int(settings.window_seconds * SAMPLE_RATE)
        # Recent classifications, consumed by _looks_like_live_human and
        # detect_hold_to_human_transition for pattern detection.
        self._classification_history: list["AudioClassification"] = []

    def classify_chunk(self, audio_data: bytes) -> "ClassificationResult":
        """
        Classify a chunk of audio data.

        Args:
            audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)

        Returns:
            ClassificationResult with type and confidence
        """
        # Convert bytes to numpy array. Drop a trailing odd byte first:
        # np.frombuffer raises ValueError if the buffer is not a whole
        # number of int16 samples (e.g. a torn RTP frame).
        usable = len(audio_data) - (len(audio_data) % 2)
        samples = np.frombuffer(audio_data[:usable], dtype=np.int16).astype(np.float32)
        if len(samples) == 0:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.SILENCE,
                confidence=1.0,
            )
        # Normalize to [-1.0, 1.0]
        samples = samples / 32768.0
        # Run all detectors
        rms = self._compute_rms(samples)
        spectral_flatness = self._compute_spectral_flatness(samples)
        zcr = self._compute_zero_crossing_rate(samples)
        dominant_freq = self._compute_dominant_frequency(samples)
        spectral_centroid = self._compute_spectral_centroid(samples)
        is_tonal = self._detect_tonality(samples)
        # Build feature dict for debugging
        features = {
            "rms": float(rms),
            "spectral_flatness": float(spectral_flatness),
            "zcr": float(zcr),
            "dominant_freq": float(dominant_freq),
            "spectral_centroid": float(spectral_centroid),
            "is_tonal": is_tonal,
        }
        # === Classification Logic ===
        # 1. Silence detection
        if rms < 0.01:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.SILENCE,
                confidence=min(1.0, (0.01 - rms) / 0.01 + 0.5),
                details=features,
            )
        # 2. DTMF detection (very specific dual-tone pattern)
        dtmf_result = self._detect_dtmf(samples)
        if dtmf_result:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.DTMF,
                confidence=0.95,
                details={**features, "dtmf_digit": dtmf_result},
            )
        # 3. Ring-back tone detection (440+480Hz in NA, periodic on/off)
        if is_tonal and 400 < dominant_freq < 520 and rms > 0.02:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.RINGING,
                confidence=0.8,
                details=features,
            )
        # 4. Music vs Speech discrimination
        # Music: higher spectral flatness, more tonal, wider spectral spread
        # Speech: lower spectral flatness, concentrated energy, variable ZCR
        music_score = self._compute_music_score(
            spectral_flatness, is_tonal, spectral_centroid, zcr, rms
        )
        speech_score = self._compute_speech_score(
            spectral_flatness, zcr, spectral_centroid, rms
        )
        # 5. If it's speech-like, is it live or automated?
        if speech_score > music_score:
            # Use history to distinguish live human from IVR
            # IVR: repetitive patterns, synthetic prosody
            # Human: natural variation, conversational rhythm
            if self._looks_like_live_human(speech_score, zcr, rms):
                return ClassificationResult(
                    timestamp=time.time(),
                    audio_type=AudioClassification.LIVE_HUMAN,
                    confidence=speech_score,
                    details=features,
                )
            else:
                return ClassificationResult(
                    timestamp=time.time(),
                    audio_type=AudioClassification.IVR_PROMPT,
                    confidence=speech_score * 0.8,
                    details=features,
                )
        # 6. Music (hold music)
        if music_score >= self.settings.music_threshold:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.MUSIC,
                confidence=music_score,
                details=features,
            )
        # 7. Unknown / low confidence
        return ClassificationResult(
            timestamp=time.time(),
            audio_type=AudioClassification.UNKNOWN,
            confidence=max(music_score, speech_score),
            details=features,
        )

    # ================================================================
    # Feature Extraction
    # ================================================================
    @staticmethod
    def _compute_rms(samples: np.ndarray) -> float:
        """Root Mean Square — overall energy level."""
        return float(np.sqrt(np.mean(samples ** 2)))

    @staticmethod
    def _compute_spectral_flatness(samples: np.ndarray) -> float:
        """
        Spectral flatness (Wiener entropy).
        Close to 1.0 = noise-like (white noise)
        Close to 0.0 = tonal (pure tone, music)
        Speech is typically 0.1-0.4, music 0.05-0.3
        """
        fft = np.abs(np.fft.rfft(samples))
        fft = fft[fft > 0]  # Avoid log(0)
        if len(fft) == 0:
            return 0.0
        geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
        arithmetic_mean = np.mean(fft)
        if arithmetic_mean == 0:
            return 0.0
        return float(geometric_mean / arithmetic_mean)

    @staticmethod
    def _compute_zero_crossing_rate(samples: np.ndarray) -> float:
        """
        Zero-crossing rate — how often the signal crosses zero.
        Higher for unvoiced speech and noise.
        Lower for voiced speech and tonal music.
        """
        # Guard against division by zero when called directly with an empty
        # array (classify_chunk already filters out empty chunks).
        if len(samples) == 0:
            return 0.0
        crossings = np.sum(np.abs(np.diff(np.sign(samples)))) / 2
        return float(crossings / len(samples))

    @staticmethod
    def _compute_dominant_frequency(samples: np.ndarray) -> float:
        """Find the dominant frequency in the signal."""
        fft = np.abs(np.fft.rfft(samples))
        freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)
        # Ignore DC and very low frequencies
        mask = freqs > 50
        if not np.any(mask):
            return 0.0
        fft_masked = fft[mask]
        freqs_masked = freqs[mask]
        return float(freqs_masked[np.argmax(fft_masked)])

    @staticmethod
    def _compute_spectral_centroid(samples: np.ndarray) -> float:
        """
        Spectral centroid — "center of mass" of the spectrum.
        Higher for bright/treble sounds, lower for bass-heavy sounds.
        Speech typically 500-4000Hz, music varies widely.
        """
        fft = np.abs(np.fft.rfft(samples))
        freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)
        total_energy = np.sum(fft)
        if total_energy == 0:
            return 0.0
        return float(np.sum(freqs * fft) / total_energy)

    @staticmethod
    def _detect_tonality(samples: np.ndarray) -> bool:
        """
        Check if the signal is strongly tonal (has clear pitch).
        Uses autocorrelation.
        """
        # Autocorrelation
        correlation = np.correlate(samples, samples, mode="full")
        correlation = correlation[len(correlation) // 2:]
        # Normalize
        if correlation[0] == 0:
            return False
        correlation = correlation / correlation[0]
        # Look for a strong peak (indicating periodicity) in the lag range
        # corresponding to ~50Hz-1000Hz fundamentals; shorter lags (higher
        # frequencies) are skipped.
        min_lag = int(SAMPLE_RATE / 1000)  # ~16 samples (1000Hz max)
        max_lag = int(SAMPLE_RATE / 50)  # ~320 samples (50Hz min)
        search_region = correlation[min_lag:max_lag]
        if len(search_region) == 0:
            return False
        peak_value = np.max(search_region)
        return bool(peak_value > 0.5)

    def _detect_dtmf(self, samples: np.ndarray) -> Optional[str]:
        """
        Detect DTMF tones using Goertzel algorithm (simplified).
        DTMF frequencies:
            697, 770, 852, 941 Hz (row)
            1209, 1336, 1477, 1633 Hz (column)

        Returns:
            The detected digit ("0"-"9", "*", "#", "A"-"D"), or None.
        """
        dtmf_freqs_low = [697, 770, 852, 941]
        dtmf_freqs_high = [1209, 1336, 1477, 1633]
        dtmf_map = {
            (697, 1209): "1", (697, 1336): "2", (697, 1477): "3", (697, 1633): "A",
            (770, 1209): "4", (770, 1336): "5", (770, 1477): "6", (770, 1633): "B",
            (852, 1209): "7", (852, 1336): "8", (852, 1477): "9", (852, 1633): "C",
            (941, 1209): "*", (941, 1336): "0", (941, 1477): "#", (941, 1633): "D",
        }
        # Compute power at each DTMF frequency via the Goertzel recurrence.
        def goertzel_power(freq: int) -> float:
            k = int(0.5 + len(samples) * freq / SAMPLE_RATE)
            w = 2 * np.pi * k / len(samples)
            coeff = 2 * np.cos(w)
            s0, s1, s2 = 0.0, 0.0, 0.0
            for sample in samples:
                s0 = sample + coeff * s1 - s2
                s2 = s1
                s1 = s0
            return float(s1 * s1 + s2 * s2 - coeff * s1 * s2)
        # Find strongest low and high frequencies
        low_powers = [(f, goertzel_power(f)) for f in dtmf_freqs_low]
        high_powers = [(f, goertzel_power(f)) for f in dtmf_freqs_high]
        best_low = max(low_powers, key=lambda x: x[1])
        best_high = max(high_powers, key=lambda x: x[1])
        # Threshold: both frequencies must be significantly present
        total_power = np.sum(samples ** 2)
        if total_power == 0:
            return None
        threshold = total_power * 0.1
        if best_low[1] > threshold and best_high[1] > threshold:
            key = (best_low[0], best_high[0])
            return dtmf_map.get(key)
        return None

    # ================================================================
    # Higher-Level Classification
    # ================================================================
    def _compute_music_score(
        self,
        spectral_flatness: float,
        is_tonal: bool,
        spectral_centroid: float,
        zcr: float,
        rms: float,
    ) -> float:
        """Compute a music likelihood score (0.0 - 1.0)."""
        score = 0.0
        # Music tends to be tonal
        if is_tonal:
            score += 0.3
        # Music has moderate spectral flatness (more than pure tone, less than noise)
        if 0.05 < spectral_flatness < 0.4:
            score += 0.2
        # Music has sustained energy
        if rms > 0.03:
            score += 0.15
        # Music has wider spectral content than speech
        if spectral_centroid > 1500:
            score += 0.15
        # Music tends to have lower ZCR than noise
        if zcr < 0.15:
            score += 0.2
        return min(1.0, score)

    def _compute_speech_score(
        self,
        spectral_flatness: float,
        zcr: float,
        spectral_centroid: float,
        rms: float,
    ) -> float:
        """Compute a speech likelihood score (0.0 - 1.0)."""
        score = 0.0
        # Speech has moderate spectral flatness
        if 0.1 < spectral_flatness < 0.5:
            score += 0.25
        # Speech centroid typically 500-4000 Hz
        if 500 < spectral_centroid < 4000:
            score += 0.25
        # Speech has moderate ZCR
        if 0.02 < zcr < 0.2:
            score += 0.25
        # Speech has moderate energy
        if 0.01 < rms < 0.5:
            score += 0.25
        return min(1.0, score)

    def _looks_like_live_human(
        self,
        speech_score: float,
        zcr: float,
        rms: float,
    ) -> bool:
        """
        Distinguish live human from IVR/TTS.
        Heuristics:
        - IVR prompts are followed by silence (waiting for input)
        - Live humans have more natural variation in energy and pitch
        - After hold music → speech transition, it's likely a human
        This is the hardest classification and benefits most from
        the transcript context (Speaches STT).
        """
        # Look at recent classification history
        recent = self._classification_history[-10:] if self._classification_history else []
        # Key signal: if we were just listening to hold music and now
        # hear speech, it's very likely a live human agent
        if recent:
            if AudioClassification.MUSIC in recent[-5:]:
                # Transition from music to speech = agent picked up!
                return True
        # High speech score with good energy = more likely human
        if speech_score > 0.7 and rms > 0.05:
            return True
        # Default: assume IVR until proven otherwise
        return False

    def update_history(self, classification: "AudioClassification") -> None:
        """Track classification history for pattern detection."""
        self._classification_history.append(classification)
        # Keep last 100 classifications
        if len(self._classification_history) > 100:
            self._classification_history = self._classification_history[-100:]

    def detect_hold_to_human_transition(self) -> bool:
        """
        Detect the critical moment: hold music → live human.
        Looks for pattern: MUSIC, MUSIC, MUSIC, ..., SPEECH/LIVE_HUMAN
        """
        recent = self._classification_history[-20:]
        if len(recent) < 5:
            return False
        # Count recent music vs speech
        music_count = sum(1 for c in recent[:-3] if c == AudioClassification.MUSIC)
        speech_count = sum(
            1 for c in recent[-3:]
            if c in (AudioClassification.LIVE_HUMAN, AudioClassification.IVR_PROMPT)
        )
        # If we had a lot of music and now have speech, someone picked up
        return music_count >= 3 and speech_count >= 2

324
services/call_analytics.py Normal file
View File

@@ -0,0 +1,324 @@
"""
Call Analytics Service — Tracks call metrics and generates insights.
Monitors call patterns, hold times, success rates, and IVR navigation
efficiency. Provides data for the dashboard and API.
"""
import logging
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Any, Optional
from models.call import ActiveCall, AudioClassification, CallMode, CallStatus
logger = logging.getLogger(__name__)
class CallAnalytics:
    """
    In-memory call analytics engine.

    Keeps a bounded rolling list of finished-call records plus per-number
    aggregates, and answers summary/trend queries for the dashboard and API.

    Tracked dimensions:
    - Call success/failure rates
    - Hold time statistics (avg, min, max, p95)
    - IVR navigation efficiency
    - Human detection accuracy
    - Per-number/company patterns
    - Time-of-day patterns

    In production, this would be backed by TimescaleDB or similar.
    For now, we keep rolling windows in memory.
    """

    def __init__(self, max_history: int = 10000):
        self._max_history = max_history
        self._call_records: list["CallRecord"] = []
        # Per-number aggregates; defaultdict so first use auto-creates stats.
        self._company_stats: dict[str, "CompanyStats"] = defaultdict(CompanyStats)

    # ================================================================
    # Record Calls
    # ================================================================
    def record_call(self, call: "ActiveCall") -> None:
        """
        Record a completed call for analytics.
        Called when a call ends (from CallManager).
        """
        rec = CallRecord(
            call_id=call.id,
            remote_number=call.remote_number,
            mode=call.mode,
            status=call.status,
            intent=call.intent,
            started_at=call.created_at,
            duration_seconds=call.duration,
            hold_time_seconds=call.hold_time,
            classification_history=[
                c.audio_type.value for c in call.classification_history
            ],
            transcript_chunks=list(call.transcript_chunks),
            services=list(call.services),
        )
        self._call_records.append(rec)
        # Bound memory: keep only the newest max_history records.
        overflow = len(self._call_records) - self._max_history
        if overflow > 0:
            del self._call_records[:overflow]
        # Fold into per-company aggregates.
        self._company_stats[self._normalize_number(call.remote_number)].update(rec)
        logger.debug(
            f"📊 Recorded call {call.id}: "
            f"{call.status.value}, {call.duration}s, hold={call.hold_time}s"
        )

    # ================================================================
    # Aggregate Stats
    # ================================================================
    def get_summary(self, hours: int = 24) -> dict[str, Any]:
        """Get summary statistics for the last N hours."""
        since = datetime.now() - timedelta(hours=hours)
        window = [r for r in self._call_records if r.started_at >= since]
        if not window:
            return {
                "period_hours": hours,
                "total_calls": 0,
                "success_rate": 0.0,
                "avg_hold_time": 0.0,
                "avg_duration": 0.0,
            }
        ok_statuses = (
            CallStatus.COMPLETED, CallStatus.BRIDGED, CallStatus.HUMAN_DETECTED,
        )
        total = len(window)
        successful = sum(r.status in ok_statuses for r in window)
        failed = sum(r.status == CallStatus.FAILED for r in window)
        hold_times = [r.hold_time_seconds for r in window if r.hold_time_seconds > 0]
        durations = [r.duration_seconds for r in window if r.duration_seconds > 0]
        slayer_calls = [r for r in window if r.mode == CallMode.HOLD_SLAYER]
        slayer_ok = sum(
            r.status in (CallStatus.BRIDGED, CallStatus.HUMAN_DETECTED)
            for r in slayer_calls
        )
        return {
            "period_hours": hours,
            "total_calls": total,
            "successful": successful,
            "failed": failed,
            "success_rate": round(successful / total, 3) if total else 0.0,
            "avg_duration": round(sum(durations) / len(durations), 1) if durations else 0.0,
            "max_duration": max(durations) if durations else 0,
            "hold_time": {
                "avg": round(sum(hold_times) / len(hold_times), 1) if hold_times else 0.0,
                "min": min(hold_times) if hold_times else 0,
                "max": max(hold_times) if hold_times else 0,
                "p95": self._percentile(hold_times, 95) if hold_times else 0,
                "total": sum(hold_times),
            },
            "hold_slayer": {
                "total": len(slayer_calls),
                "success": slayer_ok,
                "success_rate": round(
                    slayer_ok / len(slayer_calls), 3
                ) if slayer_calls else 0.0,
            },
            "by_mode": self._group_by_mode(window),
            "by_hour": self._group_by_hour(window),
        }

    def get_company_stats(self, number: str) -> dict[str, Any]:
        """Get stats for a specific company/number."""
        # .get() (not indexing) so the defaultdict doesn't create an entry.
        stats = self._company_stats.get(self._normalize_number(number))
        if stats is None:
            return {"number": number, "total_calls": 0}
        return stats.to_dict(number)

    def get_top_numbers(self, limit: int = 10) -> list[dict[str, Any]]:
        """Get the most-called numbers with their stats."""
        ranked = sorted(
            self._company_stats.items(),
            key=lambda item: item[1].total_calls,
            reverse=True,
        )
        return [stats.to_dict(number) for number, stats in ranked[:limit]]

    # ================================================================
    # Hold Time Trends
    # ================================================================
    def get_hold_time_trend(
        self,
        number: Optional[str] = None,
        days: int = 7,
    ) -> list[dict]:
        """
        Get hold time trend data for graphing.
        Returns daily average hold times for the last N days.
        """
        since = datetime.now() - timedelta(days=days)
        pool = [r for r in self._call_records if r.started_at >= since]
        if number:
            target = self._normalize_number(number)
            pool = [r for r in pool if self._normalize_number(r.remote_number) == target]
        # Bucket nonzero hold times by calendar day.
        per_day: dict[str, list[int]] = defaultdict(list)
        for rec in pool:
            if rec.hold_time_seconds > 0:
                per_day[rec.started_at.strftime("%Y-%m-%d")].append(rec.hold_time_seconds)
        # Emit one point per day, oldest first, zero-filled for quiet days.
        trend = []
        for offset in range(days - 1, -1, -1):
            day = (datetime.now() - timedelta(days=offset)).strftime("%Y-%m-%d")
            vals = per_day.get(day, [])
            trend.append({
                "date": day,
                "avg_hold_time": round(sum(vals) / len(vals), 1) if vals else 0,
                "call_count": len(vals),
                "max_hold_time": max(vals) if vals else 0,
            })
        return trend

    # ================================================================
    # Helpers
    # ================================================================
    @staticmethod
    def _normalize_number(number: str) -> str:
        """Normalize phone number for grouping (formatting stripped, last 10 digits)."""
        digits = "".join(filter(str.isdigit, number))
        return digits[-10:] if len(digits) >= 10 else digits

    @staticmethod
    def _percentile(values: list, pct: int) -> float:
        """Calculate a nearest-rank-style percentile value."""
        if not values:
            return 0.0
        ordered = sorted(values)
        pos = min(int(len(ordered) * pct / 100), len(ordered) - 1)
        return float(ordered[pos])

    @staticmethod
    def _group_by_mode(records: list["CallRecord"]) -> dict[str, int]:
        """Group call counts by mode."""
        counts: dict[str, int] = defaultdict(int)
        for rec in records:
            counts[rec.mode.value] += 1
        return dict(counts)

    @staticmethod
    def _group_by_hour(records: list["CallRecord"]) -> dict[int, int]:
        """Group call counts by hour of day."""
        counts: dict[int, int] = defaultdict(int)
        for rec in records:
            counts[rec.started_at.hour] += 1
        return dict(sorted(counts.items()))

    @property
    def total_calls_recorded(self) -> int:
        """Number of call records currently held in memory."""
        return len(self._call_records)
# ================================================================
# Data Models
# ================================================================
class CallRecord:
    """A completed call record for analytics.

    Plain value object: holds the final state of one call so the
    analytics engine can aggregate over it. All list-valued fields
    default to fresh empty lists when not supplied.
    """

    def __init__(
        self,
        call_id: str,
        remote_number: str,
        mode: "CallMode",
        status: "CallStatus",
        intent: Optional[str] = None,
        started_at: Optional[datetime] = None,
        duration_seconds: int = 0,
        hold_time_seconds: int = 0,
        classification_history: Optional[list[str]] = None,
        transcript_chunks: Optional[list[str]] = None,
        services: Optional[list[str]] = None,
    ):
        # Identity and outcome
        self.call_id = call_id
        self.remote_number = remote_number
        self.mode = mode
        self.status = status
        self.intent = intent
        # Timing: default "now" when the caller didn't supply a start time.
        self.started_at = datetime.now() if started_at is None else started_at
        self.duration_seconds = duration_seconds
        self.hold_time_seconds = hold_time_seconds
        # Per-call detail lists (never None after construction)
        self.classification_history = classification_history if classification_history else []
        self.transcript_chunks = transcript_chunks if transcript_chunks else []
        self.services = services if services else []
class CompanyStats:
    """Aggregated stats for a specific company/phone number.

    Mutable accumulator: update() folds each finished call in,
    to_dict() serializes the aggregates for the API/dashboard.
    """

    def __init__(self):
        self.total_calls = 0
        self.successful_calls = 0
        self.failed_calls = 0
        self.total_hold_time = 0
        self.hold_times: list[int] = []
        self.total_duration = 0
        self.last_called: Optional[datetime] = None
        self.intents: dict[str, int] = defaultdict(int)

    def update(self, record: "CallRecord") -> None:
        """Update stats with a new call record."""
        self.total_calls += 1
        self.total_duration += record.duration_seconds
        self.last_called = record.started_at
        # A call counts as a success if it completed, was bridged to the
        # user, or reached a detected human; only FAILED counts as failure.
        success_states = (
            CallStatus.COMPLETED,
            CallStatus.BRIDGED,
            CallStatus.HUMAN_DETECTED,
        )
        if record.status in success_states:
            self.successful_calls += 1
        elif record.status == CallStatus.FAILED:
            self.failed_calls += 1
        hold = record.hold_time_seconds
        if hold > 0:
            self.total_hold_time += hold
            self.hold_times.append(hold)
        if record.intent:
            self.intents[record.intent] += 1

    def to_dict(self, number: str) -> dict[str, Any]:
        """Serialize the aggregates, keyed for the given display number."""
        calls = self.total_calls
        holds = self.hold_times
        top = sorted(self.intents.items(), key=lambda kv: kv[1], reverse=True)[:5]
        return {
            "number": number,
            "total_calls": calls,
            "successful_calls": self.successful_calls,
            "failed_calls": self.failed_calls,
            "success_rate": round(self.successful_calls / calls, 3) if calls else 0.0,
            "avg_hold_time": round(self.total_hold_time / len(holds), 1) if holds else 0.0,
            "max_hold_time": max(holds) if holds else 0,
            "avg_duration": round(self.total_duration / calls, 1) if calls else 0.0,
            "last_called": self.last_called.isoformat() if self.last_called else None,
            "top_intents": dict(top),
        }

View File

@@ -0,0 +1,339 @@
"""
Call Flow Learner — Builds and refines call flows from exploration data.
When Hold Slayer runs in exploration mode, it discovers IVR steps.
This service takes those discoveries and:
1. Builds a CallFlow tree that can be reused next time
2. Merges new discoveries into existing flows (refining them)
3. Uses LLM to label steps and infer menu structure
Over time, each phone number builds up a reliable call flow
that makes future calls faster and more accurate.
"""
import logging
import re
from datetime import datetime
from typing import Any, Optional
from models.call_flow import ActionType, CallFlow, CallFlowStep
logger = logging.getLogger(__name__)
class CallFlowLearner:
"""
Learns IVR call flows from exploration data.
Usage:
learner = CallFlowLearner(llm_client=llm)
# After an exploration call completes:
flow = await learner.build_flow(
phone_number="+18005551234",
discovered_steps=steps_from_exploration,
intent="cancel my card",
)
# Next time we call, merge new discoveries:
updated = await learner.merge_discoveries(
existing_flow=flow,
new_steps=new_discoveries,
)
"""
def __init__(self, llm_client=None):
self._llm = llm_client
# ================================================================
# Build Flow from Exploration
# ================================================================
async def build_flow(
self,
phone_number: str,
discovered_steps: list[dict],
intent: Optional[str] = None,
company_name: Optional[str] = None,
) -> CallFlow:
"""
Build a CallFlow from exploration discoveries.
Args:
phone_number: The number that was called.
discovered_steps: List of step dicts from exploration mode:
[{"timestamp": ..., "audio_type": "ivr_prompt",
"transcript": "Press 1 for...", "action_taken": {"dtmf": "1"}}, ...]
intent: What the caller was trying to accomplish.
company_name: Optional company name for labeling.
Returns:
A CallFlow that can be stored and reused.
"""
logger.info(
f"🧠 Building call flow from {len(discovered_steps)} discoveries "
f"for {phone_number}"
)
# Phase 1: Extract meaningful steps (skip silence, ringing)
meaningful = [
s for s in discovered_steps
if s.get("audio_type") in ("ivr_prompt", "live_human", "music")
or s.get("action_taken")
]
if not meaningful:
logger.warning(" No meaningful steps discovered")
return self._empty_flow(phone_number, company_name)
# Phase 2: Convert discoveries to CallFlowSteps
flow_steps = []
for i, step in enumerate(meaningful):
flow_step = self._discovery_to_step(step, i, meaningful)
if flow_step:
flow_steps.append(flow_step)
# Phase 3: Link steps together (next_step pointers)
for i, step in enumerate(flow_steps[:-1]):
step.next_step = flow_steps[i + 1].id
# Phase 4: Use LLM to enhance step labels if available
if self._llm and flow_steps:
flow_steps = await self._llm_enhance_steps(flow_steps, intent)
# Build the flow
name = company_name or self._guess_company_name(phone_number)
flow = CallFlow(
id=f"flow_{phone_number.replace('+', '')}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
name=f"{name}{intent or 'General'}",
phone_number=phone_number,
description=f"Auto-learned flow for {name}. Intent: {intent or 'general'}",
steps=flow_steps,
tags=["auto-learned"],
notes=f"Learned from exploration on {datetime.now().isoformat()}",
times_used=1,
last_used=datetime.now(),
)
logger.info(
f" ✅ Built flow '{flow.name}' with {len(flow_steps)} steps"
)
return flow
def _discovery_to_step(
self,
discovery: dict,
index: int,
all_discoveries: list[dict],
) -> Optional[CallFlowStep]:
"""Convert a single exploration discovery to a CallFlowStep."""
audio_type = discovery.get("audio_type", "")
transcript = discovery.get("transcript", "")
action_taken = discovery.get("action_taken")
step_id = f"step_{index:03d}"
if audio_type == "ivr_prompt" and action_taken:
# IVR menu where we pressed a button
dtmf = action_taken.get("dtmf", "")
return CallFlowStep(
id=step_id,
description=self._summarize_menu(transcript) or f"IVR menu (pressed {dtmf})",
action=ActionType.DTMF,
action_value=dtmf,
expect=self._extract_expect_pattern(transcript),
timeout=15,
)
elif audio_type == "ivr_prompt" and not action_taken:
# IVR prompt we just listened to
return CallFlowStep(
id=step_id,
description=self._summarize_menu(transcript) or "IVR announcement",
action=ActionType.LISTEN,
timeout=30,
)
elif audio_type == "music":
# Hold music
return CallFlowStep(
id=step_id,
description="Hold music — waiting for agent",
action=ActionType.HOLD,
timeout=3600,
)
elif audio_type == "live_human":
# Human detected — this is the transfer point
return CallFlowStep(
id=step_id,
description="Live agent detected — transfer",
action=ActionType.TRANSFER,
action_value="preferred_device",
)
return None
# ================================================================
# Merge New Discoveries into Existing Flow
# ================================================================
async def merge_discoveries(
self,
existing_flow: CallFlow,
new_steps: list[dict],
intent: Optional[str] = None,
) -> CallFlow:
"""
Merge new exploration discoveries into an existing flow.
This refines the flow over time — updating timeouts,
confirming step order, adding alternative paths.
"""
logger.info(
f"🔄 Merging {len(new_steps)} new discoveries into "
f"flow '{existing_flow.name}'"
)
# Build a new flow from the discoveries
new_flow = await self.build_flow(
phone_number=existing_flow.phone_number,
discovered_steps=new_steps,
intent=intent,
)
# Simple merge strategy: keep existing steps but update timeouts
# and add any new steps that weren't in the original
existing_by_action = {
(s.action, s.action_value): s for s in existing_flow.steps
}
for new_step in new_flow.steps:
key = (new_step.action, new_step.action_value)
if key in existing_by_action:
# Update timeout to be the average
old_step = existing_by_action[key]
if old_step.timeout and new_step.timeout:
old_step.timeout = int(
(old_step.timeout + new_step.timeout) / 2
)
# New steps that don't exist are noted but not auto-added
# (to avoid corrupting a working flow)
# Update metadata
existing_flow.times_used = (existing_flow.times_used or 0) + 1
existing_flow.last_used = datetime.now()
logger.info(f" ✅ Merged. Flow now has {len(existing_flow.steps)} steps")
return existing_flow
# ================================================================
# LLM Enhancement
# ================================================================
async def _llm_enhance_steps(
self,
steps: list[CallFlowStep],
intent: Optional[str],
) -> list[CallFlowStep]:
"""Use LLM to improve step descriptions and structure."""
if not self._llm:
return steps
try:
# Build a summary of the steps for the LLM
step_descriptions = []
for s in steps:
desc = f"- {s.action.value}"
if s.action_value:
desc += f" ({s.action_value})"
if s.description:
desc += f": {s.description}"
step_descriptions.append(desc)
prompt = (
f"These are steps discovered while navigating a phone IVR system.\n"
f"Intent: {intent or 'general inquiry'}\n\n"
f"Steps:\n" + "\n".join(step_descriptions) + "\n\n"
f"For each step, provide a clear, concise description of what "
f"that step does. Return JSON array of objects with 'step_index' "
f"and 'description' fields."
)
result = await self._llm.chat_json(
prompt,
system="You are labeling IVR phone menu steps for a call flow database.",
)
# Apply LLM descriptions
if isinstance(result, list):
for item in result:
idx = item.get("step_index", -1)
desc = item.get("description", "")
if 0 <= idx < len(steps) and desc:
steps[idx].description = desc
elif isinstance(result, dict) and "steps" in result:
for item in result["steps"]:
idx = item.get("step_index", -1)
desc = item.get("description", "")
if 0 <= idx < len(steps) and desc:
steps[idx].description = desc
except Exception as e:
logger.warning(f" LLM enhancement failed (non-fatal): {e}")
return steps
# ================================================================
# Helpers
# ================================================================
@staticmethod
def _summarize_menu(transcript: str) -> Optional[str]:
"""Create a short summary of an IVR menu transcript."""
if not transcript:
return None
# Count how many options
options = re.findall(r'press\s+\d+', transcript.lower())
if options:
return f"IVR menu with {len(options)} options"
# Truncate long transcripts
if len(transcript) > 80:
return transcript[:77] + "..."
return transcript
@staticmethod
def _extract_expect_pattern(transcript: str) -> Optional[str]:
"""Extract a regex pattern to match this prompt next time."""
if not transcript:
return None
# Find the most distinctive phrase (>4 words, not generic)
words = transcript.split()
if len(words) >= 4:
# Use first meaningful phrase
phrase = " ".join(words[:6])
# Escape for regex
return re.escape(phrase.lower())
return None
@staticmethod
def _guess_company_name(phone_number: str) -> str:
"""Guess company name from phone number (placeholder)."""
# In production, this would do a reverse lookup
return f"Company {phone_number[-4:]}"
@staticmethod
def _empty_flow(phone_number: str, company_name: Optional[str]) -> CallFlow:
"""Create an empty flow placeholder."""
return CallFlow(
id=f"flow_{phone_number.replace('+', '')}_{datetime.now().strftime('%Y%m%d%H%M%S')}",
name=f"{company_name or phone_number} — Empty",
phone_number=phone_number,
description="Empty flow — no meaningful steps discovered",
steps=[],
tags=["auto-learned", "empty"],
)

717
services/hold_slayer.py Normal file
View File

@@ -0,0 +1,717 @@
"""
Hold Slayer Service — The main event.
Navigate IVR trees, wait on hold, detect when a human picks up,
and transfer you in. This is the state machine that orchestrates
the entire hold-slaying process.
Two modes:
1. run_with_flow(): Follow a stored call flow tree (fast, reliable)
2. run_exploration(): No stored flow — listen, transcribe, and figure it out
"""
import asyncio
import logging
import re
import time
from typing import Optional
from config import Settings
from core.call_manager import CallManager
from core.sip_engine import SIPEngine
from models.call import ActiveCall, AudioClassification, CallStatus, ClassificationResult
from models.call_flow import ActionType, CallFlow, CallFlowStep
from models.events import EventType, GatewayEvent
from services.audio_classifier import AudioClassifier
from services.transcription import TranscriptionService
logger = logging.getLogger(__name__)
# LLM client is optional — imported at use time
_llm_client = None
def _get_llm():
    """Lazily construct the shared LLM client; returns None when unavailable.

    Module-global cache semantics: ``None`` means "not tried yet",
    ``False`` means "tried and failed — don't retry", anything else is
    the live client instance.
    """
    global _llm_client
    if _llm_client is None:
        try:
            from config import get_settings
            from services.llm_client import LLMClient
            cfg = get_settings()
            _llm_client = LLMClient(
                base_url=cfg.llm.base_url,
                model=cfg.llm.model,
                api_key=cfg.llm.api_key,
                timeout=cfg.llm.timeout,
            )
        except Exception as e:
            logger.debug(f"LLM client not available: {e}")
            _llm_client = False  # Sentinel: don't retry
    return None if _llm_client is False else _llm_client
class HoldSlayerService:
    """
    The Hold Slayer.
    Navigates IVR menus, waits on hold, detects live humans,
    and transfers the call to your device.
    """
    def __init__(
        self,
        gateway,  # AIPSTNGateway (untyped to avoid a circular import)
        call_manager: CallManager,
        sip_engine: SIPEngine,
        classifier: AudioClassifier,
        transcription: TranscriptionService,
        settings: Settings,
    ):
        """Wire up collaborators; no I/O happens until run() is called.

        Args:
            gateway: Owning gateway; used for transfer_call().
            call_manager: Call state/status tracking and event publishing.
            sip_engine: Audio streaming and DTMF for the PSTN leg.
            classifier: Audio classifier (music / speech / silence / ring).
            transcription: Speech-to-text service.
            settings: Global settings (hold_slayer.* values are read here).
        """
        self.gateway = gateway
        self.call_manager = call_manager
        self.sip_engine = sip_engine
        self.classifier = classifier
        self.transcription = transcription
        self.settings = settings
    async def run(
        self,
        call: ActiveCall,
        sip_leg_id: str,
        call_flow_id: Optional[str] = None,
    ) -> bool:
        """
        Main entry point. Run the Hold Slayer on a call.
        Args:
            call: The active call to work on
            sip_leg_id: SIP leg ID for the PSTN call
            call_flow_id: Optional stored call flow to follow
        Returns:
            True if successfully transferred to user, False otherwise
            (cancellation and unexpected errors also return False).
        """
        logger.info(f"🗡️ Hold Slayer activated for {call.remote_number}")
        logger.info(f" Intent: {call.intent}")
        logger.info(f" Call Flow: {call_flow_id or 'exploration mode'}")
        try:
            # Wait for call to be connected
            await self._wait_for_connection(call, timeout=60)
            if call_flow_id:
                # Load the stored call flow from the database
                flow = await self._load_call_flow(call_flow_id)
                if flow:
                    return await self.run_with_flow(call, sip_leg_id, flow)
                else:
                    logger.warning(f"Call flow '{call_flow_id}' not found, switching to exploration")
            # No flow or flow not found — explore
            return await self.run_exploration(call, sip_leg_id)
        except asyncio.CancelledError:
            # Cancellation (user hangup / shutdown) is not a failure — just stop.
            logger.info(f"Hold Slayer cancelled for {call.id}")
            return False
        except Exception as e:
            # Any unexpected error marks the call FAILED so clients see it.
            logger.error(f"Hold Slayer error: {e}", exc_info=True)
            await self.call_manager.update_status(call.id, CallStatus.FAILED)
            return False
# ================================================================
# Mode 1: Follow a Stored Call Flow
# ================================================================
    async def run_with_flow(
        self,
        call: ActiveCall,
        sip_leg_id: str,
        flow: CallFlow,
    ) -> bool:
        """
        Navigate using a stored call flow tree.
        Falls back to exploration for unknown steps.

        Walks the flow's step graph from its first step, executing each
        step's action (HOLD / DTMF / WAIT / LISTEN / SPEAK / TRANSFER) and
        following `next_step` / `fallback_step` links.

        Returns:
            True once a TRANSFER step completes; False on hold timeout,
            a broken step link, or when the flow ends without transferring.
        """
        logger.info(f"📋 Following call flow: {flow.name}")
        steps = flow.steps_by_id()
        # Flows begin at their first listed step; an empty flow ends at once.
        current_step_id = flow.steps[0].id if flow.steps else None
        while current_step_id:
            step = steps.get(current_step_id)
            if not step:
                # Broken link in the stored flow — bail out rather than loop.
                logger.error(f"Step '{current_step_id}' not found in flow")
                break
            call.current_step_id = current_step_id
            logger.info(f"📍 Step: {step.description}")
            await self.call_manager.event_bus.publish(GatewayEvent(
                type=EventType.IVR_STEP,
                call_id=call.id,
                data={"step_id": step.id, "description": step.description, "action": step.action.value},
                message=f"📍 IVR Step: {step.description}",
            ))
            # === Execute the step based on its action type ===
            if step.action == ActionType.HOLD:
                # HOLD MODE: Audio classifier takes over
                await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)
                logger.info(f"⏳ On hold. Activating hold detection...")
                human_detected = await self._wait_for_human(
                    call, sip_leg_id, timeout=step.timeout
                )
                if human_detected:
                    current_step_id = step.next_step
                else:
                    logger.warning("⏰ Hold timeout reached!")
                    break
            elif step.action == ActionType.DTMF:
                # Wait for the expected prompt, then send DTMF
                await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)
                if step.expect:
                    heard = await self._wait_for_prompt(
                        call, sip_leg_id, step.expect, step.timeout
                    )
                    if not heard and step.fallback_step:
                        logger.info(f"⚠️ Didn't hear expected prompt, falling back")
                        current_step_id = step.fallback_step
                        continue
                # Send the DTMF digits
                if step.action_value:
                    await self.sip_engine.send_dtmf(sip_leg_id, step.action_value)
                    logger.info(f"📱 Pressed: {step.action_value}")
                    await self.call_manager.event_bus.publish(GatewayEvent(
                        type=EventType.IVR_DTMF_SENT,
                        call_id=call.id,
                        data={"digits": step.action_value, "step": step.id},
                        message=f"📱 DTMF sent: {step.action_value}",
                    ))
                # Small delay after DTMF for the IVR to process
                await asyncio.sleep(2.0)
                current_step_id = step.next_step
            elif step.action == ActionType.WAIT:
                # Just wait for a prompt (or a fixed delay when none expected)
                if step.expect:
                    await self._wait_for_prompt(
                        call, sip_leg_id, step.expect, step.timeout
                    )
                else:
                    await asyncio.sleep(step.timeout)
                current_step_id = step.next_step
            elif step.action == ActionType.LISTEN:
                # Listen and decide — regex first, LLM fallback
                await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)
                transcript = await self._listen_for_menu(
                    call, sip_leg_id, step.timeout
                )
                # Phase 1: Try regex-based keyword matching (fast, no API call)
                decision = self._decide_menu_option(
                    transcript, call.intent or "", step.expect
                )
                # Phase 2: LLM fallback if regex couldn't decide
                if not decision and transcript:
                    llm = _get_llm()
                    if llm:
                        try:
                            logger.info("🤖 Regex inconclusive, asking LLM...")
                            llm_result = await llm.analyze_ivr_menu(
                                transcript=transcript,
                                intent=call.intent or "",
                                previous_selections=list(call.dtmf_history) if hasattr(call, 'dtmf_history') else None,
                            )
                            decision = llm_result.get("digit")
                            if decision:
                                confidence = llm_result.get("confidence", 0)
                                reason = llm_result.get("reason", "")
                                logger.info(
                                    f"🤖 LLM decided: press {decision} "
                                    f"(confidence={confidence}, reason='{reason}')"
                                )
                        except Exception as e:
                            # LLM failures are non-fatal — fall through to default.
                            logger.warning(f"🤖 LLM fallback failed: {e}")
                if decision:
                    await self.sip_engine.send_dtmf(sip_leg_id, decision)
                    logger.info(f"🧠 Decided: press {decision} (heard: '{transcript[:60]}...')")
                else:
                    # Default: press 0 for agent
                    await self.sip_engine.send_dtmf(sip_leg_id, "0")
                    logger.info(f"🧠 No clear match, pressing 0 for agent")
                await asyncio.sleep(2.0)
                current_step_id = step.next_step
            elif step.action == ActionType.SPEAK:
                # Say something into the call (TTS)
                # TODO: Implement TTS integration
                logger.info(f"🗣️ Would say: '{step.action_value}' (TTS not yet implemented)")
                await asyncio.sleep(3.0)
                current_step_id = step.next_step
            elif step.action == ActionType.TRANSFER:
                # We did it! Transfer to user's device
                await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
                logger.info(f"🚨 TRANSFERRING TO {step.action_value}")
                device_target = step.action_value or call.device or self.settings.hold_slayer.default_transfer_device
                await self.gateway.transfer_call(call.id, device_target)
                return True
            else:
                logger.warning(f"Unknown action type: {step.action}")
                current_step_id = step.next_step
        # Flow exhausted (or broke) without reaching a TRANSFER step.
        return False
# ================================================================
# Mode 2: Exploration (No Stored Flow)
# ================================================================
    async def run_exploration(
        self,
        call: ActiveCall,
        sip_leg_id: str,
    ) -> bool:
        """
        No stored flow — explore the IVR blind.
        Records what it discovers so we can build a flow for next time.

        Loop: grab ~3s of audio, classify it, transcribe speech, then act —
        transfer on a live human, monitor while on hold, navigate menus
        (regex decision, else press 0 for an agent), and idle through
        silence/ringing. Gives up after hold_slayer.max_hold_time seconds.

        Returns:
            True if transferred to the user, False on timeout or call end.
        """
        logger.info(f"🔍 Exploration mode: discovering IVR for {call.remote_number}")
        await self.call_manager.update_status(call.id, CallStatus.NAVIGATING_IVR)
        # NOTE(review): discovered_steps is only logged on success and is
        # discarded on timeout — flow persistence presumably happens in the
        # learner service; confirm.
        discovered_steps: list[dict] = []
        max_time = self.settings.hold_slayer.max_hold_time
        start_time = time.time()
        while time.time() - start_time < max_time:
            # Check if call is still active
            current_call = self.call_manager.get_call(call.id)
            if not current_call or current_call.status in (
                CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
            ):
                break
            # Get audio and classify
            audio_chunk = b""
            try:
                async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
                    audio_chunk += chunk
                    if len(audio_chunk) >= 16000 * 2 * 3:  # 3 seconds
                        break
            except Exception as e:
                logger.error(f"Audio stream error: {e}")
                await asyncio.sleep(1.0)
                continue
            if not audio_chunk:
                await asyncio.sleep(1.0)
                continue
            # Classify the audio
            classification = self.classifier.classify_chunk(audio_chunk)
            self.classifier.update_history(classification.audio_type)
            await self.call_manager.add_classification(call.id, classification)
            # Transcribe if it sounds like speech
            transcript = ""
            if classification.audio_type in (
                AudioClassification.IVR_PROMPT,
                AudioClassification.LIVE_HUMAN,
            ):
                transcript = await self.transcription.transcribe(
                    audio_chunk,
                    prompt="Phone IVR menu, customer service, press 1 for..."
                )
                if transcript:
                    await self.call_manager.add_transcript(call.id, transcript)
            # Record discovery
            discovered_steps.append({
                "timestamp": time.time(),
                "audio_type": classification.audio_type.value,
                "confidence": classification.confidence,
                "transcript": transcript,
                "action_taken": None,
            })
            # === Decision Logic ===
            if classification.audio_type == AudioClassification.LIVE_HUMAN:
                # HUMAN DETECTED! Transfer!
                logger.info("🚨 LIVE HUMAN DETECTED!")
                await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
                device = call.device or self.settings.hold_slayer.default_transfer_device
                await self.gateway.transfer_call(call.id, device)
                logger.info(f"📋 Discovered {len(discovered_steps)} IVR steps")
                return True
            elif classification.audio_type == AudioClassification.MUSIC:
                # On hold — just keep monitoring
                if current_call.status != CallStatus.ON_HOLD:
                    await self.call_manager.update_status(call.id, CallStatus.ON_HOLD)
                # Check for hold→human transition
                if self.classifier.detect_hold_to_human_transition():
                    logger.info("🚨 Hold-to-human transition detected!")
                    await self.call_manager.update_status(call.id, CallStatus.HUMAN_DETECTED)
                    device = call.device or self.settings.hold_slayer.default_transfer_device
                    await self.gateway.transfer_call(call.id, device)
                    return True
            elif classification.audio_type == AudioClassification.IVR_PROMPT and transcript:
                # IVR menu — try to navigate
                decision = self._decide_menu_option(
                    transcript, call.intent or "", None
                )
                if decision:
                    await self.sip_engine.send_dtmf(sip_leg_id, decision)
                    discovered_steps[-1]["action_taken"] = {"dtmf": decision}
                    logger.info(f"🧠 Exploration: pressed {decision}")
                    await asyncio.sleep(2.0)
                else:
                    # Try pressing 0 for agent
                    await self.sip_engine.send_dtmf(sip_leg_id, "0")
                    discovered_steps[-1]["action_taken"] = {"dtmf": "0", "reason": "default_agent"}
                    logger.info("🧠 Exploration: pressed 0 (trying for agent)")
                    await asyncio.sleep(2.0)
            elif classification.audio_type == AudioClassification.SILENCE:
                # Silence — wait a bit
                await asyncio.sleep(2.0)
            elif classification.audio_type == AudioClassification.RINGING:
                # Still ringing
                await asyncio.sleep(1.0)
        logger.warning(f"Hold Slayer timed out after {max_time}s")
        return False
# ================================================================
# Core Detection Methods
# ================================================================
async def _wait_for_human(
self,
call: ActiveCall,
sip_leg_id: str,
timeout: int = 7200,
) -> bool:
"""
Wait on hold until a live human is detected.
Continuously classifies audio and watches for the
music → speech transition.
"""
check_interval = self.settings.hold_slayer.hold_check_interval
start_time = time.time()
while time.time() - start_time < timeout:
# Check if call is still active
current_call = self.call_manager.get_call(call.id)
if not current_call or current_call.status in (
CallStatus.COMPLETED, CallStatus.FAILED, CallStatus.CANCELLED
):
return False
# Get audio chunk
audio_chunk = b""
try:
async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
audio_chunk += chunk
if len(audio_chunk) >= int(16000 * 2 * check_interval):
break
except Exception:
await asyncio.sleep(check_interval)
continue
if not audio_chunk:
await asyncio.sleep(check_interval)
continue
# Classify
result = self.classifier.classify_chunk(audio_chunk)
self.classifier.update_history(result.audio_type)
await self.call_manager.add_classification(call.id, result)
# Check for human
if result.audio_type == AudioClassification.LIVE_HUMAN:
# Verify with transcription
transcript = await self.transcription.transcribe(audio_chunk)
if transcript:
await self.call_manager.add_transcript(call.id, transcript)
# If we got meaningful speech, it's probably a real person
if len(transcript.split()) >= 3:
logger.info(f"🚨 Human confirmed! Said: '{transcript[:100]}'")
return True
# Check for the music→speech transition pattern
if self.classifier.detect_hold_to_human_transition():
logger.info("🚨 Hold-to-human transition detected!")
return True
# Log progress periodically
elapsed = int(time.time() - start_time)
if elapsed > 0 and elapsed % 60 == 0:
logger.info(
f"⏳ Still on hold... {elapsed}s "
f"(audio: {result.audio_type.value}, {result.confidence:.0%})"
)
return False
    async def _wait_for_prompt(
        self,
        call: ActiveCall,
        sip_leg_id: str,
        expected_pattern: str,
        timeout: int = 30,
    ) -> bool:
        """
        Wait for an expected IVR prompt.
        Listens, transcribes, and checks if the transcript matches
        the expected pattern (regex or keywords).

        Args:
            call: Active call (transcripts are attached to it).
            sip_leg_id: SIP leg to pull audio from.
            expected_pattern: Regex matched case-insensitively; if it is
                not a valid regex it's used as a case-insensitive substring.
            timeout: Max seconds to keep listening.

        Returns:
            True once the prompt is heard, False on timeout.
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            audio_chunk = b""
            try:
                async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
                    audio_chunk += chunk
                    if len(audio_chunk) >= 16000 * 2 * 3:  # 3 seconds
                        break
            except Exception:
                await asyncio.sleep(1.0)
                continue
            if not audio_chunk:
                await asyncio.sleep(1.0)
                continue
            # Classify first — only transcribe chunks that contain speech
            result = self.classifier.classify_chunk(audio_chunk)
            if result.audio_type not in (
                AudioClassification.IVR_PROMPT,
                AudioClassification.LIVE_HUMAN,
            ):
                # Music/silence/ringing — skip transcription, keep listening.
                continue
            # Transcribe
            transcript = await self.transcription.transcribe(audio_chunk)
            if not transcript:
                continue
            await self.call_manager.add_transcript(call.id, transcript)
            # Check if it matches expected pattern
            try:
                if re.search(expected_pattern, transcript, re.IGNORECASE):
                    logger.info(f"✅ Heard expected: '{transcript[:80]}'")
                    return True
            except re.error:
                # Treat as keyword search if regex is invalid
                if expected_pattern.lower() in transcript.lower():
                    logger.info(f"✅ Heard expected: '{transcript[:80]}'")
                    return True
        logger.warning(f"⚠️ Didn't hear expected prompt within {timeout}s")
        return False
    async def _listen_for_menu(
        self,
        call: ActiveCall,
        sip_leg_id: str,
        timeout: int = 30,
    ) -> str:
        """Listen for an IVR menu and return the full transcript.

        Accumulates transcribed speech until silence follows speech (the
        menu prompt finished), the audio stream dries up, or `timeout`
        elapses. May return "" when nothing intelligible was heard.
        """
        transcript_parts: list[str] = []
        start_time = time.time()
        while time.time() - start_time < timeout:
            audio_chunk = b""
            try:
                async for chunk in self.sip_engine.get_audio_stream(sip_leg_id):
                    audio_chunk += chunk
                    if len(audio_chunk) >= 16000 * 2 * 5:  # 5 seconds
                        break
            except Exception:
                await asyncio.sleep(1.0)
                continue
            if not audio_chunk:
                # Stream produced nothing — stop rather than spin.
                break
            result = self.classifier.classify_chunk(audio_chunk)
            # If we're getting silence after speech, the menu prompt is done
            if result.audio_type == AudioClassification.SILENCE and transcript_parts:
                break
            if result.audio_type in (
                AudioClassification.IVR_PROMPT,
                AudioClassification.LIVE_HUMAN,
            ):
                text = await self.transcription.transcribe(audio_chunk)
                if text:
                    transcript_parts.append(text)
        full_transcript = " ".join(transcript_parts)
        if full_transcript:
            await self.call_manager.add_transcript(call.id, full_transcript)
        return full_transcript
async def _wait_for_connection(self, call: ActiveCall, timeout: int = 60) -> None:
"""Wait for the call to be connected (answered)."""
start = time.time()
while time.time() - start < timeout:
current = self.call_manager.get_call(call.id)
if not current:
raise RuntimeError(f"Call {call.id} disappeared")
if current.status in (CallStatus.CONNECTED, CallStatus.NAVIGATING_IVR):
return
if current.status in (CallStatus.FAILED, CallStatus.CANCELLED):
raise RuntimeError(f"Call {call.id} failed: {current.status}")
await asyncio.sleep(0.5)
raise TimeoutError(f"Call {call.id} not connected within {timeout}s")
# ================================================================
# Menu Navigation Logic
# ================================================================
def _decide_menu_option(
self,
transcript: str,
intent: str,
expected_options: Optional[str],
) -> Optional[str]:
"""
Decide which menu option to select based on transcript and intent.
Simple keyword-based matching. This is where an LLM integration
would massively improve navigation accuracy.
Returns:
DTMF digit(s) to press, or None if can't decide
"""
transcript_lower = transcript.lower()
intent_lower = intent.lower()
# Common IVR patterns: "press 1 for X, press 2 for Y"
# Extract options
options = re.findall(
r'(?:press|dial|say)\s+(\d+)\s+(?:for|to)\s+(.+?)(?:\.|,|press|dial|$)',
transcript_lower,
)
if not options:
# Try alternate patterns: "for X, press 1"
options = re.findall(
r'for\s+(.+?),?\s*(?:press|dial)\s+(\d+)',
transcript_lower,
)
# Swap order to be (digit, description)
options = [(digit, desc) for desc, digit in options]
if not options:
return None
# Score each option against the intent
best_match = None
best_score = 0
# Keywords that map intents to IVR options
intent_keywords = {
"cancel": ["cancel", "close", "end", "terminate"],
"dispute": ["dispute", "charge", "billing", "transaction", "statement"],
"balance": ["balance", "account", "summary"],
"agent": ["agent", "representative", "operator", "speak", "person", "human"],
"payment": ["payment", "pay", "bill"],
"card": ["card", "credit", "debit"],
"fraud": ["fraud", "unauthorized", "stolen", "lost"],
"transfer": ["transfer", "move", "send"],
}
for digit, description in options:
score = 0
# Direct keyword match in description
for keyword_group, keywords in intent_keywords.items():
if any(kw in intent_lower for kw in keywords):
if any(kw in description for kw in keywords):
score += 10
# Fuzzy: any word overlap between intent and description
intent_words = set(intent_lower.split())
desc_words = set(description.split())
overlap = intent_words & desc_words
score += len(overlap) * 3
# "Speak to agent" is usually what we want if nothing else matches
if any(w in description for w in ["agent", "representative", "operator", "person"]):
score += 5
if score > best_score:
best_score = score
best_match = digit
if best_match and best_score >= 3:
return best_match
# Default: look for "agent" or "representative" option
for digit, description in options:
if any(w in description for w in ["agent", "representative", "operator"]):
return digit
return None
    async def _load_call_flow(self, flow_id: str) -> Optional[CallFlow]:
        """Load a stored call flow from the database.

        Returns the hydrated CallFlow, or None when the flow doesn't exist
        or the read fails — errors are logged, never raised, so the caller
        can fall back to exploration mode.
        """
        # Deferred imports — resolved only when a flow is actually loaded.
        from db.database import get_session_factory, StoredCallFlow
        from sqlalchemy import select
        try:
            factory = get_session_factory()
            async with factory() as session:
                result = await session.execute(
                    select(StoredCallFlow).where(StoredCallFlow.id == flow_id)
                )
                row = result.scalar_one_or_none()
                if row:
                    # Redundant local import — CallFlowStep is already
                    # imported at module level; harmless.
                    from models.call_flow import CallFlowStep
                    # Rehydrate JSON step dicts into CallFlowStep models.
                    return CallFlow(
                        id=row.id,
                        name=row.name,
                        phone_number=row.phone_number,
                        description=row.description or "",
                        steps=[CallFlowStep(**s) for s in row.steps],
                        tags=row.tags or [],
                        notes=row.notes,
                        avg_hold_time=row.avg_hold_time,
                        success_rate=row.success_rate,
                        last_used=row.last_used,
                        times_used=row.times_used or 0,
                    )
        except Exception as e:
            logger.error(f"Failed to load call flow '{flow_id}': {e}")
        return None

391
services/llm_client.py Normal file
View File

@@ -0,0 +1,391 @@
"""
LLM Client — Unified interface for LLM-powered decision making.
Used by Hold Slayer (IVR navigation fallback), Call Flow Learner,
Receptionist, and Smart Routing services.
Supports OpenAI-compatible APIs (OpenAI, Ollama, LM Studio, etc.)
via httpx async client. No SDK dependency — just HTTP.
"""
import json
import logging
import time
from typing import Any, Optional
import httpx
from config import get_settings
logger = logging.getLogger(__name__)
class LLMClient:
    """
    Async LLM client for OpenAI-compatible chat completion APIs.
    Works with:
    - OpenAI API (api.openai.com)
    - Ollama (localhost:11434)
    - LM Studio (localhost:1234)
    - Any OpenAI-compatible endpoint
    Usage:
        client = LLMClient(base_url="http://localhost:11434/v1", model="llama3")
        response = await client.chat("What is 2+2?")
        # or structured:
        result = await client.chat_json(
            "Extract the menu options from this IVR transcript...",
            system="You are a phone menu parser.",
        )
    """
    def __init__(
        self,
        base_url: str = "http://localhost:11434/v1",
        model: str = "llama3",
        api_key: str = "not-needed",
        timeout: float = 30.0,
        max_tokens: int = 1024,
        temperature: float = 0.3,
    ):
        """Create a client bound to one endpoint/model.

        Args:
            base_url: OpenAI-compatible API root (trailing "/" stripped).
            model: Model name sent on every request.
            api_key: Bearer token; local servers typically ignore it.
            timeout: Per-request timeout in seconds.
            max_tokens: Default completion cap (overridable per call).
            temperature: Default sampling temperature (overridable per call).
        """
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.timeout = timeout
        self.max_tokens = max_tokens
        self.temperature = temperature
        # One shared async connection pool for the client's lifetime;
        # release it with close().
        self._client = httpx.AsyncClient(
            base_url=self.base_url,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            timeout=httpx.Timeout(timeout),
        )
        # Stats — maintained by _complete(), exposed via the `stats` property
        self._total_requests = 0
        self._total_tokens = 0
        self._total_errors = 0
        self._avg_latency_ms = 0.0
    async def close(self):
        """Close the HTTP client."""
        await self._client.aclose()
# ================================================================
# Core Chat Methods
# ================================================================
async def chat(
self,
user_message: str,
system: Optional[str] = None,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
) -> str:
"""
Send a chat completion request and return the text response.
Args:
user_message: The user's message/prompt.
system: Optional system prompt.
temperature: Override default temperature.
max_tokens: Override default max tokens.
Returns:
The assistant's response text.
"""
messages = []
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": user_message})
return await self._complete(
messages,
temperature=temperature or self.temperature,
max_tokens=max_tokens or self.max_tokens,
)
async def chat_json(
self,
user_message: str,
system: Optional[str] = None,
temperature: Optional[float] = None,
) -> dict[str, Any]:
"""
Chat completion that parses the response as JSON.
The system prompt is augmented to request JSON output.
Falls back to extracting JSON from markdown code blocks.
Returns:
Parsed JSON dict, or {"error": "..."} on parse failure.
"""
json_system = (system or "") + (
"\n\nIMPORTANT: Respond with valid JSON only. "
"No markdown, no explanation, just the JSON object."
)
response_text = await self.chat(
user_message,
system=json_system.strip(),
temperature=temperature or 0.1, # Lower temp for structured output
)
return self._parse_json_response(response_text)
async def chat_with_history(
self,
messages: list[dict[str, str]],
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
) -> str:
"""
Chat with full message history (multi-turn conversation).
Args:
messages: List of {"role": "system|user|assistant", "content": "..."}
Returns:
The assistant's response text.
"""
return await self._complete(
messages,
temperature=temperature or self.temperature,
max_tokens=max_tokens or self.max_tokens,
)
# ================================================================
# Hold Slayer Specific Methods
# ================================================================
async def analyze_ivr_menu(
self,
transcript: str,
intent: str,
previous_selections: Optional[list[str]] = None,
) -> dict[str, Any]:
"""
Analyze an IVR menu transcript and decide which option to press.
This is the LLM fallback when regex-based menu parsing fails.
Args:
transcript: The IVR audio transcript.
intent: What the user wants to accomplish.
previous_selections: DTMF digits already pressed in this call.
Returns:
{"digit": "3", "reason": "Option 3 is for card cancellation",
"confidence": 0.85}
"""
system = (
"You are an expert at navigating phone menus (IVR systems). "
"Given an IVR transcript and the caller's intent, determine "
"which menu option (DTMF digit) to press.\n\n"
"Rules:\n"
"- If there's a direct match for the intent, choose it.\n"
"- If no direct match, choose 'speak to representative' or 'agent' option.\n"
"- If menu says 'press 0 for operator', that's always a safe fallback.\n"
"- Return the single digit to press.\n"
"- If you truly can't determine the right option, return digit: null.\n"
)
context = f"IVR Transcript:\n{transcript}\n\n"
context += f"Caller's Intent: {intent}\n"
if previous_selections:
context += f"Already pressed: {', '.join(previous_selections)}\n"
context += "\nWhich digit should be pressed? Return JSON."
result = await self.chat_json(context, system=system)
# Normalize response
if "digit" not in result:
# Try to extract from various response formats
for key in ["option", "press", "choice", "dtmf"]:
if key in result:
result["digit"] = str(result[key])
break
return result
async def detect_human_speech(
self,
transcript: str,
context: str = "",
) -> dict[str, Any]:
"""
Analyze a transcript to determine if a human agent is speaking.
Used as a secondary check when audio classifier detects speech
but we need to distinguish between IVR prompts and a live human.
Returns:
{"is_human": true, "confidence": 0.9, "reason": "Agent greeting detected"}
"""
system = (
"You are analyzing a phone call transcript to determine if "
"a live human agent is speaking (vs an automated IVR system).\n\n"
"Human indicators:\n"
"- Personal greeting ('Hi, my name is...')\n"
"- Asking for account details\n"
"- Conversational tone, filler words\n"
"- Acknowledging hold time ('Thanks for waiting')\n"
"\nIVR indicators:\n"
"- 'Press N for...', 'Say...'\n"
"- Robotic phrasing\n"
"- Menu options\n"
"- 'Your call is important to us'\n"
)
prompt = f"Transcript:\n{transcript}\n"
if context:
prompt += f"\nContext: {context}\n"
prompt += "\nIs this a live human agent? Return JSON."
return await self.chat_json(prompt, system=system)
async def summarize_call(
self,
transcript_chunks: list[str],
intent: str,
duration_seconds: int,
) -> dict[str, Any]:
"""
Generate a call summary from transcript chunks.
Used for call history and analytics.
Returns:
{"summary": "...", "outcome": "resolved|unresolved|transferred",
"key_info": [...], "sentiment": "positive|neutral|negative"}
"""
system = (
"Summarize this phone call concisely. Include:\n"
"- What the caller wanted\n"
"- What happened (IVR navigation, hold time, agent interaction)\n"
"- The outcome\n"
"Return as JSON with: summary, outcome, key_info (list), sentiment."
)
full_transcript = "\n".join(transcript_chunks)
prompt = (
f"Caller's intent: {intent}\n"
f"Call duration: {duration_seconds} seconds\n\n"
f"Full transcript:\n{full_transcript}\n\n"
"Summarize this call."
)
return await self.chat_json(prompt, system=system)
# ================================================================
# Internal
# ================================================================
    async def _complete(
        self,
        messages: list[dict[str, str]],
        temperature: float = 0.3,
        max_tokens: int = 1024,
    ) -> str:
        """Execute a chat completion request.

        Returns the assistant's text, or "" on any transport/API error —
        errors are logged and counted in the stats, never raised.
        """
        self._total_requests += 1
        start = time.monotonic()
        try:
            payload = {
                "model": self.model,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
            }
            response = await self._client.post("/chat/completions", json=payload)
            response.raise_for_status()
            data = response.json()
            # Track token usage
            if "usage" in data:
                self._total_tokens += data["usage"].get("total_tokens", 0)
            # Track latency as an exponential moving average (alpha = 0.1).
            # NOTE(review): the EMA starts at 0.0, so readings are biased
            # low until several requests have been made.
            elapsed_ms = (time.monotonic() - start) * 1000
            self._avg_latency_ms = (
                self._avg_latency_ms * 0.9 + elapsed_ms * 0.1
            )
            # Extract response text
            choices = data.get("choices", [])
            if choices:
                return choices[0].get("message", {}).get("content", "")
            return ""
        except httpx.HTTPStatusError as e:
            self._total_errors += 1
            logger.error(f"LLM API error: {e.response.status_code} {e.response.text[:200]}")
            return ""
        except httpx.TimeoutException:
            self._total_errors += 1
            logger.error(f"LLM API timeout after {self.timeout}s")
            return ""
        except Exception as e:
            # Catch-all so LLM hiccups never crash call handling.
            self._total_errors += 1
            logger.error(f"LLM client error: {e}")
            return ""
@staticmethod
def _parse_json_response(text: str) -> dict[str, Any]:
"""Parse JSON from LLM response, handling common formatting issues."""
text = text.strip()
# Try direct parse
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Try extracting from markdown code block
if "```" in text:
# Find content between ```json and ``` or ``` and ```
parts = text.split("```")
for i, part in enumerate(parts):
if i % 2 == 1: # Odd indices are inside code blocks
# Remove optional language tag
content = part.strip()
if content.startswith("json"):
content = content[4:].strip()
try:
return json.loads(content)
except json.JSONDecodeError:
continue
# Try finding JSON object in the text
brace_start = text.find("{")
brace_end = text.rfind("}")
if brace_start != -1 and brace_end != -1:
try:
return json.loads(text[brace_start : brace_end + 1])
except json.JSONDecodeError:
pass
logger.warning(f"Failed to parse JSON from LLM response: {text[:200]}")
return {"error": "Failed to parse JSON response", "raw": text[:500]}
# ================================================================
# Stats
# ================================================================
@property
def stats(self) -> dict:
return {
"total_requests": self._total_requests,
"total_tokens": self._total_tokens,
"total_errors": self._total_errors,
"avg_latency_ms": round(self._avg_latency_ms, 1),
"model": self.model,
"base_url": self.base_url,
}

256
services/notification.py Normal file
View File

@@ -0,0 +1,256 @@
"""
Notification Service — Tell the user what's happening.
Sends notifications when:
- A human picks up (TRANSFER NOW!)
- Hold time estimates change
- Call fails or times out
- IVR navigation milestones
Supports multiple channels: WebSocket (always), SMS (optional),
push notifications (future).
"""
import asyncio
import logging
from datetime import datetime
from enum import Enum
from typing import Any, Optional

from pydantic import BaseModel, Field

from config import Settings
from core.event_bus import EventBus
from models.events import EventType, GatewayEvent
logger = logging.getLogger(__name__)
class NotificationChannel(str, Enum):
    """Where to send notifications."""
    WEBSOCKET = "websocket"  # Always available — live dashboard clients
    SMS = "sms"  # Optional channel
    PUSH = "push"  # Future — not implemented yet
class NotificationPriority(str, Enum):
    """How urgently to deliver, ordered from least to most urgent."""
    LOW = "low"  # Status updates, hold time estimates
    NORMAL = "normal"  # IVR navigation milestones
    HIGH = "high"  # Human detected, call failed
    CRITICAL = "critical"  # Transfer happening NOW
class Notification(BaseModel):
    """A notification to send to the user."""
    channel: NotificationChannel
    priority: NotificationPriority
    title: str
    message: str
    call_id: Optional[str] = None
    data: dict[str, Any] = Field(default_factory=dict)
    # default_factory so each notification gets its own creation time.
    # The previous `= datetime.now()` default was evaluated once at import
    # time, so every notification shared that same stale timestamp.
    timestamp: datetime = Field(default_factory=datetime.now)
class NotificationService:
    """
    Sends notifications to users about call events.

    Listens to the EventBus and routes events to the
    appropriate notification channels.

    Each (event type, IVR step) pair is notified at most once per call
    so repeated events don't spam the user; per-call dedup state is
    dropped when the call ends.
    """

    def __init__(self, event_bus: EventBus, settings: Settings):
        self._event_bus = event_bus
        self._settings = settings
        self._task: Optional[asyncio.Task] = None
        self._sms_sender: Optional[Any] = None
        # Track what we've already notified (avoid spam).
        # call_id -> set of "event_type:step_id" dedup keys.
        self._notified: dict[str, set[str]] = {}

    async def start(self) -> None:
        """Start listening for events to notify on."""
        self._task = asyncio.create_task(self._listen_loop())
        logger.info("📢 Notification service started")

    async def stop(self) -> None:
        """Stop the notification listener."""
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
        logger.info("📢 Notification service stopped")

    async def _listen_loop(self) -> None:
        """Main event listener loop: forward every event to the handler,
        logging (not propagating) per-event failures."""
        subscription = self._event_bus.subscribe()
        try:
            async for event in subscription:
                try:
                    await self._handle_event(event)
                except Exception as e:
                    logger.error(f"Notification handler error: {e}", exc_info=True)
        except asyncio.CancelledError:
            pass
        finally:
            subscription.close()

    async def _handle_event(self, event: GatewayEvent) -> None:
        """Route an event to the appropriate notification(s).

        Deduplicates per call, sends the notification, and drops the
        call's dedup state once the call has ended.
        """
        call_id = event.call_id or ""

        # Skip duplicate notifications for this call.
        dedup_key = f"{event.type.value}:{event.data.get('step_id', '')}"
        if call_id and dedup_key in self._notified.get(call_id, set()):
            return

        notification = self._event_to_notification(event)
        if not notification:
            return

        # Mark as notified. setdefault creates the tracking set lazily,
        # so there's no KeyError if no entry exists for this call yet.
        if call_id:
            self._notified.setdefault(call_id, set()).add(dedup_key)

        # Send via all appropriate channels
        await self._send(notification)

        # Clean up dedup state AFTER sending. (BUG FIX: this cleanup
        # previously lived inside _event_to_notification, which deleted
        # self._notified[call_id] before the dedup key was added above —
        # the resulting KeyError meant CALL_ENDED notifications were
        # never sent.)
        if call_id and event.type == EventType.CALL_ENDED:
            self._notified.pop(call_id, None)

    def _event_to_notification(self, event: GatewayEvent) -> Optional[Notification]:
        """Convert a gateway event to a notification (or None to skip).

        Pure conversion — no side effects on service state.
        """
        if event.type == EventType.HUMAN_DETECTED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.CRITICAL,
                title="🚨 Human Detected!",
                message="A live person picked up — transferring you now!",
                call_id=event.call_id,
                data=event.data,
            )
        elif event.type == EventType.TRANSFER_STARTED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.CRITICAL,
                title="📞 Call Transferred",
                message="Your call has been connected to the agent. Pick up your phone!",
                call_id=event.call_id,
                data=event.data,
            )
        elif event.type == EventType.CALL_FAILED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.HIGH,
                title="❌ Call Failed",
                message=event.message or "The call couldn't be completed.",
                call_id=event.call_id,
                data=event.data,
            )
        elif event.type == EventType.HOLD_DETECTED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.NORMAL,
                title="⏳ On Hold",
                message="You're on hold. We'll notify you when someone picks up.",
                call_id=event.call_id,
                data=event.data,
            )
        elif event.type == EventType.IVR_STEP:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.LOW,
                title="📍 IVR Navigation",
                message=event.message or "Navigating phone menu...",
                call_id=event.call_id,
                data=event.data,
            )
        elif event.type == EventType.IVR_DTMF_SENT:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.LOW,
                title="📱 Button Pressed",
                message=event.message or f"Pressed {event.data.get('digits', '?')}",
                call_id=event.call_id,
                data=event.data,
            )
        elif event.type == EventType.CALL_ENDED:
            return Notification(
                channel=NotificationChannel.WEBSOCKET,
                priority=NotificationPriority.NORMAL,
                title="📴 Call Ended",
                message=event.message or "The call has ended.",
                call_id=event.call_id,
                data=event.data,
            )
        # Skip other event types (transcription, classification, etc.)
        return None

    async def _send(self, notification: Notification) -> None:
        """Send a notification via the appropriate channel(s)."""
        logger.info(
            f"📢 [{notification.priority.value}] {notification.title}: "
            f"{notification.message}"
        )
        # WebSocket notifications go through the event bus
        # (the WebSocket handler in the API reads from EventBus directly)
        # SMS for critical notifications
        if (
            notification.priority == NotificationPriority.CRITICAL
            and self._settings.notify_sms_number
        ):
            await self._send_sms(notification)

    async def _send_sms(self, notification: Notification) -> None:
        """
        Send an SMS notification.

        Uses a simple HTTP-based SMS gateway. In production,
        this would use Twilio, AWS SNS, or similar.
        """
        phone = self._settings.notify_sms_number
        if not phone:
            return
        try:
            import httpx  # noqa: F401 — used by the provider example below

            # Generic webhook-based SMS (configure your provider)
            # This is a placeholder — wire up your preferred SMS provider
            logger.info(f"📱 SMS → {phone}: {notification.title}")
            # Example: Twilio-style API
            # async with httpx.AsyncClient() as client:
            #     await client.post(
            #         "https://api.twilio.com/2010-04-01/Accounts/.../Messages.json",
            #         data={
            #             "To": phone,
            #             "From": self._settings.sip_trunk.did,
            #             "Body": f"{notification.title}\n{notification.message}",
            #         },
            #         auth=(account_sid, auth_token),
            #     )
        except Exception as e:
            logger.error(f"SMS send failed: {e}")

230
services/recording.py Normal file
View File

@@ -0,0 +1,230 @@
"""
Recording Service — Call recording management.
Records calls to WAV files via the PJSUA2 media pipeline,
manages storage, and provides playback/download access.
"""
import asyncio
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Optional
from config import get_settings
logger = logging.getLogger(__name__)
class RecordingService:
    """
    Manages call recordings.

    Features:
    - Start/stop recording for any active call leg
    - Dual-channel recording (separate caller/agent streams)
    - Mixed recording (both parties in one file)
    - WAV storage with organized directory structure
    - Recording metadata tracking
    """

    def __init__(
        self,
        storage_dir: str = "recordings",
        max_recording_seconds: int = 7200,  # 2 hours
        sample_rate: int = 16000,
    ):
        self._storage_dir = Path(storage_dir)
        self._max_recording_seconds = max_recording_seconds
        self._sample_rate = sample_rate
        # Quoted annotation: RecordingSession is defined later in this module.
        self._active_recordings: dict[str, "RecordingSession"] = {}
        # Finalized session metadata dicts, appended in stop order.
        self._metadata: list[dict] = []
        # call_id -> safety-timeout task. Kept so the tasks aren't garbage
        # collected mid-flight and can be cancelled on a normal stop.
        self._timeout_tasks: dict[str, asyncio.Task] = {}

    async def start(self) -> None:
        """Initialize the recording service (create the storage directory)."""
        self._storage_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"🎙️ Recording service ready (storage: {self._storage_dir})")

    # ================================================================
    # Recording Lifecycle
    # ================================================================

    async def start_recording(
        self,
        call_id: str,
        media_pipeline=None,
        leg_ids: Optional[list[str]] = None,
        dual_channel: bool = False,
    ) -> "RecordingSession":
        """
        Start recording a call.

        Args:
            call_id: The call to record.
            media_pipeline: MediaPipeline instance for PJSUA2 recording.
            leg_ids: Specific SIP leg IDs to record. If None, records all legs.
            dual_channel: If True, record each party to a separate channel.

        Returns:
            RecordingSession with file paths and metadata.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        date_dir = datetime.now().strftime("%Y-%m-%d")
        recording_dir = self._storage_dir / date_dir
        recording_dir.mkdir(parents=True, exist_ok=True)

        if dual_channel:
            filepath_caller = str(recording_dir / f"{call_id}_{timestamp}_caller.wav")
            filepath_agent = str(recording_dir / f"{call_id}_{timestamp}_agent.wav")
            filepath_mixed = str(recording_dir / f"{call_id}_{timestamp}_mixed.wav")
        else:
            filepath_caller = None
            filepath_agent = None
            filepath_mixed = str(recording_dir / f"{call_id}_{timestamp}.wav")

        session = RecordingSession(
            call_id=call_id,
            filepath_mixed=filepath_mixed,
            filepath_caller=filepath_caller,
            filepath_agent=filepath_agent,
            started_at=datetime.now(),
            sample_rate=self._sample_rate,
        )

        # Start PJSUA2 recording if media pipeline is available.
        # NOTE(review): dual_channel currently still records only the mixed
        # file; per-channel capture is not wired up yet.
        if media_pipeline and leg_ids:
            # BUG FIX: remember the legs on the session so stop_recording()
            # can stop them. Previously _leg_ids was never populated, so
            # pipeline recordings were never stopped.
            session._leg_ids = list(leg_ids)
            for leg_id in leg_ids:
                if filepath_mixed:
                    media_pipeline.start_recording(leg_id, filepath_mixed)

        self._active_recordings[call_id] = session
        logger.info(f"🔴 Recording started: {call_id}{filepath_mixed}")

        # Safety timeout — keep a reference so the task isn't garbage
        # collected and can be cancelled when the recording stops normally.
        self._timeout_tasks[call_id] = asyncio.create_task(
            self._recording_timeout(call_id),
            name=f"rec_timeout_{call_id}",
        )
        return session

    async def stop_recording(
        self,
        call_id: str,
        media_pipeline=None,
    ) -> Optional["RecordingSession"]:
        """Stop recording a call and finalize the WAV file.

        Returns the finalized session, or None if no recording was active.
        """
        session = self._active_recordings.pop(call_id, None)
        if not session:
            logger.warning(f" No active recording for {call_id}")
            return None

        # Cancel the safety-timeout task (unless we ARE that task,
        # i.e. this stop was triggered by the timeout itself).
        timeout_task = self._timeout_tasks.pop(call_id, None)
        if timeout_task and timeout_task is not asyncio.current_task():
            timeout_task.cancel()

        session.stopped_at = datetime.now()
        session.duration_seconds = int(
            (session.stopped_at - session.started_at).total_seconds()
        )

        # Stop PJSUA2 recording — the pipeline handles flushing and
        # closing the WAV file.
        if media_pipeline:
            for leg_id in (session._leg_ids or []):
                media_pipeline.stop_recording(leg_id)

        # Calculate file size (the file may not exist if the pipeline
        # never wrote anything).
        if session.filepath_mixed and os.path.exists(session.filepath_mixed):
            session.file_size_bytes = os.path.getsize(session.filepath_mixed)

        # Store metadata
        self._metadata.append(session.to_dict())
        logger.info(
            f"⏹ Recording stopped: {call_id} "
            f"({session.duration_seconds}s, "
            f"{session.file_size_bytes or 0} bytes)"
        )
        return session

    async def _recording_timeout(self, call_id: str) -> None:
        """Auto-stop recording after max duration (cancelled on normal stop)."""
        try:
            await asyncio.sleep(self._max_recording_seconds)
        except asyncio.CancelledError:
            return  # recording was stopped normally
        if call_id in self._active_recordings:
            logger.warning(f" Recording timeout for {call_id}, auto-stopping")
            await self.stop_recording(call_id)

    # ================================================================
    # Queries
    # ================================================================

    def get_recording(self, call_id: str) -> Optional[dict]:
        """Get the most recent recording metadata for a call, if any."""
        for meta in reversed(self._metadata):
            if meta["call_id"] == call_id:
                return meta
        return None

    def list_recordings(
        self,
        limit: int = 50,
        offset: int = 0,
    ) -> list[dict]:
        """List recording metadata, newest first (by started_at)."""
        sorted_meta = sorted(
            self._metadata,
            key=lambda m: m.get("started_at", ""),
            reverse=True,
        )
        return sorted_meta[offset : offset + limit]

    @property
    def active_recording_count(self) -> int:
        # Number of recordings currently in progress.
        return len(self._active_recordings)

    @property
    def total_recordings(self) -> int:
        # Number of finalized recordings tracked this process lifetime.
        return len(self._metadata)

    def storage_usage_bytes(self) -> int:
        """Calculate total storage used by recordings (walks the whole dir)."""
        total = 0
        for root, _dirs, files in os.walk(self._storage_dir):
            for f in files:
                total += os.path.getsize(os.path.join(root, f))
        return total
class RecordingSession:
    """Mutable record of one in-progress (or finished) call recording."""

    def __init__(
        self,
        call_id: str,
        filepath_mixed: Optional[str] = None,
        filepath_caller: Optional[str] = None,
        filepath_agent: Optional[str] = None,
        started_at: Optional[datetime] = None,
        sample_rate: int = 16000,
    ):
        self.call_id = call_id
        # Output file paths — None for variants not being written.
        self.filepath_mixed = filepath_mixed
        self.filepath_caller = filepath_caller
        self.filepath_agent = filepath_agent
        # Timing and size metadata, filled in as the recording progresses.
        self.started_at = started_at if started_at is not None else datetime.now()
        self.stopped_at: Optional[datetime] = None
        self.duration_seconds: Optional[int] = None
        self.file_size_bytes: Optional[int] = None
        self.sample_rate = sample_rate
        # SIP leg IDs being captured (populated by the recording service).
        self._leg_ids: list[str] = []

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (datetimes as ISO-8601 strings)."""

        def _iso(moment: Optional[datetime]) -> Optional[str]:
            return moment.isoformat() if moment else None

        return {
            "call_id": self.call_id,
            "filepath_mixed": self.filepath_mixed,
            "filepath_caller": self.filepath_caller,
            "filepath_agent": self.filepath_agent,
            "started_at": _iso(self.started_at),
            "stopped_at": _iso(self.stopped_at),
            "duration_seconds": self.duration_seconds,
            "file_size_bytes": self.file_size_bytes,
            "sample_rate": self.sample_rate,
        }

161
services/transcription.py Normal file
View File

@@ -0,0 +1,161 @@
"""
Transcription Service — Speaches STT integration.
Sends audio to your Speaches instances for real-time speech-to-text.
Used by the Hold Slayer to understand IVR prompts and detect menu options.
"""
import io
import logging
from typing import Optional
import httpx
from config import SpeachesSettings
logger = logging.getLogger(__name__)
class TranscriptionService:
    """
    Client for the Speaches STT service.

    Speaches exposes an OpenAI-compatible endpoint:
        POST /v1/audio/transcriptions
    """

    def __init__(self, settings: SpeachesSettings):
        self.settings = settings
        self._client: Optional[httpx.AsyncClient] = None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, (re)creating it if missing or closed."""
        needs_new = self._client is None or self._client.is_closed
        if needs_new:
            self._client = httpx.AsyncClient(
                base_url=self.settings.url,
                timeout=httpx.Timeout(30.0, connect=5.0),
            )
        return self._client

    async def transcribe(
        self,
        audio_data: bytes,
        language: str = "en",
        prompt: Optional[str] = None,
    ) -> str:
        """
        Transcribe audio data to text.

        Args:
            audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)
            language: Language code (default: "en")
            prompt: Optional context hint for better accuracy
                (e.g., "IVR menu options, phone banking")

        Returns:
            Transcribed text, or "" if the request failed.
        """
        http = await self._get_client()
        # The API wants a WAV container, not raw PCM.
        wav_payload = self._pcm_to_wav(audio_data)
        form: dict = {
            "model": self.settings.model,
            "language": language,
            "response_format": "text",
        }
        if prompt:
            form["prompt"] = prompt
        try:
            response = await http.post(
                "/v1/audio/transcriptions",
                files={"file": ("audio.wav", wav_payload, "audio/wav")},
                data=form,
            )
            response.raise_for_status()
            transcript = response.text.strip()
            logger.debug(f"Transcription: '{transcript}'")
            return transcript
        except httpx.HTTPStatusError as e:
            logger.error(f"Speaches API error: {e.response.status_code} {e.response.text}")
            return ""
        except httpx.ConnectError:
            logger.error(f"Cannot connect to Speaches at {self.settings.url}")
            return ""
        except Exception as e:
            logger.error(f"Transcription failed: {e}")
            return ""

    async def transcribe_stream(
        self,
        audio_data: bytes,
        language: str = "en",
    ):
        """
        Stream transcription — for near-real-time results.

        Currently splits the audio into fixed windows and transcribes each
        one; yields only non-empty results.
        # TODO: Implement WebSocket streaming when Speaches supports it

        Yields:
            str: Partial transcription chunks
        """
        chunk_size = 16000 * 2 * 3  # 3 seconds of 16kHz 16-bit mono
        for start in range(0, len(audio_data), chunk_size):
            segment = audio_data[start:start + chunk_size]
            if not segment:
                continue
            partial = await self.transcribe(segment, language)
            if partial:
                yield partial

    async def close(self) -> None:
        """Dispose of the HTTP client."""
        client, self._client = self._client, None
        if client and not client.is_closed:
            await client.aclose()

    @staticmethod
    def _pcm_to_wav(pcm_data: bytes, sample_rate: int = 16000, channels: int = 1, sample_width: int = 2) -> bytes:
        """
        Wrap raw PCM bytes in a minimal 44-byte WAV (RIFF) header.

        Args:
            pcm_data: Raw PCM audio bytes
            sample_rate: Sample rate in Hz (default: 16000)
            channels: Number of channels (default: 1 = mono)
            sample_width: Bytes per sample (default: 2 = 16-bit)

        Returns:
            WAV file as bytes
        """
        import struct

        data_size = len(pcm_data)
        byte_rate = sample_rate * channels * sample_width
        block_align = channels * sample_width
        header = b"".join((
            b"RIFF",
            struct.pack("<I", 36 + data_size),  # total file size minus 8
            b"WAVE",
            b"fmt ",
            # fmt chunk: size=16, format=1 (PCM), channels, rate,
            # byte rate, block align, bits per sample
            struct.pack(
                "<IHHIIHH",
                16, 1, channels, sample_rate, byte_rate, block_align, sample_width * 8,
            ),
            b"data",
            struct.pack("<I", data_size),
        ))
        return header + pcm_data