feat: add initial Hold Slayer AI telephony gateway implementation
Complete project scaffolding and core implementation of an AI-powered telephony system that calls companies, navigates IVR menus, waits on hold, and transfers to the user when a human answers. Key components: - FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces - SIP/VoIP call management via PJSUA2 with RTP audio streaming - LLM-powered IVR navigation using OpenAI/Anthropic with tool calling - Hold detection service combining audio analysis and silence detection - Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines - Call recording with per-channel and mixed audio capture - Event bus (asyncio pub/sub) for real-time client updates - Web dashboard with live call monitoring - SQLite persistence via SQLAlchemy with call history and analytics - Notification support (email, SMS, webhook, desktop) - Docker Compose deployment with Opal VoIP and Opal Media containers - Comprehensive test suite with unit, integration, and E2E tests - Simplified .gitignore and full project documentation in README
This commit is contained in:
444
services/audio_classifier.py
Normal file
444
services/audio_classifier.py
Normal file
@@ -0,0 +1,444 @@
|
||||
"""
|
||||
Audio Classifier — Spectral analysis for hold music, speech, and silence detection.
|
||||
|
||||
This is the brain of the Hold Slayer. It analyzes audio in real-time to determine:
|
||||
- Is this hold music?
|
||||
- Is this an IVR prompt (automated voice)?
|
||||
- Is this a live human?
|
||||
- Is this silence?
|
||||
- Is this a ring-back tone?
|
||||
|
||||
Uses spectral analysis (librosa/numpy) to classify audio without needing
|
||||
a trained ML model — just signal processing and heuristics.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from config import ClassifierSettings
|
||||
from models.call import AudioClassification, ClassificationResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Audio constants — all analysis below assumes this fixed input format.
SAMPLE_RATE = 16000  # expected input: 16 kHz, mono, 16-bit signed PCM
FRAME_SIZE = SAMPLE_RATE * 2  # bytes in one second of audio (2 bytes per 16-bit sample); NOTE(review): unused in this module — presumably consumed by callers, confirm
|
||||
|
||||
|
||||
class AudioClassifier:
    """
    Real-time audio classifier using spectral analysis (no trained model).

    Classification strategy:
    - Silence: Low RMS energy
    - Music: High spectral flatness + sustained tonal content + rhythm
    - IVR prompt: Speech-like spectral envelope but repetitive/synthetic
    - Live human: Speech-like spectral envelope + natural variation
    - Ringing: Very tonal, specific frequencies (~440Hz, ~480Hz for NA ring)
    - DTMF: Dual-tone detection at known DTMF frequencies

    NOTE(review): classify_chunk() does not record its own results; callers
    are expected to feed decisions back through update_history() so the
    history-based heuristics (human-vs-IVR, hold-to-human transition) have
    data to work with — confirm against the call sites.
    """
|
||||
|
||||
def __init__(self, settings: ClassifierSettings):
|
||||
self.settings = settings
|
||||
self._window_buffer: list[bytes] = []
|
||||
self._window_samples = int(settings.window_seconds * SAMPLE_RATE)
|
||||
self._classification_history: list[AudioClassification] = []
|
||||
|
||||
    def classify_chunk(self, audio_data: bytes) -> ClassificationResult:
        """
        Classify a chunk of audio data.

        Runs a fixed-priority cascade of detectors — silence, DTMF,
        ring-back, then a music-vs-speech score comparison — and returns
        the first one that fires, falling back to UNKNOWN.

        Args:
            audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)

        Returns:
            ClassificationResult with type and confidence
        """
        # Convert bytes to numpy array
        samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)

        # Empty chunk: nothing to analyze, call it silence with full confidence
        # (note: this early return carries no `details` dict, unlike the others)
        if len(samples) == 0:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.SILENCE,
                confidence=1.0,
            )

        # Normalize 16-bit integers to [-1.0, 1.0]
        samples = samples / 32768.0

        # Run all detectors up front; the cascade below consumes the features
        rms = self._compute_rms(samples)
        spectral_flatness = self._compute_spectral_flatness(samples)
        zcr = self._compute_zero_crossing_rate(samples)
        dominant_freq = self._compute_dominant_frequency(samples)
        spectral_centroid = self._compute_spectral_centroid(samples)
        is_tonal = self._detect_tonality(samples)

        # Build feature dict for debugging; attached to every result below
        features = {
            "rms": float(rms),
            "spectral_flatness": float(spectral_flatness),
            "zcr": float(zcr),
            "dominant_freq": float(dominant_freq),
            "spectral_centroid": float(spectral_centroid),
            "is_tonal": is_tonal,
        }

        # === Classification Logic ===

        # 1. Silence detection — RMS below 1% of full scale.
        #    Confidence rises the further the level sits below the threshold.
        if rms < 0.01:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.SILENCE,
                confidence=min(1.0, (0.01 - rms) / 0.01 + 0.5),
                details=features,
            )

        # 2. DTMF detection (very specific dual-tone pattern)
        dtmf_result = self._detect_dtmf(samples)
        if dtmf_result:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.DTMF,
                confidence=0.95,
                details={**features, "dtmf_digit": dtmf_result},
            )

        # 3. Ring-back tone detection (440+480Hz in NA, periodic on/off)
        if is_tonal and 400 < dominant_freq < 520 and rms > 0.02:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.RINGING,
                confidence=0.8,
                details=features,
            )

        # 4. Music vs Speech discrimination
        # Music: higher spectral flatness, more tonal, wider spectral spread
        # Speech: lower spectral flatness, concentrated energy, variable ZCR
        music_score = self._compute_music_score(
            spectral_flatness, is_tonal, spectral_centroid, zcr, rms
        )
        speech_score = self._compute_speech_score(
            spectral_flatness, zcr, spectral_centroid, rms
        )

        # 5. If it's speech-like, is it live or automated?
        #    NOTE(review): speech wins on *relative* score alone — even a weak
        #    absolute speech_score is classified as LIVE_HUMAN/IVR_PROMPT here,
        #    never UNKNOWN. Confirm this is intended.
        if speech_score > music_score:
            # Use history to distinguish live human from IVR
            # IVR: repetitive patterns, synthetic prosody
            # Human: natural variation, conversational rhythm
            if self._looks_like_live_human(speech_score, zcr, rms):
                return ClassificationResult(
                    timestamp=time.time(),
                    audio_type=AudioClassification.LIVE_HUMAN,
                    confidence=speech_score,
                    details=features,
                )
            else:
                return ClassificationResult(
                    timestamp=time.time(),
                    audio_type=AudioClassification.IVR_PROMPT,
                    # IVR confidence is discounted relative to the raw score
                    confidence=speech_score * 0.8,
                    details=features,
                )

        # 6. Music (hold music) — requires the configured absolute threshold
        if music_score >= self.settings.music_threshold:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.MUSIC,
                confidence=music_score,
                details=features,
            )

        # 7. Unknown / low confidence
        return ClassificationResult(
            timestamp=time.time(),
            audio_type=AudioClassification.UNKNOWN,
            confidence=max(music_score, speech_score),
            details=features,
        )
|
||||
|
||||
# ================================================================
|
||||
# Feature Extraction
|
||||
# ================================================================
|
||||
|
||||
@staticmethod
|
||||
def _compute_rms(samples: np.ndarray) -> float:
|
||||
"""Root Mean Square — overall energy level."""
|
||||
return float(np.sqrt(np.mean(samples ** 2)))
|
||||
|
||||
@staticmethod
|
||||
def _compute_spectral_flatness(samples: np.ndarray) -> float:
|
||||
"""
|
||||
Spectral flatness (Wiener entropy).
|
||||
|
||||
Close to 1.0 = noise-like (white noise)
|
||||
Close to 0.0 = tonal (pure tone, music)
|
||||
Speech is typically 0.1-0.4, music 0.05-0.3
|
||||
"""
|
||||
fft = np.abs(np.fft.rfft(samples))
|
||||
fft = fft[fft > 0] # Avoid log(0)
|
||||
|
||||
if len(fft) == 0:
|
||||
return 0.0
|
||||
|
||||
geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
|
||||
arithmetic_mean = np.mean(fft)
|
||||
|
||||
if arithmetic_mean == 0:
|
||||
return 0.0
|
||||
|
||||
return float(geometric_mean / arithmetic_mean)
|
||||
|
||||
@staticmethod
|
||||
def _compute_zero_crossing_rate(samples: np.ndarray) -> float:
|
||||
"""
|
||||
Zero-crossing rate — how often the signal crosses zero.
|
||||
|
||||
Higher for unvoiced speech and noise.
|
||||
Lower for voiced speech and tonal music.
|
||||
"""
|
||||
crossings = np.sum(np.abs(np.diff(np.sign(samples)))) / 2
|
||||
return float(crossings / len(samples))
|
||||
|
||||
@staticmethod
|
||||
def _compute_dominant_frequency(samples: np.ndarray) -> float:
|
||||
"""Find the dominant frequency in the signal."""
|
||||
fft = np.abs(np.fft.rfft(samples))
|
||||
freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)
|
||||
|
||||
# Ignore DC and very low frequencies
|
||||
mask = freqs > 50
|
||||
if not np.any(mask):
|
||||
return 0.0
|
||||
|
||||
fft_masked = fft[mask]
|
||||
freqs_masked = freqs[mask]
|
||||
|
||||
return float(freqs_masked[np.argmax(fft_masked)])
|
||||
|
||||
@staticmethod
|
||||
def _compute_spectral_centroid(samples: np.ndarray) -> float:
|
||||
"""
|
||||
Spectral centroid — "center of mass" of the spectrum.
|
||||
|
||||
Higher for bright/treble sounds, lower for bass-heavy sounds.
|
||||
Speech typically 500-4000Hz, music varies widely.
|
||||
"""
|
||||
fft = np.abs(np.fft.rfft(samples))
|
||||
freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)
|
||||
|
||||
total_energy = np.sum(fft)
|
||||
if total_energy == 0:
|
||||
return 0.0
|
||||
|
||||
return float(np.sum(freqs * fft) / total_energy)
|
||||
|
||||
@staticmethod
|
||||
def _detect_tonality(samples: np.ndarray) -> bool:
|
||||
"""
|
||||
Check if the signal is strongly tonal (has clear pitch).
|
||||
Uses autocorrelation.
|
||||
"""
|
||||
# Autocorrelation
|
||||
correlation = np.correlate(samples, samples, mode="full")
|
||||
correlation = correlation[len(correlation) // 2:]
|
||||
|
||||
# Normalize
|
||||
if correlation[0] == 0:
|
||||
return False
|
||||
correlation = correlation / correlation[0]
|
||||
|
||||
# Look for a strong peak (indicating periodicity)
|
||||
# Skip the first ~50 samples (very high frequencies)
|
||||
min_lag = int(SAMPLE_RATE / 1000) # ~16 samples (1000Hz max)
|
||||
max_lag = int(SAMPLE_RATE / 50) # ~320 samples (50Hz min)
|
||||
|
||||
search_region = correlation[min_lag:max_lag]
|
||||
if len(search_region) == 0:
|
||||
return False
|
||||
|
||||
peak_value = np.max(search_region)
|
||||
return bool(peak_value > 0.5)
|
||||
|
||||
def _detect_dtmf(self, samples: np.ndarray) -> Optional[str]:
|
||||
"""
|
||||
Detect DTMF tones using Goertzel algorithm (simplified).
|
||||
|
||||
DTMF frequencies:
|
||||
697, 770, 852, 941 Hz (row)
|
||||
1209, 1336, 1477, 1633 Hz (column)
|
||||
"""
|
||||
dtmf_freqs_low = [697, 770, 852, 941]
|
||||
dtmf_freqs_high = [1209, 1336, 1477, 1633]
|
||||
dtmf_map = {
|
||||
(697, 1209): "1", (697, 1336): "2", (697, 1477): "3", (697, 1633): "A",
|
||||
(770, 1209): "4", (770, 1336): "5", (770, 1477): "6", (770, 1633): "B",
|
||||
(852, 1209): "7", (852, 1336): "8", (852, 1477): "9", (852, 1633): "C",
|
||||
(941, 1209): "*", (941, 1336): "0", (941, 1477): "#", (941, 1633): "D",
|
||||
}
|
||||
|
||||
# Compute power at each DTMF frequency
|
||||
def goertzel_power(freq: int) -> float:
|
||||
k = int(0.5 + len(samples) * freq / SAMPLE_RATE)
|
||||
w = 2 * np.pi * k / len(samples)
|
||||
coeff = 2 * np.cos(w)
|
||||
s0, s1, s2 = 0.0, 0.0, 0.0
|
||||
for sample in samples:
|
||||
s0 = sample + coeff * s1 - s2
|
||||
s2 = s1
|
||||
s1 = s0
|
||||
return float(s1 * s1 + s2 * s2 - coeff * s1 * s2)
|
||||
|
||||
# Find strongest low and high frequencies
|
||||
low_powers = [(f, goertzel_power(f)) for f in dtmf_freqs_low]
|
||||
high_powers = [(f, goertzel_power(f)) for f in dtmf_freqs_high]
|
||||
|
||||
best_low = max(low_powers, key=lambda x: x[1])
|
||||
best_high = max(high_powers, key=lambda x: x[1])
|
||||
|
||||
# Threshold: both frequencies must be significantly present
|
||||
total_power = np.sum(samples ** 2)
|
||||
if total_power == 0:
|
||||
return None
|
||||
|
||||
threshold = total_power * 0.1
|
||||
if best_low[1] > threshold and best_high[1] > threshold:
|
||||
key = (best_low[0], best_high[0])
|
||||
return dtmf_map.get(key)
|
||||
|
||||
return None
|
||||
|
||||
# ================================================================
|
||||
# Higher-Level Classification
|
||||
# ================================================================
|
||||
|
||||
def _compute_music_score(
|
||||
self,
|
||||
spectral_flatness: float,
|
||||
is_tonal: bool,
|
||||
spectral_centroid: float,
|
||||
zcr: float,
|
||||
rms: float,
|
||||
) -> float:
|
||||
"""Compute a music likelihood score (0.0 - 1.0)."""
|
||||
score = 0.0
|
||||
|
||||
# Music tends to be tonal
|
||||
if is_tonal:
|
||||
score += 0.3
|
||||
|
||||
# Music has moderate spectral flatness (more than pure tone, less than noise)
|
||||
if 0.05 < spectral_flatness < 0.4:
|
||||
score += 0.2
|
||||
|
||||
# Music has sustained energy
|
||||
if rms > 0.03:
|
||||
score += 0.15
|
||||
|
||||
# Music has wider spectral content than speech
|
||||
if spectral_centroid > 1500:
|
||||
score += 0.15
|
||||
|
||||
# Music tends to have lower ZCR than noise
|
||||
if zcr < 0.15:
|
||||
score += 0.2
|
||||
|
||||
return min(1.0, score)
|
||||
|
||||
def _compute_speech_score(
|
||||
self,
|
||||
spectral_flatness: float,
|
||||
zcr: float,
|
||||
spectral_centroid: float,
|
||||
rms: float,
|
||||
) -> float:
|
||||
"""Compute a speech likelihood score (0.0 - 1.0)."""
|
||||
score = 0.0
|
||||
|
||||
# Speech has moderate spectral flatness
|
||||
if 0.1 < spectral_flatness < 0.5:
|
||||
score += 0.25
|
||||
|
||||
# Speech centroid typically 500-4000 Hz
|
||||
if 500 < spectral_centroid < 4000:
|
||||
score += 0.25
|
||||
|
||||
# Speech has moderate ZCR
|
||||
if 0.02 < zcr < 0.2:
|
||||
score += 0.25
|
||||
|
||||
# Speech has moderate energy
|
||||
if 0.01 < rms < 0.5:
|
||||
score += 0.25
|
||||
|
||||
return min(1.0, score)
|
||||
|
||||
def _looks_like_live_human(
|
||||
self,
|
||||
speech_score: float,
|
||||
zcr: float,
|
||||
rms: float,
|
||||
) -> bool:
|
||||
"""
|
||||
Distinguish live human from IVR/TTS.
|
||||
|
||||
Heuristics:
|
||||
- IVR prompts are followed by silence (waiting for input)
|
||||
- Live humans have more natural variation in energy and pitch
|
||||
- After hold music → speech transition, it's likely a human
|
||||
|
||||
This is the hardest classification and benefits most from
|
||||
the transcript context (Speaches STT).
|
||||
"""
|
||||
# Look at recent classification history
|
||||
recent = self._classification_history[-10:] if self._classification_history else []
|
||||
|
||||
# Key signal: if we were just listening to hold music and now
|
||||
# hear speech, it's very likely a live human agent
|
||||
if recent:
|
||||
recent_types = [c for c in recent]
|
||||
if AudioClassification.MUSIC in recent_types[-5:]:
|
||||
# Transition from music to speech = agent picked up!
|
||||
return True
|
||||
|
||||
# High speech score with good energy = more likely human
|
||||
if speech_score > 0.7 and rms > 0.05:
|
||||
return True
|
||||
|
||||
# Default: assume IVR until proven otherwise
|
||||
return False
|
||||
|
||||
def update_history(self, classification: AudioClassification) -> None:
|
||||
"""Track classification history for pattern detection."""
|
||||
self._classification_history.append(classification)
|
||||
# Keep last 100 classifications
|
||||
if len(self._classification_history) > 100:
|
||||
self._classification_history = self._classification_history[-100:]
|
||||
|
||||
def detect_hold_to_human_transition(self) -> bool:
|
||||
"""
|
||||
Detect the critical moment: hold music → live human.
|
||||
|
||||
Looks for pattern: MUSIC, MUSIC, MUSIC, ..., SPEECH/LIVE_HUMAN
|
||||
"""
|
||||
recent = self._classification_history[-20:]
|
||||
if len(recent) < 5:
|
||||
return False
|
||||
|
||||
# Count recent music vs speech
|
||||
music_count = sum(1 for c in recent[:-3] if c == AudioClassification.MUSIC)
|
||||
speech_count = sum(
|
||||
1 for c in recent[-3:]
|
||||
if c in (AudioClassification.LIVE_HUMAN, AudioClassification.IVR_PROMPT)
|
||||
)
|
||||
|
||||
# If we had a lot of music and now have speech, someone picked up
|
||||
return music_count >= 3 and speech_count >= 2
|
||||
Reference in New Issue
Block a user