"""
Audio Classifier — Spectral analysis for hold music, speech, and silence detection.

This is the brain of the Hold Slayer. It analyzes audio in real-time to determine:
- Is this hold music?
- Is this an IVR prompt (automated voice)?
- Is this a live human?
- Is this silence?
- Is this a ring-back tone?

Uses numpy-based spectral analysis to classify audio without needing
a trained ML model — just signal processing and heuristics.
"""

import logging
import time
from typing import Optional

import numpy as np

from config import ClassifierSettings
from models.call import AudioClassification, ClassificationResult

logger = logging.getLogger(__name__)

# Audio constants
SAMPLE_RATE = 16000  # 16kHz mono
FRAME_SIZE = SAMPLE_RATE * 2  # 16-bit samples = 2 bytes per sample

# Width of one PCM sample in bytes (16-bit signed).
_BYTES_PER_SAMPLE = 2

# RMS level (on the [-1, 1] normalised scale) below which a chunk is silence.
_SILENCE_RMS = 0.01

# Number of classifications retained by ``update_history``.
_HISTORY_LIMIT = 100

# DTMF row / column frequencies (Hz) and the key each pair encodes.
_DTMF_LOW_FREQS = (697, 770, 852, 941)
_DTMF_HIGH_FREQS = (1209, 1336, 1477, 1633)
_DTMF_MAP = {
    (697, 1209): "1", (697, 1336): "2", (697, 1477): "3", (697, 1633): "A",
    (770, 1209): "4", (770, 1336): "5", (770, 1477): "6", (770, 1633): "B",
    (852, 1209): "7", (852, 1336): "8", (852, 1477): "9", (852, 1633): "C",
    (941, 1209): "*", (941, 1336): "0", (941, 1477): "#", (941, 1633): "D",
}

# Minimum share of the chunk's energy that each DTMF tone must carry.
_DTMF_MIN_ENERGY_FRACTION = 0.1


class AudioClassifier:
    """
    Real-time audio classifier using spectral analysis.

    Classification strategy (checked in this order by ``classify_chunk``):
    - Silence: Low RMS energy
    - DTMF: Dual-tone detection at known DTMF frequencies
    - Ringing: Tonal, dominant frequency between 400 and 520 Hz
      (~440Hz / ~480Hz for NA ring-back)
    - Live human / IVR prompt: speech score beats music score; recent
      history and energy decide between the two
    - Music: Tonal content + moderate spectral flatness + sustained energy,
      scoring at least ``settings.music_threshold``
    - Unknown: none of the above
    """

    def __init__(self, settings: ClassifierSettings):
        self.settings = settings
        self._window_buffer: list[bytes] = []
        self._window_samples = int(settings.window_seconds * SAMPLE_RATE)
        self._classification_history: list[AudioClassification] = []

    @staticmethod
    def _result(
        audio_type: AudioClassification,
        confidence: float,
        details: Optional[dict] = None,
    ) -> ClassificationResult:
        """Build a ClassificationResult stamped with the current time."""
        # Only forward ``details`` when supplied so the model's own default
        # is used otherwise.
        extra = {} if details is None else {"details": details}
        return ClassificationResult(
            timestamp=time.time(),
            audio_type=audio_type,
            confidence=confidence,
            **extra,
        )

    def classify_chunk(self, audio_data: bytes) -> ClassificationResult:
        """
        Classify a chunk of audio data.

        This method does not record its result in the history; callers that
        want history-based heuristics must call ``update_history`` themselves.

        Args:
            audio_data: Raw PCM audio (16-bit signed, 16kHz, mono). Samples
                are read in the platform's native byte order. A trailing
                odd byte (partial sample) is ignored.

        Returns:
            ClassificationResult with type and confidence
        """
        # A streamed chunk may end mid-sample; np.frombuffer would raise on
        # an odd byte count, so read only the whole samples.
        sample_count = len(audio_data) // _BYTES_PER_SAMPLE
        if sample_count == 0:
            return self._result(AudioClassification.SILENCE, 1.0)

        # Convert bytes to numpy array, normalized to [-1.0, 1.0]
        samples = (
            np.frombuffer(audio_data, dtype=np.int16, count=sample_count)
            .astype(np.float32)
            / 32768.0
        )

        # Run all detectors
        rms = self._compute_rms(samples)
        spectral_flatness = self._compute_spectral_flatness(samples)
        zcr = self._compute_zero_crossing_rate(samples)
        dominant_freq = self._compute_dominant_frequency(samples)
        spectral_centroid = self._compute_spectral_centroid(samples)
        is_tonal = self._detect_tonality(samples)

        # Build feature dict for debugging
        features = {
            "rms": float(rms),
            "spectral_flatness": float(spectral_flatness),
            "zcr": float(zcr),
            "dominant_freq": float(dominant_freq),
            "spectral_centroid": float(spectral_centroid),
            "is_tonal": is_tonal,
        }

        # === Classification Logic ===

        # 1. Silence detection. Confidence is 1.0 up to half the threshold
        #    and falls linearly to 0.5 as rms approaches the threshold.
        if rms < _SILENCE_RMS:
            confidence = min(1.0, (_SILENCE_RMS - rms) / _SILENCE_RMS + 0.5)
            return self._result(AudioClassification.SILENCE, confidence, features)

        # 2. DTMF detection (very specific dual-tone pattern)
        dtmf_digit = self._detect_dtmf(samples)
        if dtmf_digit:
            return self._result(
                AudioClassification.DTMF,
                0.95,
                {**features, "dtmf_digit": dtmf_digit},
            )

        # 3. Ring-back tone detection (440+480Hz in NA). Only the frequency
        #    and tonality are checked here, not the on/off cadence.
        if is_tonal and 400 < dominant_freq < 520 and rms > 0.02:
            return self._result(AudioClassification.RINGING, 0.8, features)

        # 4. Music vs Speech discrimination
        music_score = self._compute_music_score(
            spectral_flatness, is_tonal, spectral_centroid, zcr, rms
        )
        speech_score = self._compute_speech_score(
            spectral_flatness, zcr, spectral_centroid, rms
        )

        # 5. If it's speech-like, is it live or automated?
        if speech_score > music_score:
            if self._looks_like_live_human(speech_score, zcr, rms):
                return self._result(
                    AudioClassification.LIVE_HUMAN, speech_score, features
                )
            return self._result(
                AudioClassification.IVR_PROMPT, speech_score * 0.8, features
            )

        # 6. Music (hold music)
        if music_score >= self.settings.music_threshold:
            return self._result(AudioClassification.MUSIC, music_score, features)

        # 7. Unknown / low confidence
        return self._result(
            AudioClassification.UNKNOWN, max(music_score, speech_score), features
        )

    # ================================================================
    # Feature Extraction
    # ================================================================

    @staticmethod
    def _compute_rms(samples: np.ndarray) -> float:
        """Root Mean Square — overall energy level."""
        return float(np.sqrt(np.mean(samples ** 2)))

    @staticmethod
    def _compute_spectral_flatness(samples: np.ndarray) -> float:
        """
        Spectral flatness (Wiener entropy).

        Ratio of the geometric mean to the arithmetic mean of the non-zero
        FFT magnitudes.

        Close to 1.0 = noise-like (white noise)
        Close to 0.0 = tonal (pure tone, music)

        Returns 0.0 when every FFT magnitude is zero.
        """
        magnitudes = np.abs(np.fft.rfft(samples))
        magnitudes = magnitudes[magnitudes > 0]  # Avoid log(0)
        if magnitudes.size == 0:
            return 0.0

        geometric_mean = np.exp(np.mean(np.log(magnitudes + 1e-10)))
        arithmetic_mean = np.mean(magnitudes)
        return float(geometric_mean / arithmetic_mean)

    @staticmethod
    def _compute_zero_crossing_rate(samples: np.ndarray) -> float:
        """
        Zero-crossing rate — sign changes per sample.

        Higher for unvoiced speech and noise.
        Lower for voiced speech and tonal music.

        Returns 0.0 for empty input.
        """
        if len(samples) == 0:
            return 0.0
        # A full sign flip (-1 -> +1) contributes 2 to the absolute
        # difference, hence the division by 2.
        crossings = np.sum(np.abs(np.diff(np.sign(samples)))) / 2
        return float(crossings / len(samples))

    @staticmethod
    def _compute_dominant_frequency(samples: np.ndarray) -> float:
        """
        Find the dominant frequency (Hz) in the signal.

        Bins at or below 50 Hz are ignored (DC and very low frequencies).
        Returns 0.0 when no bin lies above 50 Hz.
        """
        magnitudes = np.abs(np.fft.rfft(samples))
        freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)

        audible = freqs > 50
        if not np.any(audible):
            return 0.0

        return float(freqs[audible][np.argmax(magnitudes[audible])])

    @staticmethod
    def _compute_spectral_centroid(samples: np.ndarray) -> float:
        """
        Spectral centroid — "center of mass" of the spectrum, in Hz.

        Higher for bright/treble sounds, lower for bass-heavy sounds.
        Returns 0.0 when the spectrum has no energy.
        """
        magnitudes = np.abs(np.fft.rfft(samples))
        freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)

        total = np.sum(magnitudes)
        if total == 0:
            return 0.0

        return float(np.sum(freqs * magnitudes) / total)

    @staticmethod
    def _detect_tonality(samples: np.ndarray) -> bool:
        """
        Check if the signal is strongly tonal (has clear pitch).

        Uses the autocorrelation, normalised by the zero-lag energy, and
        reports True when any lag corresponding to a pitch between 50 Hz
        and 1000 Hz exceeds 0.5.
        """
        n = len(samples)
        min_lag = SAMPLE_RATE // 1000  # 16 samples  -> 1000 Hz upper bound
        # Lags only exist up to n - 1, so clip the 50 Hz bound (320 samples).
        max_lag = min(SAMPLE_RATE // 50, n)
        if max_lag <= min_lag:
            return False

        energy = float(np.dot(samples, samples))  # autocorrelation at lag 0
        if energy == 0.0:
            return False

        # Only the lags in the search range are evaluated; a full
        # autocorrelation would be O(n^2) for values that are never read.
        peak = max(
            float(np.dot(samples[:-lag], samples[lag:]))
            for lag in range(min_lag, max_lag)
        )
        return bool(peak / energy > 0.5)

    def _detect_dtmf(self, samples: np.ndarray) -> Optional[str]:
        """
        Detect a DTMF digit from the power at the eight DTMF frequencies.

        DTMF frequencies:
            697, 770, 852, 941 Hz (row)
            1209, 1336, 1477, 1633 Hz (column)

        The power of each tone is read from the DFT bin nearest to its
        frequency (the same quantity a Goertzel filter tuned to that bin
        produces) and expressed as a fraction of the chunk's total energy.
        A digit is reported only when both the strongest row tone and the
        strongest column tone carry more than 10% of the total energy.

        Returns:
            The detected key ("0"-"9", "A"-"D", "*", "#"), or None.
        """
        signal = np.asarray(samples, dtype=np.float64)
        n = len(signal)
        if n == 0:
            return None

        total_energy = float(np.dot(signal, signal))
        if total_energy == 0.0:
            return None

        bin_power = np.abs(np.fft.rfft(signal)) ** 2
        last_bin = len(bin_power) - 1

        def energy_fraction(freq: int) -> float:
            # Nearest DFT bin to the requested frequency.
            k = min(int(0.5 + n * freq / SAMPLE_RATE), last_bin)
            # Parseval: sum(x^2) == sum(|X|^2) / n. The factor 2 accounts
            # for the mirrored negative-frequency bin of a real signal, so
            # a pure tone centred on bin k yields a fraction of 1.0.
            return float(2.0 * bin_power[k] / (n * total_energy))

        # Find strongest low and high frequencies
        best_low = max(_DTMF_LOW_FREQS, key=energy_fraction)
        best_high = max(_DTMF_HIGH_FREQS, key=energy_fraction)

        # Threshold: both frequencies must be significantly present
        if (
            energy_fraction(best_low) > _DTMF_MIN_ENERGY_FRACTION
            and energy_fraction(best_high) > _DTMF_MIN_ENERGY_FRACTION
        ):
            return _DTMF_MAP.get((best_low, best_high))

        return None

    # ================================================================
    # Higher-Level Classification
    # ================================================================

    def _compute_music_score(
        self,
        spectral_flatness: float,
        is_tonal: bool,
        spectral_centroid: float,
        zcr: float,
        rms: float,
    ) -> float:
        """Compute a music likelihood score (0.0 - 1.0)."""
        score = 0.0

        # Music tends to be tonal
        if is_tonal:
            score += 0.3

        # Music has moderate spectral flatness (more than pure tone, less than noise)
        if 0.05 < spectral_flatness < 0.4:
            score += 0.2

        # Music has sustained energy
        if rms > 0.03:
            score += 0.15

        # Music has wider spectral content than speech
        if spectral_centroid > 1500:
            score += 0.15

        # Music tends to have lower ZCR than noise
        if zcr < 0.15:
            score += 0.2

        return min(1.0, score)

    def _compute_speech_score(
        self,
        spectral_flatness: float,
        zcr: float,
        spectral_centroid: float,
        rms: float,
    ) -> float:
        """Compute a speech likelihood score (0.0 - 1.0)."""
        score = 0.0

        # Speech has moderate spectral flatness
        if 0.1 < spectral_flatness < 0.5:
            score += 0.25

        # Speech centroid typically 500-4000 Hz
        if 500 < spectral_centroid < 4000:
            score += 0.25

        # Speech has moderate ZCR
        if 0.02 < zcr < 0.2:
            score += 0.25

        # Speech has moderate energy
        if 0.01 < rms < 0.5:
            score += 0.25

        return min(1.0, score)

    def _looks_like_live_human(
        self,
        speech_score: float,
        zcr: float,
        rms: float,
    ) -> bool:
        """
        Distinguish live human from IVR/TTS.

        Returns True when either:
        - MUSIC appears among the last five recorded classifications
          (hold music → speech transition suggests an agent picked up), or
        - the speech score exceeds 0.7 and rms exceeds 0.05.

        Otherwise the audio is assumed to be an IVR prompt. ``zcr`` is
        accepted for interface compatibility but is not used.
        """
        # Key signal: if we were just listening to hold music and now
        # hear speech, it's very likely a live human agent
        if AudioClassification.MUSIC in self._classification_history[-5:]:
            return True

        # High speech score with good energy = more likely human.
        # Default: assume IVR until proven otherwise.
        return speech_score > 0.7 and rms > 0.05

    def update_history(self, classification: AudioClassification) -> None:
        """Track classification history for pattern detection (last 100 kept)."""
        self._classification_history.append(classification)

        if len(self._classification_history) > _HISTORY_LIMIT:
            self._classification_history = self._classification_history[
                -_HISTORY_LIMIT:
            ]

    def detect_hold_to_human_transition(self) -> bool:
        """
        Detect the critical moment: hold music → live human.

        Looks at the last 20 recorded classifications (at least 5 are
        required). Returns True when at least 3 of the entries before the
        final three are MUSIC and at least 2 of the final three are
        LIVE_HUMAN or IVR_PROMPT.
        """
        recent = self._classification_history[-20:]
        if len(recent) < 5:
            return False

        speech_types = (
            AudioClassification.LIVE_HUMAN,
            AudioClassification.IVR_PROMPT,
        )
        music_count = sum(1 for c in recent[:-3] if c == AudioClassification.MUSIC)
        speech_count = sum(1 for c in recent[-3:] if c in speech_types)

        # If we had a lot of music and now have speech, someone picked up
        return music_count >= 3 and speech_count >= 2