Complete project scaffolding and core implementation of an AI-powered telephony system that calls companies, navigates IVR menus, waits on hold, and transfers to the user when a human answers. Key components: - FastAPI server with REST API, WebSocket, and MCP (SSE) interfaces - SIP/VoIP call management via PJSUA2 with RTP audio streaming - LLM-powered IVR navigation using OpenAI/Anthropic with tool calling - Hold detection service combining audio analysis and silence detection - Real-time STT (Whisper/Deepgram) and TTS (OpenAI/Piper) pipelines - Call recording with per-channel and mixed audio capture - Event bus (asyncio pub/sub) for real-time client updates - Web dashboard with live call monitoring - SQLite persistence via SQLAlchemy with call history and analytics - Notification support (email, SMS, webhook, desktop) - Docker Compose deployment with Opal VoIP and Opal Media containers - Comprehensive test suite with unit, integration, and E2E tests - Simplified .gitignore and full project documentation in README
445 lines · 15 KiB · Python
"""
|
|
Audio Classifier — Spectral analysis for hold music, speech, and silence detection.
|
|
|
|
This is the brain of the Hold Slayer. It analyzes audio in real-time to determine:
|
|
- Is this hold music?
|
|
- Is this an IVR prompt (automated voice)?
|
|
- Is this a live human?
|
|
- Is this silence?
|
|
- Is this a ring-back tone?
|
|
|
|
Uses spectral analysis (librosa/numpy) to classify audio without needing
|
|
a trained ML model — just signal processing and heuristics.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from typing import Optional
|
|
|
|
import numpy as np
|
|
|
|
from config import ClassifierSettings
|
|
from models.call import AudioClassification, ClassificationResult
|
|
|
|
# Module-level logger, named after this module per project convention.
logger = logging.getLogger(__name__)

# Audio constants
SAMPLE_RATE = 16000  # 16kHz mono
# NOTE(review): despite the name, this value (32000) equals bytes per *second*
# of 16-bit PCM (16000 samples x 2 bytes), not the size of a single frame —
# confirm intended meaning at call sites (unused in this chunk).
FRAME_SIZE = SAMPLE_RATE * 2  # 16-bit samples = 2 bytes per sample
|
class AudioClassifier:
    """
    Real-time audio classifier using spectral analysis.

    No trained ML model is involved: each chunk of 16 kHz mono PCM is reduced
    to a handful of signal-processing features (RMS energy, spectral flatness,
    zero-crossing rate, dominant frequency, spectral centroid, tonality) which
    are combined with heuristic thresholds.

    Classification strategy:
    - Silence: Low RMS energy
    - Music: High spectral flatness + sustained tonal content + rhythm
    - IVR prompt: Speech-like spectral envelope but repetitive/synthetic
    - Live human: Speech-like spectral envelope + natural variation
    - Ringing: Very tonal, specific frequencies (~440Hz, ~480Hz for NA ring)
    - DTMF: Dual-tone detection at known DTMF frequencies
    """

    def __init__(self, settings: ClassifierSettings):
        """
        Args:
            settings: Classifier thresholds and analysis-window configuration.
        """
        self.settings = settings
        # Buffer for callers that accumulate sub-window chunks. Reserved:
        # classify_chunk itself does not consume it.
        self._window_buffer: list[bytes] = []
        self._window_samples = int(settings.window_seconds * SAMPLE_RATE)
        # Rolling record of past results, fed via update_history(); used to
        # spot transitions such as hold music -> live human.
        self._classification_history: list[AudioClassification] = []

    def classify_chunk(self, audio_data: bytes) -> ClassificationResult:
        """
        Classify a chunk of audio data.

        Args:
            audio_data: Raw PCM audio (16-bit signed, 16kHz, mono)

        Returns:
            ClassificationResult with type and confidence
        """
        # Convert bytes to numpy array; empty input is trivially silence.
        samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)

        if len(samples) == 0:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.SILENCE,
                confidence=1.0,
            )

        # Normalize 16-bit PCM to [-1.0, 1.0]
        samples = samples / 32768.0

        # Run all feature extractors once up front
        rms = self._compute_rms(samples)
        spectral_flatness = self._compute_spectral_flatness(samples)
        zcr = self._compute_zero_crossing_rate(samples)
        dominant_freq = self._compute_dominant_frequency(samples)
        spectral_centroid = self._compute_spectral_centroid(samples)
        is_tonal = self._detect_tonality(samples)

        # Feature dict attached to every result for debugging/telemetry
        features = {
            "rms": float(rms),
            "spectral_flatness": float(spectral_flatness),
            "zcr": float(zcr),
            "dominant_freq": float(dominant_freq),
            "spectral_centroid": float(spectral_centroid),
            "is_tonal": is_tonal,
        }

        # === Classification Logic ===

        # 1. Silence detection: energy below a fixed floor. Confidence grows
        #    the further below the floor the RMS falls (capped at 1.0).
        if rms < 0.01:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.SILENCE,
                confidence=min(1.0, (0.01 - rms) / 0.01 + 0.5),
                details=features,
            )

        # 2. DTMF detection (very specific dual-tone pattern)
        dtmf_result = self._detect_dtmf(samples)
        if dtmf_result:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.DTMF,
                confidence=0.95,
                details={**features, "dtmf_digit": dtmf_result},
            )

        # 3. Ring-back tone detection (440+480Hz in NA, periodic on/off)
        if is_tonal and 400 < dominant_freq < 520 and rms > 0.02:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.RINGING,
                confidence=0.8,
                details=features,
            )

        # 4. Music vs Speech discrimination
        # Music: higher spectral flatness, more tonal, wider spectral spread
        # Speech: lower spectral flatness, concentrated energy, variable ZCR
        music_score = self._compute_music_score(
            spectral_flatness, is_tonal, spectral_centroid, zcr, rms
        )
        speech_score = self._compute_speech_score(
            spectral_flatness, zcr, spectral_centroid, rms
        )

        # 5. If it's speech-like, is it live or automated?
        if speech_score > music_score:
            # Use history to distinguish live human from IVR:
            # IVR = repetitive patterns, synthetic prosody;
            # human = natural variation, conversational rhythm.
            if self._looks_like_live_human(speech_score, zcr, rms):
                return ClassificationResult(
                    timestamp=time.time(),
                    audio_type=AudioClassification.LIVE_HUMAN,
                    confidence=speech_score,
                    details=features,
                )
            else:
                # Discount confidence: "speech but probably automated"
                return ClassificationResult(
                    timestamp=time.time(),
                    audio_type=AudioClassification.IVR_PROMPT,
                    confidence=speech_score * 0.8,
                    details=features,
                )

        # 6. Music (hold music)
        if music_score >= self.settings.music_threshold:
            return ClassificationResult(
                timestamp=time.time(),
                audio_type=AudioClassification.MUSIC,
                confidence=music_score,
                details=features,
            )

        # 7. Unknown / low confidence
        return ClassificationResult(
            timestamp=time.time(),
            audio_type=AudioClassification.UNKNOWN,
            confidence=max(music_score, speech_score),
            details=features,
        )

    # ================================================================
    # Feature Extraction
    # ================================================================

    @staticmethod
    def _compute_rms(samples: np.ndarray) -> float:
        """Root Mean Square — overall energy level."""
        return float(np.sqrt(np.mean(samples ** 2)))

    @staticmethod
    def _compute_spectral_flatness(samples: np.ndarray) -> float:
        """
        Spectral flatness (Wiener entropy).

        Close to 1.0 = noise-like (white noise)
        Close to 0.0 = tonal (pure tone, music)
        Speech is typically 0.1-0.4, music 0.05-0.3
        """
        fft = np.abs(np.fft.rfft(samples))
        fft = fft[fft > 0]  # Avoid log(0)

        if len(fft) == 0:
            return 0.0

        # Geometric / arithmetic mean of the magnitude spectrum; the epsilon
        # keeps the log finite for denormal-small bins.
        geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
        arithmetic_mean = np.mean(fft)

        if arithmetic_mean == 0:
            return 0.0

        return float(geometric_mean / arithmetic_mean)

    @staticmethod
    def _compute_zero_crossing_rate(samples: np.ndarray) -> float:
        """
        Zero-crossing rate — how often the signal crosses zero.

        Higher for unvoiced speech and noise.
        Lower for voiced speech and tonal music.
        """
        # Guard: an empty chunk has no crossings (avoids division by zero
        # when this helper is called outside classify_chunk's empty check).
        if len(samples) == 0:
            return 0.0
        crossings = np.sum(np.abs(np.diff(np.sign(samples)))) / 2
        return float(crossings / len(samples))

    @staticmethod
    def _compute_dominant_frequency(samples: np.ndarray) -> float:
        """Find the dominant frequency (Hz) in the signal."""
        fft = np.abs(np.fft.rfft(samples))
        freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)

        # Ignore DC and very low frequencies
        mask = freqs > 50
        if not np.any(mask):
            return 0.0

        fft_masked = fft[mask]
        freqs_masked = freqs[mask]

        return float(freqs_masked[np.argmax(fft_masked)])

    @staticmethod
    def _compute_spectral_centroid(samples: np.ndarray) -> float:
        """
        Spectral centroid — "center of mass" of the spectrum.

        Higher for bright/treble sounds, lower for bass-heavy sounds.
        Speech typically 500-4000Hz, music varies widely.
        """
        fft = np.abs(np.fft.rfft(samples))
        freqs = np.fft.rfftfreq(len(samples), 1.0 / SAMPLE_RATE)

        total_energy = np.sum(fft)
        if total_energy == 0:
            return 0.0

        return float(np.sum(freqs * fft) / total_energy)

    @staticmethod
    def _detect_tonality(samples: np.ndarray) -> bool:
        """
        Check if the signal is strongly tonal (has clear pitch).
        Uses autocorrelation: a strong off-zero peak indicates periodicity.
        """
        # Guard: empty input has no autocorrelation (original indexed [0]).
        if len(samples) == 0:
            return False

        # Autocorrelation; keep only non-negative lags
        correlation = np.correlate(samples, samples, mode="full")
        correlation = correlation[len(correlation) // 2:]

        # Normalize by zero-lag energy; all-zero input is not tonal
        if correlation[0] == 0:
            return False
        correlation = correlation / correlation[0]

        # Look for a strong peak in the lag range corresponding to a
        # 50Hz-1000Hz fundamental (~16 to ~320 samples at 16kHz).
        min_lag = int(SAMPLE_RATE / 1000)  # ~16 samples (1000Hz max)
        max_lag = int(SAMPLE_RATE / 50)    # ~320 samples (50Hz min)

        search_region = correlation[min_lag:max_lag]
        if len(search_region) == 0:
            return False

        peak_value = np.max(search_region)
        return bool(peak_value > 0.5)

    def _detect_dtmf(self, samples: np.ndarray) -> Optional[str]:
        """
        Detect DTMF tones via single-bin DFT power (Goertzel-equivalent).

        DTMF frequencies:
            697, 770, 852, 941 Hz (row)
            1209, 1336, 1477, 1633 Hz (column)

        Returns:
            The detected symbol ("0"-"9", "*", "#", "A"-"D"), or None.
        """
        dtmf_freqs_low = [697, 770, 852, 941]
        dtmf_freqs_high = [1209, 1336, 1477, 1633]
        dtmf_map = {
            (697, 1209): "1", (697, 1336): "2", (697, 1477): "3", (697, 1633): "A",
            (770, 1209): "4", (770, 1336): "5", (770, 1477): "6", (770, 1633): "B",
            (852, 1209): "7", (852, 1336): "8", (852, 1477): "9", (852, 1633): "C",
            (941, 1209): "*", (941, 1336): "0", (941, 1477): "#", (941, 1633): "D",
        }

        n = len(samples)
        # Cheap guards hoisted before any per-frequency work: silence (or an
        # empty chunk) cannot contain a tone.
        if n == 0:
            return None
        total_power = float(np.sum(samples ** 2))
        if total_power == 0:
            return None

        def bin_power(freq: int) -> float:
            # Power |X[k]|^2 at the DFT bin nearest `freq`. This is exactly
            # the quantity the classic Goertzel recurrence
            # (s1^2 + s2^2 - coeff*s1*s2) yields, computed as a vectorized
            # dot product instead of a per-sample Python loop.
            k = int(0.5 + n * freq / SAMPLE_RATE)
            phasor = np.exp(-2j * np.pi * k * np.arange(n) / n)
            return float(np.abs(np.dot(samples, phasor)) ** 2)

        # Find strongest low and high frequencies
        low_powers = [(f, bin_power(f)) for f in dtmf_freqs_low]
        high_powers = [(f, bin_power(f)) for f in dtmf_freqs_high]

        best_low = max(low_powers, key=lambda x: x[1])
        best_high = max(high_powers, key=lambda x: x[1])

        # Threshold: both a row and a column tone must carry a significant
        # share of the chunk's energy to count as DTMF.
        threshold = total_power * 0.1
        if best_low[1] > threshold and best_high[1] > threshold:
            key = (best_low[0], best_high[0])
            return dtmf_map.get(key)

        return None

    # ================================================================
    # Higher-Level Classification
    # ================================================================

    def _compute_music_score(
        self,
        spectral_flatness: float,
        is_tonal: bool,
        spectral_centroid: float,
        zcr: float,
        rms: float,
    ) -> float:
        """Compute a music likelihood score (0.0 - 1.0) by summing weighted
        heuristic indicators; each band below contributes a fixed weight."""
        score = 0.0

        # Music tends to be tonal
        if is_tonal:
            score += 0.3

        # Music has moderate spectral flatness (more than pure tone, less than noise)
        if 0.05 < spectral_flatness < 0.4:
            score += 0.2

        # Music has sustained energy
        if rms > 0.03:
            score += 0.15

        # Music has wider spectral content than speech
        if spectral_centroid > 1500:
            score += 0.15

        # Music tends to have lower ZCR than noise
        if zcr < 0.15:
            score += 0.2

        return min(1.0, score)

    def _compute_speech_score(
        self,
        spectral_flatness: float,
        zcr: float,
        spectral_centroid: float,
        rms: float,
    ) -> float:
        """Compute a speech likelihood score (0.0 - 1.0); four equally
        weighted indicators of a speech-like signal."""
        score = 0.0

        # Speech has moderate spectral flatness
        if 0.1 < spectral_flatness < 0.5:
            score += 0.25

        # Speech centroid typically 500-4000 Hz
        if 500 < spectral_centroid < 4000:
            score += 0.25

        # Speech has moderate ZCR
        if 0.02 < zcr < 0.2:
            score += 0.25

        # Speech has moderate energy
        if 0.01 < rms < 0.5:
            score += 0.25

        return min(1.0, score)

    def _looks_like_live_human(
        self,
        speech_score: float,
        zcr: float,
        rms: float,
    ) -> bool:
        """
        Distinguish live human from IVR/TTS.

        Heuristics:
        - IVR prompts are followed by silence (waiting for input)
        - Live humans have more natural variation in energy and pitch
        - After hold music → speech transition, it's likely a human

        This is the hardest classification and benefits most from
        the transcript context (Speaches STT).

        Note: `zcr` is accepted for interface stability but not currently
        used by these heuristics.
        """
        # Key signal: if we were just listening to hold music and now
        # hear speech, it's very likely a live human agent.
        # (Slicing an empty history safely yields an empty list.)
        recent = self._classification_history[-10:]
        if AudioClassification.MUSIC in recent[-5:]:
            # Transition from music to speech = agent picked up!
            return True

        # High speech score with good energy = more likely human
        if speech_score > 0.7 and rms > 0.05:
            return True

        # Default: assume IVR until proven otherwise
        return False

    def update_history(self, classification: AudioClassification) -> None:
        """Track classification history for pattern detection.

        Callers feed each per-chunk result here; the history backs the
        music->human transition heuristics."""
        self._classification_history.append(classification)
        # Keep last 100 classifications to bound memory
        if len(self._classification_history) > 100:
            self._classification_history = self._classification_history[-100:]

    def detect_hold_to_human_transition(self) -> bool:
        """
        Detect the critical moment: hold music → live human.

        Looks for pattern: MUSIC, MUSIC, MUSIC, ..., SPEECH/LIVE_HUMAN

        Returns:
            True if the recent history shows sustained music followed by
            speech in the newest few entries.
        """
        recent = self._classification_history[-20:]
        if len(recent) < 5:
            return False

        # Count music in the older portion vs speech in the newest 3 entries
        music_count = sum(1 for c in recent[:-3] if c == AudioClassification.MUSIC)
        speech_count = sum(
            1 for c in recent[-3:]
            if c in (AudioClassification.LIVE_HUMAN, AudioClassification.IVR_PROMPT)
        )

        # If we had a lot of music and now have speech, someone picked up
        return music_count >= 3 and speech_count >= 2
|