# Maia/backend/app/services/whisper_service.py

import httpx
import logging
import os
from app.config import settings

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Maps a lower-cased file extension (leading dot included) to the MIME type
# sent as the content type of the uploaded audio. Extensions not listed here
# are handled by the caller's own default.
MIME_TYPES = {
    ".webm": "audio/webm",
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/mp4",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
}

# Statuses from /api/v2/transcribe that trigger the non-diarized fallback
# instead of being raised to the caller.
_DIARIZATION_FALLBACK_STATUSES = (404, 422, 500)

# Timeout, in seconds, applied to every request sent to the Whisper server.
_WHISPER_TIMEOUT_SECONDS = 600.0


def _normalize_fallback_result(result: dict, default_language: str) -> dict:
    """Convert a /transcribe-segments payload into the diarized response shape."""
    # Tolerate a missing or null "segments" entry.
    segments = result.get("segments") or []
    normalized = [
        {
            "start": seg.get("start", 0.0),
            "end": seg.get("end", 0.0),
            # No diarization on this endpoint: attribute everything to one speaker.
            "speaker": "SPEAKER_01",
            # `or ""` guards against an explicit null text value.
            "text": (seg.get("text") or "").strip(),
        }
        for seg in segments
    ]
    return {
        "segments": normalized,
        "speakers_count": 1,
        # Read from the normalized segment so a missing "end" key yields 0.0
        # instead of raising KeyError.
        "audio_duration": normalized[-1]["end"] if normalized else 0.0,
        "language": result.get("language", default_language),
        "full_text": result.get("text", ""),
        "diarization_available": False,
        "diarization_method": "none",
    }


async def transcribe_audio(audio_path: str, *, language: str = "pt") -> dict:
    """
    Send an audio file to the Whisper server and return its transcription.

    1) Tries POST /api/v2/transcribe (diarization, upload field: audio) and
       returns the server's JSON response unchanged.
    2) If that endpoint answers 404, 422 or 500, or the attempt fails with a
       non-HTTP-status error, falls back to POST /transcribe-segments
       (upload field: file, no diarization) and labels every segment
       SPEAKER_01.

    Args:
        audio_path: Path of the audio file to upload. The extension selects
            the upload MIME type; unknown extensions use "audio/webm".
        language: Language code sent to the server in the "language" form
            field. Defaults to "pt".

    Returns:
        The JSON body of /api/v2/transcribe when diarization succeeds.
        Otherwise a dict with the keys "segments" (each with "start", "end",
        "speaker", "text"), "speakers_count", "audio_duration", "language",
        "full_text", "diarization_available" and "diarization_method".

    Raises:
        Exception: "Whisper server unavailable: ..." when the connection to
            the server fails (chained from the httpx.ConnectError).
        httpx.HTTPStatusError: For an error status outside the fallback set
            on the diarization endpoint, or any error status on the fallback
            endpoint.
        OSError: If the audio file cannot be read.
    """
    logger.info(f"Sending audio to Whisper server: {audio_path}")
    try:
        # NOTE: this read is synchronous and holds the whole file in memory;
        # the bytes are reused for the fallback upload.
        with open(audio_path, "rb") as f:
            content = f.read()
        filename = os.path.basename(audio_path)
        ext = os.path.splitext(filename)[1].lower()
        mime_type = MIME_TYPES.get(ext, "audio/webm")
        data = {"language": language}

        # The client-level timeout applies to both requests below.
        async with httpx.AsyncClient(timeout=_WHISPER_TIMEOUT_SECONDS) as http:
            # 1) Try /api/v2/transcribe with diarization (field: audio)
            try:
                resp = await http.post(
                    f"{settings.WHISPER_SERVER_URL}/api/v2/transcribe",
                    files={"audio": (filename, content, mime_type)},
                    data=data,
                )
                if resp.status_code in _DIARIZATION_FALLBACK_STATUSES:
                    logger.warning(f"/api/v2/transcribe returned {resp.status_code}, falling back...")
                else:
                    resp.raise_for_status()
                    result = resp.json()
                    logger.info(f"Diarized transcription completed for {audio_path} "
                                f"({result.get('speakers_count', '?')} speakers, "
                                f"method: {result.get('diarization_method', '?')})")
                    return result
            except httpx.HTTPStatusError:
                # Only reachable for statuses outside the fallback set, which
                # are real failures and must not be hidden by the fallback.
                raise
            except Exception as e:
                # Best effort: transport, decoding or payload-shape problems
                # on the diarization endpoint still get a fallback attempt.
                logger.warning(f"/api/v2/transcribe exception: {e}, falling back...")

            # 2) Fallback: /transcribe-segments (no speaker diarization)
            resp = await http.post(
                f"{settings.WHISPER_SERVER_URL}/transcribe-segments",
                files={"file": (filename, content, mime_type)},
                data=data,
            )
            resp.raise_for_status()
            result = resp.json()
            logger.info(f"Transcription via fallback /transcribe-segments completed for {audio_path}")
            return _normalize_fallback_result(result, language)

    except httpx.ConnectError as e:
        logger.error(f"Cannot connect to Whisper server at {settings.WHISPER_SERVER_URL}: {e}")
        # Chain explicitly so the original connection error stays attached.
        raise Exception(f"Whisper server unavailable: {str(e)}") from e
    except Exception as e:
        logger.error(f"Transcription error: {e}")
        raise