import asyncio
import logging
import os

import httpx

from app.config import settings

logger = logging.getLogger(__name__)

# Lower-cased file extension -> MIME type sent with the multipart upload.
MIME_TYPES = {
    ".webm": "audio/webm",
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/mp4",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
}

# MIME type used when the extension is not listed in MIME_TYPES.
_DEFAULT_MIME_TYPE = "audio/webm"

# Responses from /api/v2/transcribe with these statuses trigger the
# /transcribe-segments fallback instead of being raised as errors.
_FALLBACK_STATUS_CODES = (404, 422, 500)

# Timeout (seconds) configured on the HTTP client and applied to every request.
_REQUEST_TIMEOUT_SECONDS = 600.0


def _read_audio_bytes(audio_path: str) -> bytes:
    """Return the full contents of *audio_path* (blocking file I/O)."""
    with open(audio_path, "rb") as audio_file:
        return audio_file.read()


def _build_fallback_result(result: dict, language: str) -> dict:
    """Convert a /transcribe-segments payload into the diarized result shape.

    Every segment is attributed to ``SPEAKER_01`` because the fallback
    endpoint provides no speaker labels. Missing or ``null`` ``segments`` /
    ``text`` fields are treated as empty instead of raising.
    """
    segments = result.get("segments") or []
    normalized = [
        {
            "start": seg.get("start", 0.0),
            "end": seg.get("end", 0.0),
            "speaker": "SPEAKER_01",
            "text": (seg.get("text") or "").strip(),
        }
        for seg in segments
    ]
    return {
        "segments": normalized,
        "speakers_count": 1,
        # Read from the normalized list so a segment lacking "end" yields 0.0
        # here too, instead of a KeyError.
        "audio_duration": normalized[-1]["end"] if normalized else 0.0,
        "language": result.get("language", language),
        "full_text": result.get("text", ""),
        "diarization_available": False,
        "diarization_method": "none",
    }


async def transcribe_audio(audio_path: str, *, language: str = "pt") -> dict:
    """Send an audio file to the Whisper server and return its transcription.

    1) Tries POST /api/v2/transcribe (diarization, multipart field ``audio``)
       and returns the server's JSON response unchanged.
    2) If that endpoint answers 404/422/500, or the attempt fails with any
       exception other than an HTTP status error, falls back to
       POST /transcribe-segments (multipart field ``file``, no diarization)
       and assigns ``SPEAKER_01`` to every segment.

    Args:
        audio_path: Path of the audio file to upload. The MIME type is chosen
            from the file extension, defaulting to ``audio/webm``.
        language: Language code sent to the server as the ``language`` form
            field, and used as the result language when the fallback response
            does not report one.

    Returns:
        The diarization endpoint's JSON response, or, on fallback, a dict with
        the keys ``segments``, ``speakers_count``, ``audio_duration``,
        ``language``, ``full_text``, ``diarization_available`` and
        ``diarization_method``.

    Raises:
        Exception: "Whisper server unavailable: ..." when the connection to
            the server fails (chained from the ``httpx.ConnectError``).
        httpx.HTTPStatusError: When an endpoint returns an error status that
            is not handled by the fallback.
        Any other error (e.g. ``OSError`` while reading the file) is logged
        and re-raised unchanged.
    """
    logger.info("Sending audio to Whisper server: %s", audio_path)

    try:
        # Read the file in the default executor so a large upload does not
        # block the event loop.
        loop = asyncio.get_running_loop()
        content = await loop.run_in_executor(None, _read_audio_bytes, audio_path)

        filename = os.path.basename(audio_path)
        ext = os.path.splitext(filename)[1].lower()
        mime_type = MIME_TYPES.get(ext, _DEFAULT_MIME_TYPE)

        async with httpx.AsyncClient(timeout=_REQUEST_TIMEOUT_SECONDS) as http:
            # 1) Try /api/v2/transcribe with diarization (field: audio)
            try:
                resp = await http.post(
                    f"{settings.WHISPER_SERVER_URL}/api/v2/transcribe",
                    files={"audio": (filename, content, mime_type)},
                    data={"language": language},
                )
                if resp.status_code not in _FALLBACK_STATUS_CODES:
                    resp.raise_for_status()
                    result = resp.json()
                    logger.info(
                        "Diarized transcription completed for %s "
                        "(%s speakers, method: %s)",
                        audio_path,
                        result.get("speakers_count", "?"),
                        result.get("diarization_method", "?"),
                    )
                    return result
                logger.warning(
                    "/api/v2/transcribe returned %s, falling back...",
                    resp.status_code,
                )
            except httpx.HTTPStatusError:
                # raise_for_status() is only reached for statuses outside
                # _FALLBACK_STATUS_CODES, so these errors always propagate.
                raise
            except Exception as e:
                # Deliberate best effort: any other failure of the diarization
                # attempt (transport error, invalid JSON, ...) triggers the
                # fallback endpoint.
                logger.warning(
                    "/api/v2/transcribe exception: %s, falling back...", e
                )

            # 2) Fallback: /transcribe-segments (no speaker diarization)
            resp = await http.post(
                f"{settings.WHISPER_SERVER_URL}/transcribe-segments",
                files={"file": (filename, content, mime_type)},
                data={"language": language},
            )
            resp.raise_for_status()
            result = resp.json()
            logger.info(
                "Transcription via fallback /transcribe-segments completed for %s",
                audio_path,
            )
            return _build_fallback_result(result, language)

    except httpx.ConnectError as e:
        logger.error(
            "Cannot connect to Whisper server at %s: %s",
            settings.WHISPER_SERVER_URL,
            e,
        )
        raise Exception(f"Whisper server unavailable: {e}") from e
    except Exception as e:
        logger.error("Transcription error: %s", e)
        raise