Maia/backend/app/services/whisper_service.py

92 lines
3.7 KiB
Python

import httpx
import logging
import os
from app.config import settings
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Lower-case file extension (leading dot included) -> MIME type declared for
# the audio part of the multipart upload. Lookups are expected to lower-case
# the extension first; callers choose their own default for unknown suffixes.
MIME_TYPES = {
    ".webm": "audio/webm",
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/mp4",  # M4A is an MP4 container, hence audio/mp4
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
}
# Statuses from the diarization endpoint that mean "try the plain endpoint
# instead" rather than "fail the whole transcription".
_FALLBACK_STATUS_CODES = (404, 422, 500)


def _build_fallback_result(result: dict, language: str) -> dict:
    """Convert a /transcribe-segments payload into the diarized result layout.

    Every segment is attributed to ``SPEAKER_01`` because the fallback
    endpoint performs no diarization. Missing or null ``segments`` / ``text``
    values are treated as empty instead of raising.
    """
    segments = [
        {
            "start": seg.get("start", 0.0),
            "end": seg.get("end", 0.0),
            "speaker": "SPEAKER_01",
            "text": (seg.get("text") or "").strip(),
        }
        for seg in (result.get("segments") or [])
    ]
    return {
        "segments": segments,
        "speakers_count": 1,
        # Read from the normalized segment so a missing "end" yields 0.0
        # instead of a KeyError on the raw payload.
        "audio_duration": segments[-1]["end"] if segments else 0.0,
        "language": result.get("language", language),
        "full_text": result.get("text", ""),
        "diarization_available": False,
        "diarization_method": "none",
    }


async def transcribe_audio(audio_path: str, language: str = "pt") -> dict:
    """
    Send an audio file to the Whisper server and return its transcription.

    1) Tries POST /api/v2/transcribe (diarization, multipart field ``audio``)
       and returns the server's JSON body unchanged.
    2) If that endpoint answers 404/422/500, or the attempt fails with a
       non-HTTP-status error, falls back to POST /transcribe-segments
       (multipart field ``file``, no diarization) and assigns SPEAKER_01 to
       every segment.

    Args:
        audio_path: Path of the audio file to upload. The MIME type is derived
            from its extension via ``MIME_TYPES`` (default ``audio/webm``).
        language: Language code sent to the server as the ``language`` form
            field; also the default for the fallback result's ``language``.

    Returns:
        The diarization endpoint's JSON body, or, on fallback, a dict with the
        keys ``segments``, ``speakers_count``, ``audio_duration``,
        ``language``, ``full_text``, ``diarization_available`` and
        ``diarization_method``.

    Raises:
        Exception: If the server cannot be reached (wraps ``httpx.ConnectError``,
            which is kept as ``__cause__``).
        httpx.HTTPStatusError: If the diarization endpoint returns an error
            status outside 404/422/500, or the fallback endpoint returns any
            error status.
        OSError: If the audio file cannot be read.
    """
    logger.info("Sending audio to Whisper server: %s", audio_path)
    try:
        # NOTE: synchronous read inside a coroutine; the whole file is held in
        # memory because it may be uploaded twice (primary + fallback).
        with open(audio_path, "rb") as f:
            content = f.read()

        filename = os.path.basename(audio_path)
        ext = os.path.splitext(filename)[1].lower()
        mime_type = MIME_TYPES.get(ext, "audio/webm")
        base_url = settings.WHISPER_SERVER_URL
        form = {"language": language}

        # The client-level timeout applies to every request made through it.
        async with httpx.AsyncClient(timeout=600.0) as http:
            # 1) Try /api/v2/transcribe with diarization (field: audio)
            try:
                resp = await http.post(
                    f"{base_url}/api/v2/transcribe",
                    files={"audio": (filename, content, mime_type)},
                    data=form,
                )
                if resp.status_code not in _FALLBACK_STATUS_CODES:
                    resp.raise_for_status()
                    result = resp.json()
                    logger.info(
                        "Diarized transcription completed for %s "
                        "(%s speakers, method: %s)",
                        audio_path,
                        result.get("speakers_count", "?"),
                        result.get("diarization_method", "?"),
                    )
                    return result
                logger.warning(
                    "/api/v2/transcribe returned %s, falling back...",
                    resp.status_code,
                )
            except httpx.HTTPStatusError:
                # Only reachable for statuses outside the fallback set (e.g.
                # 401, 503): those are real failures, not a missing endpoint.
                raise
            except Exception as e:
                # Deliberate best-effort: transport errors or an unparsable
                # body on the primary endpoint still get a fallback attempt.
                logger.warning("/api/v2/transcribe exception: %s, falling back...", e)

            # 2) Fallback: /transcribe-segments (no speaker diarization)
            resp = await http.post(
                f"{base_url}/transcribe-segments",
                files={"file": (filename, content, mime_type)},
                data=form,
            )
            resp.raise_for_status()
            result = resp.json()
            logger.info(
                "Transcription via fallback /transcribe-segments completed for %s",
                audio_path,
            )
            return _build_fallback_result(result, language)
    except httpx.ConnectError as e:
        logger.error(
            "Cannot connect to Whisper server at %s: %s",
            settings.WHISPER_SERVER_URL,
            e,
        )
        # Chain the original error so the root cause stays in the traceback.
        raise Exception(f"Whisper server unavailable: {e}") from e
    except Exception as e:
        logger.error("Transcription error: %s", e)
        raise