92 lines
3.7 KiB
Python
92 lines
3.7 KiB
Python
import httpx
|
|
import logging
|
|
import os
|
|
from app.config import settings
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Lower-cased file extension (including the leading dot) -> Content-Type
# used for the multipart upload of the audio file.
MIME_TYPES = {
    ".webm": "audio/webm",
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/mp4",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
}
|
|
|
|
# Status codes from /api/v2/transcribe that trigger the non-diarized fallback
# instead of being treated as a hard failure.
_FALLBACK_STATUSES = (404, 422, 500)

# Timeout in seconds, used both as the client default and per request.
_REQUEST_TIMEOUT = 600.0


def _normalize_plain_result(result: dict, language: str) -> dict:
    """Convert a /transcribe-segments response into the fallback return dict.

    Every segment is attributed to the single speaker ``SPEAKER_01``.
    Missing or ``null`` fields fall back to neutral defaults instead of
    raising, so a sparse server response still yields a usable result.
    """
    # `or []` also covers an explicit `"segments": null` in the response.
    segments = result.get("segments") or []
    normalized = [
        {
            "start": seg.get("start", 0.0),
            "end": seg.get("end", 0.0),
            "speaker": "SPEAKER_01",
            # `or ""` guards against a null text before calling .strip().
            "text": (seg.get("text") or "").strip(),
        }
        for seg in segments
    ]
    return {
        "segments": normalized,
        "speakers_count": 1,
        # Read from the normalized list so a last segment without "end"
        # yields 0.0 rather than a KeyError.
        "audio_duration": normalized[-1]["end"] if normalized else 0.0,
        "language": result.get("language", language),
        "full_text": result.get("text", ""),
        "diarization_available": False,
        "diarization_method": "none",
    }


async def transcribe_audio(audio_path: str, *, language: str = "pt") -> dict:
    """
    Send audio to Whisper Diarization API.

    1) Tries POST /api/v2/transcribe (diarization, form field: audio) and
       returns the server's JSON response unchanged on success.
    2) Falls back to POST /transcribe-segments (no diarization, form field:
       file) when the first endpoint answers 404, 422 or 500, or when the
       first attempt fails with any error other than an HTTP status error.
       All segments are then assigned to SPEAKER_01.

    Args:
        audio_path: Path of the audio file to upload. Its extension selects
            the Content-Type via MIME_TYPES (default "audio/webm").
        language: Value sent as the "language" form field to both endpoints
            and used as the default "language" of the fallback result.

    Returns:
        The JSON body of /api/v2/transcribe, or, on fallback, a dict with the
        keys "segments", "speakers_count", "audio_duration", "language",
        "full_text", "diarization_available" and "diarization_method".

    Raises:
        Exception: "Whisper server unavailable: ..." when the connection to
            the server fails (chained from httpx.ConnectError).
        Any other exception (e.g. OSError while reading the file,
        httpx.HTTPStatusError for non-fallback statuses) is logged and
        re-raised unchanged.
    """
    logger.info(f"Sending audio to Whisper server: {audio_path}")
    try:
        with open(audio_path, "rb") as f:
            content = f.read()
        filename = os.path.basename(audio_path)
        ext = os.path.splitext(filename)[1].lower()
        mime_type = MIME_TYPES.get(ext, "audio/webm")
        form = {"language": language}

        async with httpx.AsyncClient(timeout=_REQUEST_TIMEOUT) as http:
            # 1) Try /api/v2/transcribe with diarization (field: audio)
            try:
                resp = await http.post(
                    f"{settings.WHISPER_SERVER_URL}/api/v2/transcribe",
                    files={"audio": (filename, content, mime_type)},
                    data=form,
                    timeout=_REQUEST_TIMEOUT,
                )
                if resp.status_code not in _FALLBACK_STATUSES:
                    resp.raise_for_status()
                    result = resp.json()
                    logger.info(f"Diarized transcription completed for {audio_path} "
                                f"({result.get('speakers_count', '?')} speakers, "
                                f"method: {result.get('diarization_method', '?')})")
                    return result
                logger.warning(f"/api/v2/transcribe returned {resp.status_code}, falling back...")
            except httpx.HTTPStatusError:
                # raise_for_status() is only reached for statuses outside
                # _FALLBACK_STATUSES, so this is always a genuine failure and
                # must not be swallowed by the broad handler below.
                raise
            except Exception as e:
                # Best effort: transport or decoding problems on the
                # diarization endpoint still get a chance on the fallback.
                logger.warning(f"/api/v2/transcribe exception: {e}, falling back...")

            # 2) Fallback: /transcribe-segments (no speaker diarization)
            resp = await http.post(
                f"{settings.WHISPER_SERVER_URL}/transcribe-segments",
                files={"file": (filename, content, mime_type)},
                data=form,
                timeout=_REQUEST_TIMEOUT,
            )
            resp.raise_for_status()
            result = resp.json()
            logger.info(f"Transcription via fallback /transcribe-segments completed for {audio_path}")

        return _normalize_plain_result(result, language)

    except httpx.ConnectError as e:
        logger.error(f"Cannot connect to Whisper server at {settings.WHISPER_SERVER_URL}: {e}")
        # Keep the plain Exception type callers already handle, but chain the
        # original error so the traceback shows the transport failure.
        raise Exception(f"Whisper server unavailable: {str(e)}") from e
    except Exception as e:
        logger.error(f"Transcription error: {e}")
        raise
|