OpenAI Whisper API speech-to-text backend (cloud).
Classes
OpenAIWhisperBackend
OpenAIWhisperBackend(api_key: Optional[str] = None)
Bases: SpeechBackend
Cloud speech-to-text using OpenAI Whisper API.
Source code in src/openjarvis/speech/openai_whisper.py
```python
def __init__(self, api_key: Optional[str] = None) -> None:
    self._api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
    self._client: Optional[OpenAI] = None
    if self._api_key and OpenAI is not None:
        self._client = OpenAI(api_key=self._api_key)
```
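A minimal construction sketch, assuming the class is importable from openjarvis.speech.openai_whisper (matching the source path above); the key values are placeholders:

```python
import os

from openjarvis.speech.openai_whisper import OpenAIWhisperBackend

# Pass the key explicitly; it takes precedence over the environment.
backend = OpenAIWhisperBackend(api_key="sk-...")  # placeholder key

# Or rely on the OPENAI_API_KEY environment variable. If neither is set
# (or the openai package is unavailable), the internal client stays None
# and transcribe() raises RuntimeError.
os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key
backend = OpenAIWhisperBackend()
```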
Functions
transcribe
transcribe(audio: bytes, *, format: str = 'wav', language: Optional[str] = None) -> TranscriptionResult
Transcribe audio using OpenAI's Whisper API.
Source code in src/openjarvis/speech/openai_whisper.py
```python
def transcribe(
    self,
    audio: bytes,
    *,
    format: str = "wav",
    language: Optional[str] = None,
) -> TranscriptionResult:
    """Transcribe audio using OpenAI's Whisper API."""
    if self._client is None:
        raise RuntimeError("OpenAI client not initialized (missing API key?)")
    ext = format if not format.startswith(".") else format[1:]
    audio_file = io.BytesIO(audio)
    audio_file.name = f"audio.{ext}"
    kwargs: dict = {"model": "whisper-1", "file": audio_file}
    if language:
        kwargs["language"] = language
    kwargs["response_format"] = "verbose_json"
    response = self._client.audio.transcriptions.create(**kwargs)
    return TranscriptionResult(
        text=getattr(response, "text", str(response)),
        language=getattr(response, "language", None),
        confidence=None,
        duration_seconds=getattr(response, "duration", 0.0),
        segments=[],
    )
```
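A sketch of a typical call, assuming the same import path and a local sample.wav; the result fields follow the TranscriptionResult constructor call in the source above:

```python
from pathlib import Path

from openjarvis.speech.openai_whisper import OpenAIWhisperBackend

backend = OpenAIWhisperBackend()  # key taken from OPENAI_API_KEY

# Raw file bytes suffice: the backend wraps them in a named BytesIO so
# the API can infer the container format from the file extension.
audio_bytes = Path("sample.wav").read_bytes()

result = backend.transcribe(audio_bytes, format="wav", language="en")
print(result.text)
print(result.language, result.duration_seconds)
```

Because the request uses response_format="verbose_json", the API returns the detected language and duration alongside the text; confidence and segments are not populated by this backend.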