Skip to content

openai_whisper

openai_whisper

OpenAI Whisper API speech-to-text backend (cloud).

Classes

OpenAIWhisperBackend

OpenAIWhisperBackend(api_key: Optional[str] = None)

Bases: SpeechBackend

Cloud speech-to-text using OpenAI Whisper API.

Source code in src/openjarvis/speech/openai_whisper.py
def __init__(self, api_key: Optional[str] = None) -> None:
    """Set up the backend, building an OpenAI client when credentials exist.

    Falls back to the ``OPENAI_API_KEY`` environment variable when no key
    is passed; leaves ``self._client`` as ``None`` if either the key or the
    ``openai`` package (``OpenAI`` import) is unavailable.
    """
    resolved_key = api_key if api_key else os.environ.get("OPENAI_API_KEY", "")
    self._api_key = resolved_key
    self._client: Optional[OpenAI] = None
    # Only construct a client when both the SDK import succeeded and a
    # non-empty key was resolved.
    if OpenAI is not None and self._api_key:
        self._client = OpenAI(api_key=self._api_key)
Functions
transcribe
transcribe(audio: bytes, *, format: str = 'wav', language: Optional[str] = None) -> TranscriptionResult

Transcribe audio using OpenAI's Whisper API.

Source code in src/openjarvis/speech/openai_whisper.py
def transcribe(
    self,
    audio: bytes,
    *,
    format: str = "wav",
    language: Optional[str] = None,
) -> TranscriptionResult:
    """Transcribe audio using OpenAI's Whisper API."""
    # Transcription is impossible without a client; __init__ may have
    # skipped creating one (missing key or missing SDK).
    if self._client is None:
        raise RuntimeError("OpenAI client not initialized (missing API key?)")

    # Drop a single leading dot so ".wav" and "wav" are treated alike,
    # then name the in-memory buffer so the API can infer the container.
    extension = format[1:] if format.startswith(".") else format
    buffer = io.BytesIO(audio)
    buffer.name = f"audio.{extension}"

    request: dict = {
        "model": "whisper-1",
        "file": buffer,
        # verbose_json includes language/duration metadata in the response.
        "response_format": "verbose_json",
    }
    if language:
        request["language"] = language

    response = self._client.audio.transcriptions.create(**request)

    # Use getattr with fallbacks: the response shape can vary by SDK
    # version, so missing fields degrade gracefully instead of raising.
    return TranscriptionResult(
        text=getattr(response, "text", str(response)),
        language=getattr(response, "language", None),
        confidence=None,
        duration_seconds=getattr(response, "duration", 0.0),
        segments=[],
    )