Back to Projects
nlp
active

Local Audio Transcription Pipeline — Whisper + CUDA

Local pipeline using openai-whisper with CUDA (RTX 3000) for legal-context audio transcription. Includes faster-whisper alternative for speed.

Python · Whisper · CUDA · NLP

Architecture

Audio Input
Preprocessing
Model Inference
Post-process

Code Snippet

import whisper
import os
from pathlib import Path
from datetime import datetime

# Set FFmpeg path — update to match your local installation
FFMPEG_BIN = r"C:\ffmpeg\bin"
# Prepend FFmpeg to PATH so whisper's audio loader can invoke the ffmpeg binary.
os.environ["PATH"] = FFMPEG_BIN + os.pathsep + os.environ.get("PATH", "")

# Path to the audio file to transcribe (any format ffmpeg can decode).
AUDIO_PATH = r"path\to\your\audio_file.m4a"

MODEL_SIZE = "small"   # tiny | base | small | medium | large
OUTPUT_FOLDER = "Transcriptions"  # created next to the source audio file
LANGUAGE = None        # None = auto-detect, or set e.g. "en", "es"

def transcribe_audio():
    """Transcribe AUDIO_PATH with Whisper and save the text beside it.

    Loads the configured Whisper model, runs transcription (auto-detecting
    the language unless LANGUAGE is set), and writes the result with a
    small metadata header into OUTPUT_FOLDER next to the source file.

    Returns:
        pathlib.Path of the written transcription file, or None if the
        audio file does not exist.
    """
    # whisper itself depends on torch, so this import adds no new dependency.
    import torch

    audio_file = Path(AUDIO_PATH)
    if not audio_file.exists():
        print(f"File not found: {audio_file}")
        return None

    print(f"Model: whisper-{MODEL_SIZE}")
    model = whisper.load_model(MODEL_SIZE)

    # Use fp16 only when a CUDA device is present: fp16 halves GPU memory
    # use and speeds inference, but is unsupported (and warns) on CPU.
    use_fp16 = torch.cuda.is_available()
    result = model.transcribe(
        str(audio_file),
        language=LANGUAGE,
        fp16=use_fp16,
    )
    text = result["text"].strip()

    output_dir = audio_file.parent / OUTPUT_FOLDER
    # parents=True keeps this robust even for nested OUTPUT_FOLDER values.
    output_dir.mkdir(parents=True, exist_ok=True)
    # Sanitize the stem so the output filename is filesystem-friendly.
    stem = audio_file.stem.replace(" ", "_").replace(".", "_")
    out_path = output_dir / f"transcription_{stem}.txt"

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"Source file : {audio_file.name}\n")
        f.write(f"Model       : whisper-{MODEL_SIZE}\n")
        f.write(f"Language    : {'auto-detect' if LANGUAGE is None else LANGUAGE}\n")
        f.write("=" * 60 + "\n\n")
        f.write(text + "\n")

    print(f"Transcription saved to: {out_path}")
    return out_path

# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    transcribe_audio()
Detailed write-up, screenshots, and metrics coming in Phase 4.