-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
97 lines (80 loc) · 2.98 KB
/
app.py
File metadata and controls
97 lines (80 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Standard library
import os
import tempfile

# Third-party
import gradio as gr
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
import whisper
# --- Configuration -----------------------------------------------------------
# Prefer the GPU when CUDA is visible; otherwise fall back to CPU inference.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Initializing application. Device: {DEVICE}")

# Load both Whisper variants once at import time so every request reuses the
# same in-memory models instead of paying the load cost per inference.
print("Loading speech recognition models into device memory...")
model_base = whisper.load_model("base", device=DEVICE)
model_medium = whisper.load_model("medium", device=DEVICE)
print("Models loaded successfully.")
def generate_spectrogram(audio_path):
    """
    Render a log-frequency spectrogram of an audio file and save it as a PNG.

    Parameters
    ----------
    audio_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    str
        Filesystem path of the saved spectrogram image.
    """
    # Resample to 16 kHz — the rate the Whisper models expect downstream.
    y, sr = librosa.load(audio_path, sr=16000)
    # STFT magnitude converted to decibels relative to the peak value.
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)

    fig = plt.figure(figsize=(10, 4))
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Audio Spectrogram')
    plt.tight_layout()

    # BUGFIX: the original wrote to a fixed "temp_spec.png", so two
    # concurrent Gradio requests would clobber each other's image. Use a
    # unique temp file per call instead.
    fd, spec_path = tempfile.mkstemp(suffix=".png", prefix="spec_")
    os.close(fd)  # matplotlib opens the path itself; keep no dangling fd
    plt.savefig(spec_path)
    # Close this exact figure (not just "current") so long-running servers
    # don't accumulate open matplotlib figures.
    plt.close(fig)
    return spec_path
def transcribe_with_base_model(audio_path):
    """Transcribe *audio_path* with the globally loaded base Whisper model."""
    # Language is pinned to "tr" (Turkish) so Whisper skips auto-detection.
    return model_base.transcribe(audio_path, language="tr")["text"].strip()
def transcribe_with_medium_model(audio_path):
    """Transcribe *audio_path* with the globally loaded medium Whisper model."""
    # Language is pinned to "tr" (Turkish) so Whisper skips auto-detection.
    return model_medium.transcribe(audio_path, language="tr")["text"].strip()
def process_audio_pipeline(audio_path):
    """
    Run the full analysis for one audio input.

    Produces the spectrogram image path plus transcriptions from both the
    base and medium Whisper models. When no audio is supplied, returns a
    placeholder triple instead of running the models.
    """
    # Guard clause: Gradio hands us None when the user submits no audio.
    if audio_path is None:
        return None, "No audio provided", "No audio provided"

    spectrogram = generate_spectrogram(audio_path)
    base_text = transcribe_with_base_model(audio_path)
    medium_text = transcribe_with_medium_model(audio_path)

    # Flag a likely hallucination: only the smaller model "heard" speech.
    if base_text and not medium_text:
        base_text += "\n\n(Note: Base model detected speech but medium model did not. Possible hallucination.)"
    return spectrogram, base_text, medium_text
# Interface configuration
demo = gr.Interface(
fn=process_audio_pipeline,
inputs=gr.Audio(
sources=["microphone", "upload"],
type="filepath",
label="Upload or record audio"
),
outputs=[
gr.Image(label="Spectrogram"),
gr.Textbox(label="Base Model Transcription"),
gr.Textbox(label="Medium Model Transcription")
],
title="Audio Analysis Tool",
description="""
Exploratory tool for analyzing automatic speech recognition (ASR) model behavior on audio inputs.
Upload or record audio to visualize its spectrogram and compare transcriptions from two Whisper model variants.
""",
theme="soft"
)
# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()