|
import os |
|
import uuid |
|
import time |
|
import json |
|
import requests |
|
import soundfile as sf |
|
import gradio as gr |
|
from moviepy import VideoFileClip |
|
|
|
# Default ElevenLabs API key, read once at import time from the environment.
# Falls back to an empty string when the variable is not set.
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
|
|
|
def extract_audio(video_path, output_format="mp3"):
    """Extract the audio track of a video into a new audio file.

    The output is written to the current working directory under a randomly
    suffixed name: ``extracted_audio_<8 hex chars>.<output_format>``.

    Args:
        video_path: Path of the source video. A falsy value (``None`` or an
            empty string) is reported as "No video provided".
        output_format: File extension used for the output file name.
            Defaults to ``"mp3"``.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is ``None``
        when nothing was extracted, in which case ``status_message`` says why.
        No exception is propagated to the caller; failures are reported
        through the status message.
    """
    if not video_path:
        return None, "No video provided"

    output_path = f"extracted_audio_{uuid.uuid4().hex[:8]}.{output_format}"

    try:
        video = VideoFileClip(video_path)
        try:
            if video.audio is None:
                # Report a readable reason instead of an AttributeError on None.
                return None, "Error extracting audio: video has no audio track"
            video.audio.write_audiofile(output_path, logger=None)
        finally:
            # Always release the clip's resources, even when writing fails.
            video.close()
    except Exception as e:
        # Best-effort removal of a partially written output file so failed
        # runs do not accumulate broken files in the working directory.
        try:
            if os.path.exists(output_path):
                os.remove(output_path)
        except OSError:
            pass
        return None, f"Error extracting audio: {e}"

    return output_path, "Audio extracted successfully"
|
|
|
def _measure_audio_duration(audio_path):
    """Return the duration of *audio_path* in seconds, or 0 if it is unknown."""
    try:
        audio_data, sample_rate = sf.read(audio_path)
        return len(audio_data) / sample_rate
    except Exception:
        # soundfile could not handle the file; fall through to librosa.
        pass

    try:
        import librosa

        try:
            return librosa.get_duration(path=audio_path)
        except TypeError:
            # Older librosa releases only know the ``filename`` keyword.
            return librosa.get_duration(filename=audio_path)
    except Exception:
        # librosa is optional; a missing module or a decode failure both
        # simply mean the duration is unknown.
        return 0


def transcribe_with_scribe(audio_path, api_key, model_id="scribe_v1", timeout=(10, 600)):
    """Send an audio file to the ElevenLabs speech-to-text endpoint.

    Args:
        audio_path: Path of the audio file to upload.
        api_key: ElevenLabs API key, sent in the ``xi-api-key`` header.
        model_id: Value of the ``model_id`` form field. Defaults to
            ``"scribe_v1"``.
        timeout: Timeout passed to ``requests.post``, either a number of
            seconds or a ``(connect, read)`` tuple. Without one, a stalled
            connection would block the caller indefinitely.

    Returns:
        On failure, a dict with the single key ``"error"``. On success, a
        dict with the keys ``service``, ``text``, ``processing_time``
        (seconds elapsed since this function was entered), ``file_size_mb``,
        ``audio_duration`` (seconds, 0 when unknown), ``real_time_factor``,
        ``processing_speed`` (both ``None`` when the duration is unknown)
        and ``raw_response`` (the decoded JSON body).
    """
    start_time = time.perf_counter()

    if not api_key:
        return {"error": "Please provide an API key"}
    if not audio_path:
        return {"error": "No audio file provided"}

    url = "https://api.elevenlabs.io/v1/speech-to-text"
    headers = {"xi-api-key": api_key}

    # Open the file separately so a local I/O problem is reported as such
    # rather than escaping as an uncaught exception.
    try:
        audio_file = open(audio_path, "rb")
    except OSError as e:
        return {"error": f"Could not read audio file: {e}"}

    with audio_file:
        try:
            response = requests.post(
                url,
                headers=headers,
                files={"file": audio_file},
                data={"model_id": model_id},
                timeout=timeout,
            )
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.RequestException as e:
            return {"error": f"API request failed: {e}"}
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass, so this also
            # covers decoders that raise a plain ValueError.
            return {"error": "Failed to parse API response"}

    if not isinstance(result, dict):
        # A JSON body that is not an object cannot carry a "text" field.
        return {"error": "Failed to parse API response"}

    processing_time = time.perf_counter() - start_time
    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    audio_duration = _measure_audio_duration(audio_path)
    has_duration = audio_duration > 0

    return {
        "service": "Scribe",
        "text": result.get("text", ""),
        "processing_time": processing_time,
        "file_size_mb": file_size,
        "audio_duration": audio_duration,
        "real_time_factor": processing_time / audio_duration if has_duration else None,
        "processing_speed": (
            audio_duration / processing_time
            if has_duration and processing_time > 0
            else None
        ),
        "raw_response": result,
    }
|
|
|
def save_transcription(transcription):
    """Write the text of a transcription result to a new UTF-8 text file.

    Args:
        transcription: Result mapping. If it contains an ``"error"`` key, that
            error is passed through and nothing is written. Otherwise its
            ``"text"`` entry is written (``"No text found"`` when absent).

    Returns:
        A ``(file_path, status_message)`` tuple. ``file_path`` is ``None``
        when the input carried an error or the file could not be written.
        On success it is ``transcription_<8 hex chars>.txt`` in the current
        working directory.
    """
    if "error" in transcription:
        return None, transcription["error"]

    suffix = uuid.uuid4().hex[:8]
    target = f"transcription_{suffix}.txt"

    try:
        with open(target, "w", encoding="utf-8") as handle:
            content = transcription.get("text", "No text found")
            handle.write(content)
    except Exception as exc:
        return None, f"Error saving transcription: {exc}"
    else:
        return target, "Transcription saved as text file"
|
|
|
def process_video_file(video_input, output_format, api_key, model_id):
    """Run the full pipeline: extract audio, transcribe it, save the text.

    Args:
        video_input: Path of the uploaded video, passed to ``extract_audio``.
        output_format: Audio file extension, passed to ``extract_audio``.
        api_key: ElevenLabs API key, passed to ``transcribe_with_scribe``.
        model_id: Transcription model id, passed to ``transcribe_with_scribe``.

    Returns:
        A 4-tuple ``(audio_path, audio_status, transcript_path,
        transcript_status)``. When audio extraction fails, both paths are
        ``None`` and both status slots carry the extraction message. When
        only the transcription fails, ``transcript_path`` is ``None`` and
        ``transcript_status`` carries the error.
    """
    audio_output, audio_status = extract_audio(video_input, output_format)
    if not audio_output:
        return None, audio_status, None, audio_status

    transcription = transcribe_with_scribe(audio_output, api_key, model_id)
    transcript_file, transcript_status = save_transcription(transcription)

    # The extracted audio file is intentionally left on disk: its path is
    # returned to the caller, which in this file is a
    # ``gr.Audio(type="filepath")`` output. Deleting the file here would
    # hand the caller a path that no longer exists.
    return audio_output, audio_status, transcript_file, transcript_status
|
|
|
def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching it.

    The layout has a top row with the API-key box and the model dropdown,
    followed by a single "Upload Video" tab. The tab's left column takes the
    video, the output format and the trigger button; its right column shows
    the extracted audio, the transcript file and the two status boxes. The
    button is wired to ``process_video_file``.

    Returns:
        The configured ``gr.Blocks`` application.
    """
    with gr.Blocks(title="Video to Audio to Transcription") as demo:
        gr.Markdown("# Video => Audio => Transcription")

        # Credentials and model selection.
        with gr.Row():
            # NOTE(review): the key read from the environment is used as the
            # textbox's initial value, so it is presumably delivered to every
            # client that opens the page -- confirm this is acceptable.
            key_box = gr.Textbox(
                label="ElevenLabs API Key",
                placeholder="Enter your ElevenLabs API key",
                type="password",
                value=ELEVENLABS_API_KEY,
            )
            model_picker = gr.Dropdown(
                label="Transcription Model",
                choices=["scribe_v1"],
                value="scribe_v1",
            )

        with gr.Tabs():
            with gr.TabItem("Upload Video"):
                with gr.Row():
                    # Left column: inputs and the trigger button.
                    with gr.Column():
                        video_in = gr.Video(label="Upload Video")
                        format_radio = gr.Radio(
                            choices=["mp3"],
                            value="mp3",
                            label="Output Format",
                        )
                        run_button = gr.Button("Extract Audio & Transcribe")

                    # Right column: results.
                    with gr.Column():
                        audio_out = gr.Audio(label="Extracted Audio", type="filepath")
                        audio_status_box = gr.Textbox(label="Audio Extraction Status")
                        transcript_out = gr.File(label="Transcription Text File")
                        transcript_status_box = gr.Textbox(label="Transcription Status")

                # The order of both lists must match the parameter order and
                # the return tuple of process_video_file.
                handler_inputs = [video_in, format_radio, key_box, model_picker]
                handler_outputs = [
                    audio_out,
                    audio_status_box,
                    transcript_out,
                    transcript_status_box,
                ]
                run_button.click(
                    fn=process_video_file,
                    inputs=handler_inputs,
                    outputs=handler_outputs,
                )

    return demo
|
|
|
def main():
    """Build the interface and launch it with the default launch options."""
    create_interface().launch()


# Only start the app when this file is executed as a script.
if __name__ == "__main__":
    main()