"""Gradio app: extract the audio track from a video and transcribe it.

Pipeline: uploaded video -> audio file (via moviepy) -> ElevenLabs Scribe
speech-to-text request -> plain-text transcript file.
"""

import json
import os
import time
import uuid

import gradio as gr
import requests
import soundfile as sf
from moviepy import VideoFileClip

# Read once at import time. Used only server-side as a fallback when the UI
# field is left empty; it is never sent to the browser.
ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY', '')


def extract_audio(video_path, output_format="mp3"):
    """Write the audio track of ``video_path`` to a new file.

    Args:
        video_path: Path of the source video. Falsy values are rejected.
        output_format: File extension used for the output file name.

    Returns:
        ``(output_path, status_message)`` on success, or
        ``(None, error_message)`` on failure. The output is created in the
        current working directory under a random ``extracted_audio_*`` name.
    """
    if not video_path:
        return None, "No video provided"

    output_path = f"extracted_audio_{uuid.uuid4().hex[:8]}.{output_format}"
    try:
        video = VideoFileClip(video_path)
        try:
            if video.audio is None:
                # Give a readable message instead of an AttributeError text.
                return None, "Error extracting audio: video has no audio track"
            video.audio.write_audiofile(output_path, logger=None)
        finally:
            # Always release the clip, even when writing the audio fails.
            video.close()
    except Exception as e:
        # UI boundary: report any failure as a status string.
        return None, f"Error extracting audio: {e}"
    return output_path, "Audio extracted successfully"


def _audio_duration_seconds(audio_path):
    """Return the duration of ``audio_path`` in seconds, or 0 if unknown.

    Tries ``soundfile`` first and falls back to ``librosa`` (imported lazily
    so it stays an optional dependency). This is best-effort: the value only
    feeds the timing metrics, so failures yield 0 instead of raising.
    """
    try:
        audio_data, sample_rate = sf.read(audio_path)
        return len(audio_data) / sample_rate
    except Exception:
        pass  # Fall through to the librosa fallback below.

    try:
        import librosa

        try:
            return librosa.get_duration(path=audio_path)
        except TypeError:
            # Older librosa releases only accept the ``filename`` keyword.
            return librosa.get_duration(filename=audio_path)
    except Exception:
        return 0


def transcribe_with_scribe(audio_path, api_key, model_id="scribe_v1"):
    """Send an audio file to the ElevenLabs speech-to-text endpoint.

    Args:
        audio_path: Path of the audio file to upload.
        api_key: ElevenLabs API key, sent in the ``xi-api-key`` header.
        model_id: Value of the ``model_id`` form field.

    Returns:
        On failure, ``{"error": message}``. On success, a dict with the keys
        ``service``, ``text``, ``processing_time`` (seconds), ``file_size_mb``,
        ``audio_duration`` (seconds, 0 if it could not be determined),
        ``real_time_factor``, ``processing_speed`` (both ``None`` when they
        cannot be computed) and ``raw_response`` (the decoded JSON body).
    """
    start_time = time.time()
    if not api_key:
        return {"error": "Please provide an API key"}

    url = "https://api.elevenlabs.io/v1/speech-to-text"
    headers = {"xi-api-key": api_key}

    try:
        with open(audio_path, "rb") as f:
            # A ``(None, value)`` tuple makes requests send a plain form
            # field rather than a file part.
            files = {"file": f, "model_id": (None, model_id)}
            response = requests.post(url, headers=headers, files=files)
        response.raise_for_status()
        result = response.json()
    # RequestException derives from OSError, so it must be handled first.
    except requests.exceptions.RequestException as e:
        return {"error": f"API request failed: {str(e)}"}
    except json.JSONDecodeError:
        return {"error": "Failed to parse API response"}
    except OSError as e:
        return {"error": f"Could not read audio file: {e}"}

    processing_time = time.time() - start_time
    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    audio_duration = _audio_duration_seconds(audio_path)

    # Both ratios need a known duration; the speed also needs a non-zero
    # elapsed time to avoid a ZeroDivisionError.
    has_duration = audio_duration > 0
    real_time_factor = processing_time / audio_duration if has_duration else None
    processing_speed = (
        audio_duration / processing_time
        if has_duration and processing_time > 0
        else None
    )

    return {
        "service": "Scribe",
        "text": result.get('text', ''),
        "processing_time": processing_time,
        "file_size_mb": file_size,
        "audio_duration": audio_duration,
        "real_time_factor": real_time_factor,
        "processing_speed": processing_speed,
        "raw_response": result,
    }


def save_transcription(transcription):
    """Write the transcript text to a new ``.txt`` file.

    Args:
        transcription: Result dict from ``transcribe_with_scribe``.

    Returns:
        ``(filename, status_message)`` on success. ``(None, message)`` when
        ``transcription`` carries an ``"error"`` key or the write fails.
    """
    if "error" in transcription:
        return None, transcription["error"]

    transcript_filename = f"transcription_{uuid.uuid4().hex[:8]}.txt"
    try:
        with open(transcript_filename, "w", encoding="utf-8") as f:
            f.write(transcription.get('text', 'No text found'))
    except Exception as e:
        return None, f"Error saving transcription: {str(e)}"
    return transcript_filename, "Transcription saved as text file"


def process_video_file(video_input, output_format, api_key, model_id):
    """Run the full pipeline for one uploaded video.

    Args:
        video_input: Path of the uploaded video.
        output_format: Audio file extension passed to ``extract_audio``.
        api_key: Key typed into the UI. When blank, the
            ``ELEVENLABS_API_KEY`` environment value is used instead.
        model_id: Transcription model identifier.

    Returns:
        ``(audio_path, audio_status, transcript_path, transcript_status)``,
        matching the four output components wired up in
        ``create_interface``.
    """
    audio_output, audio_status = extract_audio(video_input, output_format)
    if not audio_output:
        return None, audio_status, None, audio_status

    effective_key = (api_key or "").strip() or ELEVENLABS_API_KEY
    transcription = transcribe_with_scribe(audio_output, effective_key, model_id)
    transcript_file, transcript_status = save_transcription(transcription)

    # The audio file must still exist when this function returns: its path is
    # handed to the gr.Audio output, which reads it afterwards. Deleting it
    # here would leave the component pointing at a missing file.
    return audio_output, audio_status, transcript_file, transcript_status


def create_interface():
    """Build and return the Gradio Blocks application."""
    # The environment key is deliberately not used as the Textbox value:
    # component values are sent to every browser that loads the page, and the
    # "password" type only masks how the value is displayed.
    if ELEVENLABS_API_KEY:
        key_placeholder = "Leave blank to use the server-configured key"
    else:
        key_placeholder = "Enter your ElevenLabs API key"

    with gr.Blocks(title="Video to Audio to Transcription") as app:
        gr.Markdown("# Video => Audio => Transcription")

        with gr.Row():
            api_key = gr.Textbox(
                placeholder=key_placeholder,
                label="ElevenLabs API Key",
                type="password",
            )
            model_id = gr.Dropdown(
                choices=["scribe_v1"],
                value="scribe_v1",
                label="Transcription Model",
            )

        with gr.Tabs():
            with gr.TabItem("Upload Video"):
                with gr.Row():
                    with gr.Column():
                        video_input = gr.Video(label="Upload Video")
                        format_choice_file = gr.Radio(
                            ["mp3"],
                            value="mp3",
                            label="Output Format",
                        )
                        extract_button_file = gr.Button(
                            "Extract Audio & Transcribe"
                        )
                    with gr.Column():
                        audio_output_file = gr.Audio(
                            label="Extracted Audio", type="filepath"
                        )
                        status_output_file = gr.Textbox(
                            label="Audio Extraction Status"
                        )
                        transcript_file_output = gr.File(
                            label="Transcription Text File"
                        )
                        transcript_status_output = gr.Textbox(
                            label="Transcription Status"
                        )

                # Output order must match process_video_file's return tuple.
                extract_button_file.click(
                    fn=process_video_file,
                    inputs=[video_input, format_choice_file, api_key, model_id],
                    outputs=[
                        audio_output_file,
                        status_output_file,
                        transcript_file_output,
                        transcript_status_output,
                    ],
                )
    return app


def main():
    """Create the interface and start the Gradio server."""
    app = create_interface()
    app.launch()


if __name__ == "__main__":
    main()