MrSimple01 committed on
Commit
af4d14d
·
verified ·
1 Parent(s): bfe7fc0

new version

Browse files
Files changed (1) hide show
  1. app.py +103 -118
app.py CHANGED
@@ -1,15 +1,16 @@
1
  import os
2
- import gradio as gr
3
- import requests
4
- import json
5
- from moviepy import VideoFileClip
6
  import uuid
7
  import time
 
 
8
  import soundfile as sf
 
 
9
 
10
- ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY", None)
11
 
12
  def extract_audio(video_path, output_format="mp3"):
 
13
  if not video_path:
14
  return None, "No video provided"
15
 
@@ -23,55 +24,7 @@ def extract_audio(video_path, output_format="mp3"):
23
  except Exception as e:
24
  return None, f"Error extracting audio: {str(e)}"
25
 
26
- def save_transcription(transcription):
27
- if "error" in transcription:
28
- return None, transcription["error"]
29
- transcript_filename = f"transcription_{uuid.uuid4().hex[:8]}.txt"
30
-
31
- try:
32
- with open(transcript_filename, "w", encoding="utf-8") as f:
33
- f.write(transcription.get('text', 'No text found'))
34
-
35
- return transcript_filename, "Transcription saved as text file"
36
- except Exception as e:
37
- return None, f"Error saving transcription: {str(e)}"
38
-
39
- def process_video_file(video_file, output_format, api_key, model_id):
40
- if video_file is None:
41
- return None, "Please upload a video file", None, "No video provided"
42
-
43
- audio_path, message = extract_audio(video_file, output_format)
44
-
45
- if audio_path and os.path.exists(audio_path):
46
- transcription = transcribe_audio(audio_path, api_key, model_id)
47
- transcript_file, transcript_message = save_transcription(transcription)
48
- return audio_path, message, transcript_file, transcript_message
49
- else:
50
- return None, message, None, "Audio extraction failed, cannot transcribe"
51
-
52
- def process_video_url(video_url, output_format, api_key, model_id):
53
- if not video_url.strip():
54
- return None, "Please enter a video URL", None, "No URL provided"
55
-
56
- video_path, error = download_video_from_url(video_url)
57
- if error:
58
- return None, error, None, "Video download failed, cannot transcribe"
59
-
60
- audio_path, message = extract_audio(video_path, output_format)
61
- if video_path and os.path.exists(video_path):
62
- try:
63
- os.remove(video_path)
64
- except:
65
- pass
66
-
67
- if audio_path and os.path.exists(audio_path):
68
- transcription = transcribe_audio(audio_path, api_key, model_id)
69
- transcript_file, transcript_message = save_transcription(transcription)
70
- return audio_path, message, transcript_file, transcript_message
71
- else:
72
- return None, message, None, "Audio extraction failed, cannot transcribe"
73
-
74
- def transcribe_audio(audio_path, api_key, model_id="scribe_v1"):
75
  start_time = time.time()
76
 
77
  if not api_key:
@@ -79,101 +32,133 @@ def transcribe_audio(audio_path, api_key, model_id="scribe_v1"):
79
 
80
  url = "https://api.elevenlabs.io/v1/speech-to-text"
81
  headers = {
82
- "xi-api-key": api_key,
83
- "Content-Type": "multipart/form-data" # Explicitly set content type
84
  }
85
 
86
  try:
87
  with open(audio_path, "rb") as f:
88
  files = {
89
- "file": (os.path.basename(audio_path), f, "audio/mpeg"),
90
  "model_id": (None, model_id)
91
  }
92
- response = requests.post(
93
- url,
94
- headers=headers,
95
- files=files
96
- )
97
-
98
- # More detailed error handling
99
- if response.status_code != 200:
100
- return {
101
- "error": f"API request failed with status {response.status_code}",
102
- "response_text": response.text
103
- }
104
-
105
  result = response.json()
106
  except requests.exceptions.RequestException as e:
107
  return {"error": f"API request failed: {str(e)}"}
108
  except json.JSONDecodeError:
109
  return {"error": "Failed to parse API response"}
110
- except Exception as e:
111
- return {"error": f"Unexpected error: {str(e)}"}
112
 
113
  end_time = time.time()
114
  processing_time = end_time - start_time
115
 
116
- # File size calculation
117
  file_size = os.path.getsize(audio_path) / (1024 * 1024)
118
 
119
- # Audio duration calculation with fallback
120
  try:
121
- # Attempt to get audio duration using soundfile
122
  audio_data, sample_rate = sf.read(audio_path)
123
  audio_duration = len(audio_data) / sample_rate
124
- except ImportError:
125
  try:
126
  import librosa
127
  audio_duration = librosa.get_duration(filename=audio_path)
128
  except:
129
  audio_duration = 0
130
 
131
- # Prepare comprehensive return dictionary
 
132
  return {
133
- "service": "ElevenLabs Scribe",
134
- "text": result.get('text', ''),
135
  "processing_time": processing_time,
136
- "file_size_mb": round(file_size, 2),
137
- "audio_duration": round(audio_duration, 2),
138
- "real_time_factor": round(processing_time / audio_duration, 2) if audio_duration > 0 else None,
139
- "processing_speed": round(audio_duration / processing_time, 2) if processing_time > 0 else None,
140
  "raw_response": result
141
  }
142
- with gr.Blocks(title="Video to Audio to Transcription") as app:
143
- gr.Markdown("# Video => Audio => Transcription")
144
-
145
- api_key = gr.Textbox(
146
- placeholder="Enter your ElevenLabs API key",
147
- label="ElevenLabs API Key",
148
- type="password",
149
- value=ELEVENLABS_API_KEY
150
- )
151
-
152
- model_id = gr.Dropdown(
153
- choices=["scribe_v1"],
154
- value="scribe_v1",
155
- label="Transcription Model"
156
- )
157
-
158
- with gr.Tabs():
159
- with gr.TabItem("Upload Video"):
160
- with gr.Row():
161
- with gr.Column():
162
- video_input = gr.Video(label="Upload Video")
163
- format_choice_file = gr.Radio(["mp3", "wav"], value="mp3", label="Output Format")
164
- extract_button_file = gr.Button("Extract Audio & Transcribe")
165
-
166
- with gr.Column():
167
- audio_output_file = gr.Audio(label="Extracted Audio", type="filepath")
168
- status_output_file = gr.Textbox(label="Audio Extraction Status")
169
- transcript_file_output = gr.File(label="Transcription Text File")
170
- transcript_status_output = gr.Textbox(label="Transcription Status")
171
-
172
- extract_button_file.click(
173
- fn=process_video_file,
174
- inputs=[video_input, format_choice_file, api_key, model_id],
175
- outputs=[audio_output_file, status_output_file, transcript_file_output, transcript_status_output]
 
 
 
 
 
 
 
 
176
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
  if __name__ == "__main__":
179
- app.launch()
 
1
  import os
 
 
 
 
2
  import uuid
3
  import time
4
+ import json
5
+ import requests
6
  import soundfile as sf
7
+ import gradio as gr
8
+ from moviepy.editor import VideoFileClip
9
 
10
+ ELEVENLABS_API_KEY = os.environ.get('ELEVENLABS_API_KEY', '')
11
 
12
  def extract_audio(video_path, output_format="mp3"):
13
+
14
  if not video_path:
15
  return None, "No video provided"
16
 
 
24
  except Exception as e:
25
  return None, f"Error extracting audio: {str(e)}"
26
 
27
+ def transcribe_with_scribe(audio_path, api_key, model_id="scribe_v1"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  start_time = time.time()
29
 
30
  if not api_key:
 
32
 
33
  url = "https://api.elevenlabs.io/v1/speech-to-text"
34
  headers = {
35
+ "xi-api-key": api_key
 
36
  }
37
 
38
  try:
39
  with open(audio_path, "rb") as f:
40
  files = {
41
+ "file": f,
42
  "model_id": (None, model_id)
43
  }
44
+ response = requests.post(url, headers=headers, files=files)
45
+ response.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
46
  result = response.json()
47
  except requests.exceptions.RequestException as e:
48
  return {"error": f"API request failed: {str(e)}"}
49
  except json.JSONDecodeError:
50
  return {"error": "Failed to parse API response"}
 
 
51
 
52
  end_time = time.time()
53
  processing_time = end_time - start_time
54
 
 
55
  file_size = os.path.getsize(audio_path) / (1024 * 1024)
56
 
 
57
  try:
 
58
  audio_data, sample_rate = sf.read(audio_path)
59
  audio_duration = len(audio_data) / sample_rate
60
+ except:
61
  try:
62
  import librosa
63
  audio_duration = librosa.get_duration(filename=audio_path)
64
  except:
65
  audio_duration = 0
66
 
67
+ text = result.get('text', '')
68
+
69
  return {
70
+ "service": "Scribe",
71
+ "text": text,
72
  "processing_time": processing_time,
73
+ "file_size_mb": file_size,
74
+ "audio_duration": audio_duration,
75
+ "real_time_factor": processing_time / audio_duration if audio_duration > 0 else None,
76
+ "processing_speed": audio_duration / processing_time if audio_duration > 0 else None,
77
  "raw_response": result
78
  }
79
+
80
def save_transcription(transcription):
    """Write a transcription result to a uniquely named UTF-8 text file.

    Args:
        transcription: Mapping produced by the transcription step. If it
            contains an ``"error"`` key, that message is passed through and
            no file is written. Otherwise its ``"text"`` entry is saved.

    Returns:
        A ``(path, status)`` tuple: the generated file name and a success
        message, or ``(None, message)`` when the input carried an error or
        the file could not be written.
    """
    if "error" in transcription:
        return None, transcription["error"]

    # A missing key and an explicit None are treated the same way; without
    # this, f.write(None) would fail after the file had already been created.
    text = transcription.get('text')
    if text is None:
        text = 'No text found'
    elif not isinstance(text, str):
        text = str(text)

    transcript_filename = f"transcription_{uuid.uuid4().hex[:8]}.txt"

    try:
        with open(transcript_filename, "w", encoding="utf-8") as f:
            f.write(text)
    except (OSError, UnicodeError) as e:
        # Do not leave a truncated/empty transcript behind on failure.
        try:
            os.remove(transcript_filename)
        except OSError:
            pass
        return None, f"Error saving transcription: {str(e)}"

    return transcript_filename, "Transcription saved as text file"
92
+
93
def process_video_file(video_input, output_format, api_key, model_id):
    """Extract the audio track from a video and transcribe it.

    Args:
        video_input: Path of the uploaded video, passed to ``extract_audio``.
        output_format: Audio format requested from ``extract_audio``.
        api_key: ElevenLabs API key forwarded to ``transcribe_with_scribe``.
        model_id: Transcription model identifier forwarded to
            ``transcribe_with_scribe``.

    Returns:
        A 4-tuple ``(audio_path, audio_status, transcript_path,
        transcript_status)``. When audio extraction fails, both paths are
        ``None`` and both status slots carry the extraction message.
    """
    audio_output, audio_status = extract_audio(video_input, output_format)
    if not audio_output:
        return None, audio_status, None, audio_status

    transcription = transcribe_with_scribe(audio_output, api_key, model_id)
    transcript_file, transcript_status = save_transcription(transcription)

    # The extracted audio is intentionally NOT deleted here: its path is
    # returned so the caller (the "Extracted Audio" filepath output) can read
    # it. Removing it before returning hands back a path to a missing file.
    return audio_output, audio_status, transcript_file, transcript_status
110
+
111
def create_interface():
    """Build the Gradio Blocks UI for video -> audio -> transcription.

    The layout has an API-key box and a model selector on top, and an
    "Upload Video" tab with the video input, format selector and action
    button on the left and the four result widgets on the right.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """

    def _process_with_default_key(video, output_format, key, model):
        # Use the server-side key when the visitor leaves the box empty.
        # The key is resolved here instead of being pre-filled into the
        # textbox, because a component's initial value is sent to the browser
        # and would expose the secret to every visitor.
        return process_video_file(
            video, output_format, key or ELEVENLABS_API_KEY, model
        )

    with gr.Blocks(title="Video to Audio to Transcription") as app:
        gr.Markdown("# Video => Audio => Transcription")

        with gr.Row():
            api_key = gr.Textbox(
                placeholder="Enter your ElevenLabs API key",
                label="ElevenLabs API Key",
                type="password",
            )
            model_id = gr.Dropdown(
                choices=["scribe_v1"],
                value="scribe_v1",
                label="Transcription Model",
            )

        with gr.Tabs():
            with gr.TabItem("Upload Video"):
                with gr.Row():
                    with gr.Column():
                        video_input = gr.Video(label="Upload Video")
                        format_choice_file = gr.Radio(
                            ["mp3"],
                            value="mp3",
                            label="Output Format",
                        )
                        extract_button_file = gr.Button("Extract Audio & Transcribe")

                    with gr.Column():
                        audio_output_file = gr.Audio(label="Extracted Audio", type="filepath")
                        status_output_file = gr.Textbox(label="Audio Extraction Status")
                        transcript_file_output = gr.File(label="Transcription Text File")
                        transcript_status_output = gr.Textbox(label="Transcription Status")

                extract_button_file.click(
                    fn=_process_with_default_key,
                    inputs=[video_input, format_choice_file, api_key, model_id],
                    outputs=[
                        audio_output_file,
                        status_output_file,
                        transcript_file_output,
                        transcript_status_output,
                    ],
                )

    return app
158
+
159
def main():
    """Build the Gradio interface and start serving it."""
    create_interface().launch()


if __name__ == "__main__":
    main()