Codegeass321 committed
Commit a38c567 · 1 Parent(s): e63867f

Added Audio Support

Files changed (4):
  1. __pycache__/utils.cpython-312.pyc (+0 -0)
  2. app.py (+42 -11)
  3. requirements.txt (+4 -1)
  4. utils.py (+19 -1)
__pycache__/utils.cpython-312.pyc CHANGED
Binary files a/__pycache__/utils.cpython-312.pyc and b/__pycache__/utils.cpython-312.pyc differ
 
app.py CHANGED
@@ -7,7 +7,8 @@ from utils import (
     retrieve_context_approx,
     build_prompt,
     ask_gemini,
-    load_documents_gradio, # Import the new function
+    load_documents_gradio,
+    transcribe
 )
 
 client = authenticate()
@@ -37,20 +38,50 @@ def handle_question(query):
    answer = ask_gemini(prompt, client)
    return f"### My Insights :\n\n{answer.strip()}"
 
-
-with gr.Blocks(theme='NoCrypt/miku') as demo:
-    gr.Markdown("## Ask Questions from Your Uploaded Documents")
-    #gr.Image(value="bg.JPG", visible=True)
-
-    file_input = gr.File(label="Upload Your File", file_types=['.pdf', '.txt', '.docx', '.csv', '.json', '.pptx', '.xml', '.xlsx'], file_count='multiple')
-
-    process_btn = gr.Button("Process Document")
-    status = gr.Textbox(label="Processing Status")
-
-    question = gr.Textbox(label="Ask a Question")
-    answer = gr.Markdown()
-
-    process_btn.click(upload_and_process, inputs=file_input, outputs=status)
-    question.submit(handle_question, inputs=question, outputs=answer)
+def route_question(text_input, audio_input):
+    if text_input.strip():
+        return handle_question(text_input)
+    elif audio_input is not None:
+        transcribed = transcribe(audio_input)
+        return handle_question(transcribed)
+    else:
+        return "Please provide a question by typing or speaking."
+
+def show_audio():
+    return gr.update(visible=True)
+
+css = """
+#micbttn {
+    background-color: #FFCCCB;
+    font-size: 30px;
+    height: 59px;
+}
+
+#micINP {
+    background-color: #FFCCCB;
+}
+"""
+
+with gr.Blocks(css=css, theme='NoCrypt/miku') as demo:
+    gr.Markdown("## Ask Questions from Your Uploaded Documents")
+    file_input = gr.File(label="Upload Your File", file_types=['.pdf', '.txt', '.docx', '.csv', '.json', '.pptx', '.xml', '.xlsx'], file_count='multiple')
+
+    process_btn = gr.Button("Process Document")
+    status = gr.Textbox(label="Processing Status")
+
+    gr.Markdown("### Ask your question (type or speak):")
+
+    with gr.Row():
+        text_question = gr.Textbox(placeholder="Type your question...", scale=9, show_label=False)
+        mic_btn = gr.Button("🎤", scale=1, elem_id="micbttn")
+
+    audio_input = gr.Audio(sources=["microphone"], type="numpy", visible=False, label=None, elem_id="micINP")
+
+    submit_btn = gr.Button("Submit")
+    answer = gr.Markdown()
+
+    process_btn.click(upload_and_process, inputs=file_input, outputs=status)
+    mic_btn.click(show_audio, outputs=audio_input)
+    submit_btn.click(route_question, inputs=[text_question, audio_input], outputs=answer)
 
 demo.launch(share=True) # Or demo.deploy(hf_space="your-username/your-space-name")
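
The new UI follows a reveal-and-route pattern: the Audio component starts hidden, the 🎤 button's handler returns gr.update(visible=True) to reveal it, and a single submit handler prefers typed text before falling back to transcribed speech. A minimal standalone sketch of the same pattern, assuming Gradio 4.x semantics (component names here are illustrative, not the app's):

import gradio as gr

def show_audio():
    # gr.update(...) patches the hidden Audio component's properties in place.
    return gr.update(visible=True)

def route(text, audio):
    # Prefer typed text; fall back to the recording, mirroring route_question.
    if text and text.strip():
        return f"You typed: {text}"
    if audio is not None:
        sr, y = audio  # type="numpy" delivers a (sample_rate, ndarray) tuple
        return f"Got {len(y) / sr:.1f}s of audio at {sr} Hz"
    return "Please type or speak a question."

with gr.Blocks() as demo:
    text = gr.Textbox(show_label=False, placeholder="Type your question...")
    mic_btn = gr.Button("🎤")
    audio = gr.Audio(sources=["microphone"], type="numpy", visible=False)
    out = gr.Markdown()
    mic_btn.click(show_audio, outputs=audio)
    gr.Button("Submit").click(route, inputs=[text, audio], outputs=out)

demo.launch()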
requirements.txt CHANGED
@@ -11,4 +11,7 @@ unstructured[pdf]
 unstructured[docx]
 unstructured[ppt]
 unstructured[excel]
-unstructured[xml]
+unstructured[xml]
+torch
+torchaudio
+transformers
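
The three new packages back the speech path: transformers supplies the automatic-speech-recognition pipeline used in utils.py, torch is its model backend, and torchaudio is presumably included for audio handling.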
utils.py CHANGED
@@ -12,6 +12,7 @@ warnings.filterwarnings("ignore")
 from google import genai
 from google.genai import types
 from sentence_transformers import SentenceTransformer
+from transformers import pipeline
 from langchain_community.document_loaders import(
     UnstructuredPDFLoader,
     TextLoader,
@@ -138,4 +139,21 @@ def ask_gemini(prompt, client):
        contents=[prompt],
        config=types.GenerateContentConfig(max_output_tokens=2048, temperature=0.5, seed=42),
    )
-    return response.text
+    return response.text
+
+# Speech2Text:
+def transcribe(audio, model="openai/whisper-base.en"):
+    if audio is None:
+        raise ValueError("No audio detected!")
+
+    transcriber = pipeline("automatic-speech-recognition", model=model)
+    sr, y = audio  # sampling rate (Hz) and amplitude array
+
+    if y.ndim > 1:  # convert stereo (2 channels, L & R) to mono
+        y = y.mean(1)
+
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))  # normalize amplitudes to the range [-1, 1]
+
+    result = transcriber({"sampling_rate": sr, "raw": y})
+    return result["text"]
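
Since gr.Audio(type="numpy") hands transcribe a (sample_rate, ndarray) tuple, the function can also be exercised outside Gradio with any reader that yields the same shape, e.g. scipy's wavfile (an assumption here: scipy is not listed in requirements.txt, and question.wav is a placeholder path):

from scipy.io import wavfile

sr, y = wavfile.read("question.wav")  # returns (sample_rate, int16 ndarray)
print(transcribe((sr, y)))            # transcribe() handles dtype and mono conversion

One design note: transcribe builds a fresh pipeline on every call, so Whisper is reloaded per question; caching the transcriber at module level would avoid that cost.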