Athspi commited on
Commit
3fd0067
·
verified ·
1 Parent(s): c02bb52

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +166 -122
app.py CHANGED
@@ -1,138 +1,182 @@
 
 
1
  import gradio as gr
2
- import asyncio
3
- import numpy as np
4
  from google import genai
5
  from google.genai import types
6
- import soundfile as sf
7
- import io
8
 
9
- # Configuration
10
- SAMPLE_RATE = 24000
11
- MODEL = "gemini-2.0-flash-exp" # Correct experimental model name
 
 
 
 
 
 
 
 
 
12
 
13
class GeminiTTS:
    """Wraps the Gemini Live API to speak user-provided text aloud."""

    def __init__(self, api_key):
        # Fail fast on a missing key instead of surfacing an opaque API error later.
        if not api_key:
            raise ValueError("API key cannot be empty")
        # v1alpha is required for the experimental live (bidirectional) API.
        self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
        self.config = types.LiveConnectConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
                )
            ),
            # Instruct the model to echo the user's text verbatim (pure TTS behavior).
            system_instruction=types.Content(
                parts=[types.Part.from_text(text="Speak exactly what the user says")],
                role="user"
            ),
        )

    async def text_to_speech(self, text):
        """Send `text` through a live session.

        Returns a (sample_rate, wav_bytes) tuple for audio, a plain string for
        a text reply or an error message, or None if nothing was received.
        """
        try:
            async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
                # A single space keeps the request valid when text is empty.
                await session.send(input=text or " ", end_of_turn=True)

                async for response in session.receive():
                    if audio_data := response.data:
                        # NOTE(review): assumes the stream is float32 PCM — confirm;
                        # Gemini live audio is commonly 16-bit PCM instead.
                        audio_array = np.frombuffer(audio_data, dtype=np.float32)

                        # Handle empty/quiet audio with half a second of silence.
                        if audio_array.size == 0:
                            audio_array = np.zeros(int(SAMPLE_RATE * 0.5))  # 0.5s of silence

                        # Normalize audio to prevent processing warnings downstream.
                        max_val = np.max(np.abs(audio_array))
                        if max_val > 0:
                            audio_array = audio_array / max_val

                        # Convert to proper format for Gradio and return the first chunk.
                        return self._create_audio_response(audio_array)

                    if text_response := response.text:
                        return text_response

                return None
        except Exception as e:
            # Errors are returned as strings so the UI can display them inline.
            return f"Error: {str(e)}"

    def _create_audio_response(self, audio_array):
        """Create properly formatted audio response for Gradio."""
        # Convert normalized floats to 16-bit PCM format.
        audio_array = (audio_array * 32767).astype(np.int16)

        # Create a WAV file entirely in memory.
        with io.BytesIO() as wav_buffer:
            with sf.SoundFile(
                wav_buffer,
                mode='w',
                samplerate=SAMPLE_RATE,
                channels=1,
                format='WAV',
                subtype='PCM_16'
            ) as sf_file:
                sf_file.write(audio_array)
            wav_bytes = wav_buffer.getvalue()

        # NOTE(review): returns (rate, wav_bytes); gr.Audio usually expects a
        # filepath or (rate, np.ndarray) — verify against the interface wiring.
        return (SAMPLE_RATE, wav_bytes)
79
 
80
def create_interface():
    """Build the Gradio UI; the TTS engine is created lazily via init_engine
    once the user supplies an API key."""
    tts_engine = None  # set by init_engine through the closure below

    def init_engine(api_key):
        # Instantiate the engine once a key is provided; report status as text.
        nonlocal tts_engine
        try:
            tts_engine = GeminiTTS(api_key)
            return "✅ TTS Initialized Successfully"
        except Exception as e:
            return f"❌ Initialization Failed: {str(e)}"

    async def generate_speech(text):
        # Guard: the engine must be initialized before any synthesis request.
        if not tts_engine:
            raise gr.Error("Please initialize the TTS first")

        result = await tts_engine.text_to_speech(text)

        if isinstance(result, str):
            return None, result  # Return error message
        elif result:
            return result, ""  # Return audio and empty message
        return None, "No response received"

    with gr.Blocks(title="Gemini TTS") as app:
        gr.Markdown("# 🎤 Gemini Text-to-Speech")

        with gr.Row():
            api_key = gr.Textbox(
                label="API Key",
                type="password",
                placeholder="Enter your Gemini API key"
            )
            init_btn = gr.Button("Initialize")

        init_status = gr.Textbox(label="Status", interactive=False)
        init_btn.click(init_engine, inputs=api_key, outputs=init_status)

        with gr.Group():
            text_input = gr.Textbox(
                label="Input Text",
                lines=3,
                placeholder="Type something to speak..."
            )
            generate_btn = gr.Button("Generate Speech")

            # NOTE(review): type="filepath" but generate_speech returns a
            # (rate, bytes) tuple — likely a type mismatch; confirm what this
            # Gradio version accepts.
            audio_output = gr.Audio(label="Output Audio", type="filepath")
            text_output = gr.Textbox(label="Messages", interactive=False)

            generate_btn.click(
                generate_speech,
                inputs=text_input,
                outputs=[audio_output, text_output]
            )

    return app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at the standard Gradio port.
    gradio_app = create_interface()
    gradio_app.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
  import gradio as gr
4
+ import requests
5
+ import markdownify
6
  from google import genai
7
  from google.genai import types
8
+ from urllib.robotparser import RobotFileParser
9
+ from urllib.parse import urlparse
10
 
11
+ # Configure browser tools
12
def can_crawl_url(url: str, user_agent: str = "*") -> bool:
    """Return True if the host's robots.txt permits `user_agent` to fetch `url`.

    Any failure (malformed URL, unreachable robots.txt) is treated as
    "not allowed" and the error is reported on stdout.
    """
    try:
        parts = urlparse(url)
        parser = RobotFileParser(f"{parts.scheme}://{parts.netloc}/robots.txt")
        parser.read()
        allowed = parser.can_fetch(user_agent, url)
    except Exception as e:
        print(f"Error checking robots.txt: {e}")
        return False
    return allowed
23
 
24
def load_page(url: str) -> str:
    """Fetch `url` and return its HTML converted to markdown.

    Returns a human-readable error string when the URL fails the robots.txt
    check, the request fails, or the server answers with an HTTP error.
    """
    if not can_crawl_url(url):
        return f"URL {url} failed robots.txt check"
    try:
        response = requests.get(url, timeout=10)
        # BUG FIX: surface 4xx/5xx as errors instead of silently converting
        # the server's error page to markdown as if it were real content.
        response.raise_for_status()
        return markdownify.markdownify(response.text)
    except Exception as e:
        return f"Error loading page: {str(e)}"
33
+
34
# Configure Gemini client
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
MODEL = "gemini-2.0-flash"

# Schema for the custom browsing tool the model may call.
_LOAD_PAGE_DECL = types.FunctionDeclaration(
    name="load_page",
    description="Load webpage content as markdown",
    parameters={
        "type": "object",
        "properties": {
            "url": {"type": "string", "description": "Full URL to load"}
        },
        "required": ["url"]
    }
)

# Custom browser tool plus Google's built-in search and code execution.
TOOLS = [
    types.Tool(function_declarations=[_LOAD_PAGE_DECL]),
    types.Tool(google_search=types.GoogleSearch()),
    types.Tool(code_execution=types.ToolCodeExecution())
]

SYSTEM_INSTRUCTION = """You are an AI assistant with multiple capabilities:
1. Web browsing through search and direct page access
2. Code execution for calculations, simulations, and data analysis
3. File I/O operations for data processing

Use this decision tree:
- For factual questions: Use search
- For time-sensitive data: Use browser tool
- For math/data processing: Generate and execute code
- Always explain your reasoning"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
def format_code_response(parts):
    """Format Gemini response parts into a single Markdown string.

    Handles plain text, generated code (fenced as python), code-execution
    output, and inline images (rendered as base64 data URIs). The rendered
    fragments are joined by blank lines.
    """
    formatted = []
    for part in parts:
        if part.text:
            formatted.append(part.text)
        if part.executable_code:
            formatted.append(f"```python\n{part.executable_code.code}\n```")
        if part.code_execution_result:
            formatted.append(f"**Result**:\n{part.code_execution_result.output}")
        if part.inline_data:
            # IMPROVED: use the blob's declared mime type when available
            # instead of hard-coding image/png for every inline image.
            mime = getattr(part.inline_data, "mime_type", None) or "image/png"
            payload = base64.b64encode(part.inline_data.data).decode()
            formatted.append(f"![Generated Image](data:{mime};base64,{payload})")
    return "\n\n".join(formatted)
81
+
82
def generate_response(user_input):
    """Generator yielding progressively-formatted Markdown answers.

    Creates a fresh tool-enabled chat per query, yields after each response
    part so the UI updates live, and resolves any `load_page` function call
    the model issues before yielding the continuation.
    """
    chat = client.chats.create(
        model=MODEL,
        config=types.GenerateContentConfig(
            temperature=0.7,
            tools=TOOLS,
            system_instruction=SYSTEM_INSTRUCTION
        )
    )

    # Initial request
    response = chat.send_message(user_input)

    # Process all response parts, re-rendering the accumulated transcript
    # after each one. NOTE(review): the original diff's indentation is
    # ambiguous; the function-call handling is assumed to be per-part.
    response_parts = []
    for part in response.candidates[0].content.parts:
        response_parts.append(part)
        yield format_code_response(response_parts)

        # Handle function calls issued by the model.
        if part.function_call:
            fn = part.function_call
            if fn.name == "load_page":
                result = load_page(**fn.args)
                # BUG FIX: the reply to the FunctionResponse message IS the
                # model's continuation. The previous code discarded it and
                # then called chat.send_message("") — an empty message the
                # API rejects — to fetch a "final" response.
                final_response = chat.send_message(
                    types.Content(
                        parts=[
                            types.Part(
                                function_response=types.FunctionResponse(
                                    name=fn.name,
                                    id=fn.id,
                                    response={"result": result}
                                )
                            )
                        ]
                    )
                )
                for final_part in final_response.candidates[0].content.parts:
                    response_parts.append(final_part)
                    yield format_code_response(response_parts)
127
 
128
+ # Gradio Interface
129
+ with gr.Blocks(title="Gemini 2.0 AI Assistant") as demo:
130
+ gr.Markdown("# 🚀 Gemini 2.0 AI Assistant")
131
+ gr.Markdown("Web Access • Code Execution • Data Analysis")
132
+
133
+ with gr.Row():
134
+ input_box = gr.Textbox(
135
+ label="Your Query",
136
+ placeholder="Ask anything or request code execution...",
137
+ lines=3,
138
+ max_lines=10,
139
+ autofocus=True
140
+ )
141
+ output_box = gr.Markdown(
142
+ label="Assistant Response",
143
+ elem_classes="markdown-output"
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  )
145
 
146
+ with gr.Row():
147
+ submit_btn = gr.Button("Submit", variant="primary")
148
+ clear_btn = gr.Button("Clear")
149
+
150
+ def clear():
151
+ return ["", ""]
152
+
153
+ submit_btn.click(
154
+ fn=generate_response,
155
+ inputs=input_box,
156
+ outputs=output_box,
157
+ queue=True
158
+ )
159
+
160
+ clear_btn.click(
161
+ fn=clear,
162
+ inputs=[],
163
+ outputs=[input_box, output_box]
164
+ )
165
 
166
if __name__ == "__main__":
    # BUG FIX: Blocks.launch() does not accept a `css` keyword argument — it
    # raises TypeError, so the app never started. Custom CSS must be passed
    # to the gr.Blocks(...) constructor instead.
    # TODO(review): re-add the .markdown-output stylesheet via
    # gr.Blocks(css=...) where `demo` is created.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860
    )