Athspi committed · verified
Commit f0fbb06 · 1 Parent(s): 513f7a6

Update app.py

Files changed (1)
  1. app.py +112 -62
app.py CHANGED
@@ -13,21 +13,20 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
  MODEL_REPO = "microsoft/Phi-4-mini-instruct-onnx"

  # --- Defaulting to CPU INT4 for Hugging Face Spaces ---
- EXECUTION_PROVIDER = "cpu"
+ EXECUTION_PROVIDER = "cpu" # Corresponds to installing 'onnxruntime-genai'
  MODEL_VARIANT_GLOB = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*"
- # Ensure requirements.txt lists: onnxruntime-genai
  # --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

  # --- (Optional) Alternative GPU Configuration ---
- # EXECUTION_PROVIDER = "cuda"
+ # EXECUTION_PROVIDER = "cuda" # Corresponds to installing 'onnxruntime-genai-cuda'
  # MODEL_VARIANT_GLOB = "gpu/gpu-int4-rtn-block-32/*"
- # Ensure requirements.txt lists: onnxruntime-genai-cuda
  # --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

  LOCAL_MODEL_DIR = "./phi4-mini-onnx-model" # Directory within the Space
  HF_LOGO_URL = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
  HF_MODEL_URL = f"https://huggingface.co/{MODEL_REPO}"
  ORT_GENAI_URL = "https://github.com/microsoft/onnxruntime-genai"
+ PHI_LOGO_URL = "https://microsoft.github.io/phi/assets/img/logo-final.png" # Phi logo for bot avatar

  # Global variables for model and tokenizer
  model = None
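Note: the code that downloads the chosen variant into LOCAL_MODEL_DIR sits outside the hunks shown in this diff; MODEL_VARIANT_GLOB is simply the pattern that selects which quantized files come from MODEL_REPO. A minimal sketch of how such a glob is typically used with huggingface_hub's snapshot_download (an illustration under that assumption, not code from this commit):

    # Hypothetical download helper matching the constants above (not part of this commit).
    from huggingface_hub import snapshot_download

    def download_model_variant():
        # allow_patterns restricts the download to the chosen INT4 CPU variant.
        return snapshot_download(
            repo_id="microsoft/Phi-4-mini-instruct-onnx",
            allow_patterns=["cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*"],
            local_dir="./phi4-mini-onnx-model",
        )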
@@ -68,41 +67,41 @@ def initialize_model():
      model_status = f"Loading model ({EXECUTION_PROVIDER.upper()})..."
      logging.info(model_status)
      try:
-         # Determine device type based on execution provider string
-         if EXECUTION_PROVIDER.lower() == "cuda":
-             og_device_type = og.DeviceType.CUDA
-         elif EXECUTION_PROVIDER.lower() == "dml":
-             og_device_type = og.DeviceType.DML # Requires onnxruntime-genai-directml
-         else: # Default to CPU
-             og_device_type = og.DeviceType.CPU
-
-         model = og.Model(model_path, og_device_type)
+         # FIX: Remove explicit DeviceType. Let the library infer or use string if needed by constructor.
+         # The simple constructor often works by detecting the installed ORT package.
+         logging.info(f"Using provider based on installed package (expecting: {EXECUTION_PROVIDER})")
+         model = og.Model(model_path) # Simplified model loading
          tokenizer = og.Tokenizer(model)
          model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})"
          logging.info("Model and Tokenizer loaded successfully.")
+     except AttributeError as ae:
+         logging.error(f"AttributeError during model/tokenizer init: {ae}", exc_info=True)
+         logging.error("This might indicate an installation issue or version incompatibility with onnxruntime_genai.")
+         model_status = f"Init Error: {ae}"
+         raise RuntimeError(f"Failed to initialize model/tokenizer: {ae}")
      except Exception as e:
          logging.error(f"Error loading model or tokenizer: {e}", exc_info=True)
          model_status = f"Error loading model: {e}"
          raise RuntimeError(f"Failed to load model: {e}")

- # --- Generation Function ---
- def generate_response(prompt, history, max_length, temperature, top_p, top_k):
-     """Generates a response using the Phi-4 ONNX model, yielding partial results."""
+ # --- Generation Function (Core Logic) ---
+ def generate_response_stream(prompt, history, max_length, temperature, top_p, top_k):
+     """Generates a response using the Phi-4 ONNX model, yielding text chunks."""
      global model_status
      if not model or not tokenizer:
          model_status = "Error: Model not initialized!"
          yield "Error: Model not initialized. Please check logs."
          return
-     if not prompt:
-         yield "Please enter a prompt."
-         return

      # --- Prepare the prompt using the Phi-4 instruct format ---
      full_prompt = ""
-     for user_msg, assistant_msg in history:
+     # History format is [[user1, bot1], [user2, bot2], ...]
+     for user_msg, assistant_msg in history: # history here is *before* the current prompt
          full_prompt += f"<|user|>\n{user_msg}<|end|>\n"
-         if assistant_msg:
+         if assistant_msg: # Append assistant message only if it exists
              full_prompt += f"<|assistant|>\n{assistant_msg}<|end|>\n"
+
+     # Add the current user prompt and the trigger for the assistant's response
      full_prompt += f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"

      logging.info(f"Generating response (MaxL: {max_length}, Temp: {temperature}, TopP: {top_p}, TopK: {top_k})")
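For one prior exchange plus a new question, the prompt-building loop above produces text of the following shape (worked example with invented messages, runnable on its own):

    history = [["What is ONNX?", "ONNX is an open format for ML models."]]
    prompt = "Does it support quantization?"

    full_prompt = ""
    for user_msg, assistant_msg in history:
        full_prompt += f"<|user|>\n{user_msg}<|end|>\n"
        if assistant_msg:
            full_prompt += f"<|assistant|>\n{assistant_msg}<|end|>\n"
    full_prompt += f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"

    print(full_prompt)
    # <|user|>
    # What is ONNX?<|end|>
    # <|assistant|>
    # ONNX is an open format for ML models.<|end|>
    # <|user|>
    # Does it support quantization?<|end|>
    # <|assistant|>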
@@ -127,11 +126,11 @@ def generate_response(prompt, history, max_length, temperature, top_p, top_k):

          start_time = time.time()
          generator = og.Generator(model, params)
-         response_text = ""
          model_status = "Generating..." # Update status indicator
          logging.info("Streaming response...")

          first_token_time = None
+         token_count = 0
          while not generator.is_done():
              generator.compute_logits()
              generator.generate_next_token()
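The params object consumed by og.Generator above is built in an unchanged part of app.py that this diff does not show. Purely for orientation, a typical setup in the same generation of the onnxruntime-genai API (the one with the compute_logits()/generate_next_token() loop used here) looks roughly like the sketch below; it reuses the surrounding function's variables and is an assumption, not code from this commit:

    # Sketch of a typical onnxruntime-genai parameter setup (assumed, not taken from this diff).
    input_tokens = tokenizer.encode(full_prompt)

    params = og.GeneratorParams(model)
    params.set_search_options(
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        do_sample=True,
    )
    params.input_ids = input_tokens  # older onnxruntime-genai releases attach the prompt here
    generator = og.Generator(model, params)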
@@ -144,45 +143,83 @@ def generate_response(prompt, history, max_length, temperature, top_p, top_k):
                  break

              decoded_chunk = tokenizer.decode([next_token])
+             token_count += 1

              # Handle potential decoding issues or special tokens if necessary
-             # (e.g., some models might output "<|end|>" which you might want to strip)
              if decoded_chunk == "<|end|>": # Example: Stop if assistant outputs end token explicitly
                  logging.info("Assistant explicitly generated <|end|> token.")
                  break
+             if decoded_chunk == tokenizer.eos_token: # Check against tokenizer's eos_token string
+                 logging.info("Assistant generated EOS token string.")
+                 break

-             response_text += decoded_chunk
-             yield response_text # Yield intermediate results for streaming effect
+
+             yield decoded_chunk # Yield just the text chunk

          end_time = time.time()
          ttft = (first_token_time - start_time) * 1000 if first_token_time else -1
          total_time = end_time - start_time
-         token_count = len(tokenizer.decode(generator.get_output_sequences()[0])) # Approx token count
          tps = (token_count / total_time) if total_time > 0 else 0

-         logging.info(f"Generation complete. Tokens: ~{token_count}, Total Time: {total_time:.2f}s, TTFT: {ttft:.2f}ms, TPS: {tps:.2f}")
+         logging.info(f"Generation complete. Tokens: {token_count}, Total Time: {total_time:.2f}s, TTFT: {ttft:.2f}ms, TPS: {tps:.2f}")
          model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})" # Reset status

-         # Final yield with the complete text
-         yield response_text.strip()
-
      except Exception as e:
          logging.error(f"Error during generation: {e}", exc_info=True)
          model_status = f"Error during generation: {e}"
-         yield f"Sorry, an error occurred during generation: {e}"
+         yield f"\n\nSorry, an error occurred during generation: {e}" # Yield error message
+
+ # --- Gradio Interface Functions ---
+
+ # 1. Function to add user message to chat history
+ def add_user_message(user_message, history):
+     """Adds the user's message to the chat history for display."""
+     if not user_message:
+         raise gr.Error("Please enter a message.")
+     history = history + [[user_message, None]] # Append user message, leave bot response None
+     return "", history # Clear input textbox, return updated history
+
+ # 2. Function to handle bot response generation and streaming
+ def generate_bot_response(history, max_length, temperature, top_p, top_k):
+     """Generates the bot's response based on the history and streams it."""
+     if not history or history[-1][1] is not None:
+         # This shouldn't happen in the normal flow, but good practice
+         return history
+
+     user_prompt = history[-1][0] # Get the latest user prompt
+     # Prepare history for the model (all turns *before* the current one)
+     model_history = history[:-1]
+
+     # Get the generator stream
+     response_stream = generate_response_stream(
+         user_prompt, model_history, max_length, temperature, top_p, top_k
+     )

- # --- Clear Chat Function ---
+     # Stream the response chunks back to Gradio
+     history[-1][1] = "" # Initialize the bot response string
+     for chunk in response_stream:
+         history[-1][1] += chunk # Append the chunk to the bot's message in history
+         yield history # Yield the *entire updated history* back to Chatbot
+
+ # 3. Function to clear chat
  def clear_chat():
-     return None, None # Clears Textbox and Chatbot
+     """Clears the chat history and input."""
+     global model_status # Keep model status indicator updated
+     # Reset status only if it was showing an error from generation maybe?
+     # Or just always reset to Ready if model is loaded.
+     if model and tokenizer:
+         model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})"
+     # Keep the original error if init failed
+     return None, [], model_status # Clear Textbox, Chatbot history, and update status display
+

  # --- Initialize Model on App Start ---
- # Wrap in try-except to allow Gradio UI to potentially load even if model fails
  try:
      initialize_model()
  except Exception as e:
      print(f"FATAL: Model initialization failed: {e}")
-     model_status = f"FATAL ERROR during init: {e}"
-     # The UI will still load, but generation will fail. The status will show the error.
+     # model_status is already set inside initialize_model on error
+

  # --- Gradio Interface ---
  logging.info("Creating Gradio Interface...")
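The key behavioural change in this hunk is that generate_response_stream now yields bare text chunks, while the new generate_bot_response wrapper re-yields the whole history list after appending each chunk, which is the shape a streaming gr.Chatbot callback expects. A stripped-down, self-contained sketch of that contract (stub chunk source, and a plain ValueError standing in for gr.Error):

    # Minimal sketch of the history contract used above (stubbed, not the app's actual wiring).
    def add_user_message(user_message, history):
        if not user_message:
            raise ValueError("Please enter a message.")   # the app raises gr.Error here
        return "", history + [[user_message, None]]       # clear textbox, add pending turn

    def stream_bot(history, chunks):
        history[-1][1] = ""                                # fill in the pending bot slot
        for chunk in chunks:                               # chunks would come from generate_response_stream
            history[-1][1] += chunk
            yield history                                  # Chatbot redraws on each snapshot

    _, hist = add_user_message("Hello", [])
    for snapshot in stream_bot(hist, ["Hi", " there", "!"]):
        print(snapshot)
    # [['Hello', 'Hi']]
    # [['Hello', 'Hi there']]
    # [['Hello', 'Hi there!']]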
@@ -193,10 +230,6 @@ theme = gr.themes.Soft(
      secondary_hue="sky",
      neutral_hue="slate",
      font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
- ).set(
-     # Customize specific component styles if needed
-     # button_primary_background_fill="*primary_500",
-     # button_primary_background_fill_hover="*primary_400",
  )

  with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
@@ -206,10 +239,11 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
              gr.Markdown(f"""
              # Phi-4 Mini Instruct ONNX Chat 🤖
              Interact with the quantized `{model_variant_name}` version of [`{MODEL_REPO}`]({HF_MODEL_URL})
-             running efficiently via [`onnxruntime-genai`]({ORT_GENAI_URL}).
+             running efficiently via [`onnxruntime-genai`]({ORT_GENAI_URL}) ({EXECUTION_PROVIDER.upper()}).
              """)
          with gr.Column(scale=1, min_width=150):
              gr.Image(HF_LOGO_URL, elem_id="hf-logo", show_label=False, show_download_button=False, container=False, height=50)
+     # Use the global model_status variable for the initial value
      model_status_text = gr.Textbox(value=model_status, label="Model Status", interactive=False, max_lines=2)


@@ -222,7 +256,7 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
                  height=600,
                  layout="bubble",
                  bubble_full_width=False,
-                 avatar_images=(None, "https://microsoft.github.io/phi/assets/img/logo-final.png") # (user, bot) - Optional: Add user avatar path/URL if desired
+                 avatar_images=(None, PHI_LOGO_URL) # (user, bot)
              )
              with gr.Row():
                  prompt_input = gr.Textbox(
@@ -231,8 +265,10 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
                      lines=4,
                      scale=9 # Make textbox wider
                  )
-                 submit_button = gr.Button("Send", variant="primary", scale=1, min_width=120) # Primary send button
-                 clear_button = gr.Button("🗑️ Clear", variant="secondary", scale=1, min_width=120) # Secondary clear button
+                 # Combine Send and Clear Buttons Vertically? Or keep side-by-side? Side-by-side looks better
+                 with gr.Column(scale=1, min_width=120):
+                     submit_button = gr.Button("Send", variant="primary", size="lg")
+                     clear_button = gr.Button("🗑️ Clear Chat", variant="secondary")


          # Settings Column
@@ -246,37 +282,51 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:

              gr.Markdown("---") # Separator
              gr.Markdown("ℹ️ **Note:** Uses Phi-4 instruction format: \n`<|user|>\nPROMPT<|end|>\n<|assistant|>`")
+             gr.Markdown(f"Running on **{EXECUTION_PROVIDER.upper()}**.")


      # Event Listeners (Connecting UI components to functions)

-     # Define reusable inputs list for generation
-     gen_inputs = [prompt_input, chatbot, max_length, temperature, top_p, top_k]
-
-     # Submit action (using streaming yields from generate_response)
-     submit_button.click(
-         fn=generate_response,
-         inputs=gen_inputs,
-         outputs=[chatbot], # Output directly streams to chatbot
-         queue=True # Enable queuing
+     # Define inputs for the bot response generator
+     bot_response_inputs = [chatbot, max_length, temperature, top_p, top_k]
+
+     # Chain actions:
+     # 1. User presses Enter or clicks Send
+     # 2. `add_user_message` updates history, clears input
+     # 3. `generate_bot_response` streams bot reply into history
+     submit_event = prompt_input.submit(
+         fn=add_user_message,
+         inputs=[prompt_input, chatbot],
+         outputs=[prompt_input, chatbot], # Update textbox and history
+         queue=False, # Submit is fast
+     ).then(
+         fn=generate_bot_response, # Call the generator function
+         inputs=bot_response_inputs, # Pass history and params
+         outputs=[chatbot], # Stream output directly to chatbot
+         api_name="chat" # Optional: name for API usage
      )
-     # Allow submitting via Enter key in the textbox as well
-     prompt_input.submit(
-         fn=generate_response,
-         inputs=gen_inputs,
+
+     submit_button.click( # Mirror actions for button click
+         fn=add_user_message,
+         inputs=[prompt_input, chatbot],
+         outputs=[prompt_input, chatbot],
+         queue=False,
+     ).then(
+         fn=generate_bot_response,
+         inputs=bot_response_inputs,
          outputs=[chatbot],
-         queue=True
+         api_name=False # Don't expose button click as separate API endpoint
      )

      # Clear button action
      clear_button.click(
          fn=clear_chat,
          inputs=None,
-         outputs=[prompt_input, chatbot], # Clear both input and chat history
-         queue=False # No need to queue clearing
+         outputs=[prompt_input, chatbot, model_status_text], # Clear input, chat, and update status text
+         queue=False # Clearing is fast
      )

  # Launch the Gradio app
  logging.info("Launching Gradio App...")
- demo.queue() # Enable queuing for handling concurrent users/requests
- demo.launch(show_error=True, max_threads=40) # show_error=True helps debug in Spaces
+ demo.queue(max_size=20) # Enable queuing with a limit
+ demo.launch(show_error=True, max_threads=40)