Athspi committed · verified
Commit f0fbb06 · 1 Parent(s): 513f7a6

Update app.py

Files changed (1)
  1. app.py +112 -62
app.py CHANGED
@@ -13,21 +13,20 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
  MODEL_REPO = "microsoft/Phi-4-mini-instruct-onnx"

  # --- Defaulting to CPU INT4 for Hugging Face Spaces ---
- EXECUTION_PROVIDER = "cpu"
+ EXECUTION_PROVIDER = "cpu" # Corresponds to installing 'onnxruntime-genai'
  MODEL_VARIANT_GLOB = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*"
- # Ensure requirements.txt lists: onnxruntime-genai
  # --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

  # --- (Optional) Alternative GPU Configuration ---
- # EXECUTION_PROVIDER = "cuda"
+ # EXECUTION_PROVIDER = "cuda" # Corresponds to installing 'onnxruntime-genai-cuda'
  # MODEL_VARIANT_GLOB = "gpu/gpu-int4-rtn-block-32/*"
- # Ensure requirements.txt lists: onnxruntime-genai-cuda
  # --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

  LOCAL_MODEL_DIR = "./phi4-mini-onnx-model" # Directory within the Space
  HF_LOGO_URL = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
  HF_MODEL_URL = f"https://huggingface.co/{MODEL_REPO}"
  ORT_GENAI_URL = "https://github.com/microsoft/onnxruntime-genai"
+ PHI_LOGO_URL = "https://microsoft.github.io/phi/assets/img/logo-final.png" # Phi logo for bot avatar

  # Global variables for model and tokenizer
  model = None
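Note: the code that downloads the chosen variant into LOCAL_MODEL_DIR sits outside the hunks shown in this diff; MODEL_VARIANT_GLOB is simply the pattern that selects which quantized files come from MODEL_REPO. A minimal sketch of how such a glob is typically used with huggingface_hub's snapshot_download (an illustration under that assumption, not code from this commit):

    # Hypothetical download helper matching the constants above (not part of this commit).
    from huggingface_hub import snapshot_download

    def download_model_variant():
        # allow_patterns restricts the download to the chosen INT4 CPU variant.
        return snapshot_download(
            repo_id="microsoft/Phi-4-mini-instruct-onnx",
            allow_patterns=["cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*"],
            local_dir="./phi4-mini-onnx-model",
        )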
@@ -68,41 +67,41 @@ def initialize_model():
      model_status = f"Loading model ({EXECUTION_PROVIDER.upper()})..."
      logging.info(model_status)
      try:
-         # Determine device type based on execution provider string
-         if EXECUTION_PROVIDER.lower() == "cuda":
-             og_device_type = og.DeviceType.CUDA
-         elif EXECUTION_PROVIDER.lower() == "dml":
-             og_device_type = og.DeviceType.DML # Requires onnxruntime-genai-directml
-         else: # Default to CPU
-             og_device_type = og.DeviceType.CPU
-
-         model = og.Model(model_path, og_device_type)
+         # FIX: Remove explicit DeviceType. Let the library infer or use string if needed by constructor.
+         # The simple constructor often works by detecting the installed ORT package.
+         logging.info(f"Using provider based on installed package (expecting: {EXECUTION_PROVIDER})")
+         model = og.Model(model_path) # Simplified model loading
          tokenizer = og.Tokenizer(model)
          model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})"
          logging.info("Model and Tokenizer loaded successfully.")
+     except AttributeError as ae:
+         logging.error(f"AttributeError during model/tokenizer init: {ae}", exc_info=True)
+         logging.error("This might indicate an installation issue or version incompatibility with onnxruntime_genai.")
+         model_status = f"Init Error: {ae}"
+         raise RuntimeError(f"Failed to initialize model/tokenizer: {ae}")
      except Exception as e:
          logging.error(f"Error loading model or tokenizer: {e}", exc_info=True)
          model_status = f"Error loading model: {e}"
          raise RuntimeError(f"Failed to load model: {e}")

- # --- Generation Function ---
- def generate_response(prompt, history, max_length, temperature, top_p, top_k):
-     """Generates a response using the Phi-4 ONNX model, yielding partial results."""
+ # --- Generation Function (Core Logic) ---
+ def generate_response_stream(prompt, history, max_length, temperature, top_p, top_k):
+     """Generates a response using the Phi-4 ONNX model, yielding text chunks."""
      global model_status
      if not model or not tokenizer:
          model_status = "Error: Model not initialized!"
          yield "Error: Model not initialized. Please check logs."
          return
-     if not prompt:
-         yield "Please enter a prompt."
-         return

      # --- Prepare the prompt using the Phi-4 instruct format ---
      full_prompt = ""
-     for user_msg, assistant_msg in history:
+     # History format is [[user1, bot1], [user2, bot2], ...]
+     for user_msg, assistant_msg in history: # history here is *before* the current prompt
          full_prompt += f"<|user|>\n{user_msg}<|end|>\n"
-         if assistant_msg:
+         if assistant_msg: # Append assistant message only if it exists
              full_prompt += f"<|assistant|>\n{assistant_msg}<|end|>\n"
+
+     # Add the current user prompt and the trigger for the assistant's response
      full_prompt += f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"

      logging.info(f"Generating response (MaxL: {max_length}, Temp: {temperature}, TopP: {top_p}, TopK: {top_k})")
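For one prior exchange plus a new question, the prompt-building loop above produces text of the following shape (worked example with invented messages, runnable on its own):

    history = [["What is ONNX?", "ONNX is an open format for ML models."]]
    prompt = "Does it support quantization?"

    full_prompt = ""
    for user_msg, assistant_msg in history:
        full_prompt += f"<|user|>\n{user_msg}<|end|>\n"
        if assistant_msg:
            full_prompt += f"<|assistant|>\n{assistant_msg}<|end|>\n"
    full_prompt += f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"

    print(full_prompt)
    # <|user|>
    # What is ONNX?<|end|>
    # <|assistant|>
    # ONNX is an open format for ML models.<|end|>
    # <|user|>
    # Does it support quantization?<|end|>
    # <|assistant|>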
@@ -127,11 +126,11 @@ def generate_response(prompt, history, max_length, temperature, top_p, top_k):

          start_time = time.time()
          generator = og.Generator(model, params)
-         response_text = ""
          model_status = "Generating..." # Update status indicator
          logging.info("Streaming response...")

          first_token_time = None
+         token_count = 0
          while not generator.is_done():
              generator.compute_logits()
              generator.generate_next_token()
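The params object consumed by og.Generator above is built in an unchanged part of app.py that this diff does not show. Purely for orientation, a typical setup in the same generation of the onnxruntime-genai API (the one with the compute_logits()/generate_next_token() loop used here) looks roughly like the sketch below; it reuses the surrounding function's variables and is an assumption, not code from this commit:

    # Sketch of a typical onnxruntime-genai parameter setup (assumed, not taken from this diff).
    input_tokens = tokenizer.encode(full_prompt)

    params = og.GeneratorParams(model)
    params.set_search_options(
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        do_sample=True,
    )
    params.input_ids = input_tokens  # older onnxruntime-genai releases attach the prompt here
    generator = og.Generator(model, params)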
@@ -144,45 +143,83 @@ def generate_response(prompt, history, max_length, temperature, top_p, top_k):
                  break

              decoded_chunk = tokenizer.decode([next_token])
+             token_count += 1

              # Handle potential decoding issues or special tokens if necessary
-             # (e.g., some models might output "<|end|>" which you might want to strip)
              if decoded_chunk == "<|end|>": # Example: Stop if assistant outputs end token explicitly
                  logging.info("Assistant explicitly generated <|end|> token.")
                  break
+             if decoded_chunk == tokenizer.eos_token: # Check against tokenizer's eos_token string
+                 logging.info("Assistant generated EOS token string.")
+                 break

-             response_text += decoded_chunk
-             yield response_text # Yield intermediate results for streaming effect
+
+             yield decoded_chunk # Yield just the text chunk

          end_time = time.time()
          ttft = (first_token_time - start_time) * 1000 if first_token_time else -1
          total_time = end_time - start_time
-         token_count = len(tokenizer.decode(generator.get_output_sequences()[0])) # Approx token count
          tps = (token_count / total_time) if total_time > 0 else 0

-         logging.info(f"Generation complete. Tokens: ~{token_count}, Total Time: {total_time:.2f}s, TTFT: {ttft:.2f}ms, TPS: {tps:.2f}")
+         logging.info(f"Generation complete. Tokens: {token_count}, Total Time: {total_time:.2f}s, TTFT: {ttft:.2f}ms, TPS: {tps:.2f}")
          model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})" # Reset status

-         # Final yield with the complete text
-         yield response_text.strip()
-
      except Exception as e:
          logging.error(f"Error during generation: {e}", exc_info=True)
          model_status = f"Error during generation: {e}"
-         yield f"Sorry, an error occurred during generation: {e}"
+         yield f"\n\nSorry, an error occurred during generation: {e}" # Yield error message
+
+ # --- Gradio Interface Functions ---
+
+ # 1. Function to add user message to chat history
+ def add_user_message(user_message, history):
+     """Adds the user's message to the chat history for display."""
+     if not user_message:
+         raise gr.Error("Please enter a message.")
+     history = history + [[user_message, None]] # Append user message, leave bot response None
+     return "", history # Clear input textbox, return updated history
+
+ # 2. Function to handle bot response generation and streaming
+ def generate_bot_response(history, max_length, temperature, top_p, top_k):
+     """Generates the bot's response based on the history and streams it."""
+     if not history or history[-1][1] is not None:
+         # This shouldn't happen in the normal flow, but good practice
+         return history
+
+     user_prompt = history[-1][0] # Get the latest user prompt
+     # Prepare history for the model (all turns *before* the current one)
+     model_history = history[:-1]
+
+     # Get the generator stream
+     response_stream = generate_response_stream(
+         user_prompt, model_history, max_length, temperature, top_p, top_k
+     )

- # --- Clear Chat Function ---
+     # Stream the response chunks back to Gradio
+     history[-1][1] = "" # Initialize the bot response string
+     for chunk in response_stream:
+         history[-1][1] += chunk # Append the chunk to the bot's message in history
+         yield history # Yield the *entire updated history* back to Chatbot
+
+ # 3. Function to clear chat
  def clear_chat():
-     return None, None # Clears Textbox and Chatbot
+     """Clears the chat history and input."""
+     global model_status # Keep model status indicator updated
+     # Reset status only if it was showing an error from generation maybe?
+     # Or just always reset to Ready if model is loaded.
+     if model and tokenizer:
+         model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})"
+     # Keep the original error if init failed
+     return None, [], model_status # Clear Textbox, Chatbot history, and update status display
+

  # --- Initialize Model on App Start ---
- # Wrap in try-except to allow Gradio UI to potentially load even if model fails
  try:
      initialize_model()
  except Exception as e:
      print(f"FATAL: Model initialization failed: {e}")
-     model_status = f"FATAL ERROR during init: {e}"
-     # The UI will still load, but generation will fail. The status will show the error.
+     # model_status is already set inside initialize_model on error
+

  # --- Gradio Interface ---
  logging.info("Creating Gradio Interface...")
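The key behavioural change in this hunk is that generate_response_stream now yields bare text chunks, while the new generate_bot_response wrapper re-yields the whole history list after appending each chunk, which is the shape a streaming gr.Chatbot callback expects. A stripped-down, self-contained sketch of that contract (stub chunk source, and a plain ValueError standing in for gr.Error):

    # Minimal sketch of the history contract used above (stubbed, not the app's actual wiring).
    def add_user_message(user_message, history):
        if not user_message:
            raise ValueError("Please enter a message.")   # the app raises gr.Error here
        return "", history + [[user_message, None]]       # clear textbox, add pending turn

    def stream_bot(history, chunks):
        history[-1][1] = ""                                # fill in the pending bot slot
        for chunk in chunks:                               # chunks would come from generate_response_stream
            history[-1][1] += chunk
            yield history                                  # Chatbot redraws on each snapshot

    _, hist = add_user_message("Hello", [])
    for snapshot in stream_bot(hist, ["Hi", " there", "!"]):
        print(snapshot)
    # [['Hello', 'Hi']]
    # [['Hello', 'Hi there']]
    # [['Hello', 'Hi there!']]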
@@ -193,10 +230,6 @@ theme = gr.themes.Soft(
      secondary_hue="sky",
      neutral_hue="slate",
      font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
- ).set(
-     # Customize specific component styles if needed
-     # button_primary_background_fill="*primary_500",
-     # button_primary_background_fill_hover="*primary_400",
  )

  with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
@@ -206,10 +239,11 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
              gr.Markdown(f"""
              # Phi-4 Mini Instruct ONNX Chat 🤖
              Interact with the quantized `{model_variant_name}` version of [`{MODEL_REPO}`]({HF_MODEL_URL})
-             running efficiently via [`onnxruntime-genai`]({ORT_GENAI_URL}).
+             running efficiently via [`onnxruntime-genai`]({ORT_GENAI_URL}) ({EXECUTION_PROVIDER.upper()}).
              """)
          with gr.Column(scale=1, min_width=150):
              gr.Image(HF_LOGO_URL, elem_id="hf-logo", show_label=False, show_download_button=False, container=False, height=50)
+     # Use the global model_status variable for the initial value
      model_status_text = gr.Textbox(value=model_status, label="Model Status", interactive=False, max_lines=2)


@@ -222,7 +256,7 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
                  height=600,
                  layout="bubble",
                  bubble_full_width=False,
-                 avatar_images=(None, "https://microsoft.github.io/phi/assets/img/logo-final.png") # (user, bot) - Optional: Add user avatar path/URL if desired
+                 avatar_images=(None, PHI_LOGO_URL) # (user, bot)
              )
              with gr.Row():
                  prompt_input = gr.Textbox(
@@ -231,8 +265,10 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
                      lines=4,
                      scale=9 # Make textbox wider
                  )
-                 submit_button = gr.Button("Send", variant="primary", scale=1, min_width=120) # Primary send button
-                 clear_button = gr.Button("🗑️ Clear", variant="secondary", scale=1, min_width=120) # Secondary clear button
+                 # Combine Send and Clear Buttons Vertically? Or keep side-by-side? Side-by-side looks better
+                 with gr.Column(scale=1, min_width=120):
+                     submit_button = gr.Button("Send", variant="primary", size="lg")
+                     clear_button = gr.Button("🗑️ Clear Chat", variant="secondary")


          # Settings Column
@@ -246,37 +282,51 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:

              gr.Markdown("---") # Separator
              gr.Markdown("ℹ️ **Note:** Uses Phi-4 instruction format: \n`<|user|>\nPROMPT<|end|>\n<|assistant|>`")
+             gr.Markdown(f"Running on **{EXECUTION_PROVIDER.upper()}**.")


      # Event Listeners (Connecting UI components to functions)

-     # Define reusable inputs list for generation
-     gen_inputs = [prompt_input, chatbot, max_length, temperature, top_p, top_k]
-
-     # Submit action (using streaming yields from generate_response)
-     submit_button.click(
-         fn=generate_response,
-         inputs=gen_inputs,
-         outputs=[chatbot], # Output directly streams to chatbot
-         queue=True # Enable queuing
+     # Define inputs for the bot response generator
+     bot_response_inputs = [chatbot, max_length, temperature, top_p, top_k]
+
+     # Chain actions:
+     # 1. User presses Enter or clicks Send
+     # 2. `add_user_message` updates history, clears input
+     # 3. `generate_bot_response` streams bot reply into history
+     submit_event = prompt_input.submit(
+         fn=add_user_message,
+         inputs=[prompt_input, chatbot],
+         outputs=[prompt_input, chatbot], # Update textbox and history
+         queue=False, # Submit is fast
+     ).then(
+         fn=generate_bot_response, # Call the generator function
+         inputs=bot_response_inputs, # Pass history and params
+         outputs=[chatbot], # Stream output directly to chatbot
+         api_name="chat" # Optional: name for API usage
      )
-     # Allow submitting via Enter key in the textbox as well
-     prompt_input.submit(
-         fn=generate_response,
-         inputs=gen_inputs,
+
+     submit_button.click( # Mirror actions for button click
+         fn=add_user_message,
+         inputs=[prompt_input, chatbot],
+         outputs=[prompt_input, chatbot],
+         queue=False,
+     ).then(
+         fn=generate_bot_response,
+         inputs=bot_response_inputs,
          outputs=[chatbot],
-         queue=True
+         api_name=False # Don't expose button click as separate API endpoint
      )

      # Clear button action
      clear_button.click(
          fn=clear_chat,
          inputs=None,
-         outputs=[prompt_input, chatbot], # Clear both input and chat history
-         queue=False # No need to queue clearing
+         outputs=[prompt_input, chatbot, model_status_text], # Clear input, chat, and update status text
+         queue=False # Clearing is fast
      )

  # Launch the Gradio app
  logging.info("Launching Gradio App...")
- demo.queue() # Enable queuing for handling concurrent users/requests
- demo.launch(show_error=True, max_threads=40) # show_error=True helps debug in Spaces
+ demo.queue(max_size=20) # Enable queuing with a limit
+ demo.launch(show_error=True, max_threads=40)