Update app.py
app.py
CHANGED
@@ -5,6 +5,7 @@ import os
 from huggingface_hub import snapshot_download
 import argparse
 import logging
+import numpy as np # Import numpy
 
 # --- Logging Setup ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -67,7 +68,6 @@ def initialize_model():
     model_status = f"Loading model ({EXECUTION_PROVIDER.upper()})..."
     logging.info(model_status)
     try:
-        # FIX: Removed explicit DeviceType. Let the library infer or use string if needed by constructor.
         # The simple constructor often works by detecting the installed ORT package.
         logging.info(f"Using provider based on installed package (expecting: {EXECUTION_PROVIDER})")
         model = og.Model(model_path) # Simplified model loading
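For orientation, the simplified loading path this hunk settles on needs nothing beyond the onnxruntime-genai constructors; the execution provider follows from whichever onnxruntime-genai package (CPU, CUDA, DirectML) is installed. A minimal sketch under that assumption, reusing the og and model_path names from the diff; the og.Tokenizer(model) call is an assumption based on the tokenizer global used later in app.py, not something this hunk shows:

import onnxruntime_genai as og

def load_phi4_onnx(model_path: str):
    # og.Model infers the execution provider from the installed package
    model = og.Model(model_path)
    # assumed companion constructor for the tokenizer used elsewhere in app.py
    tokenizer = og.Tokenizer(model)
    return model, tokenizer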
@@ -107,10 +107,13 @@ def generate_response_stream(prompt, history, max_length, temperature, top_p, top_k):
     logging.info(f"Generating response (MaxL: {max_length}, Temp: {temperature}, TopP: {top_p}, TopK: {top_k})")
 
     try:
-
+        input_tokens_list = tokenizer.encode(full_prompt) # Encode returns a list/array
+        # Ensure input_tokens is a numpy array of the correct type (int32 is common)
+        input_tokens = np.array(input_tokens_list, dtype=np.int32)
+        # Reshape to (batch_size, sequence_length), which is (1, N) for single prompt
+        input_tokens = input_tokens.reshape((1, -1))
+
 
-        # FIX: Removed eos_token_id and pad_token_id as they are not attributes
-        # of onnxruntime_genai.Tokenizer and likely handled internally by the generator.
         search_options = {
             "max_length": max_length,
             "temperature": temperature,
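To make the added reshape concrete: tokenizer.encode() returns a flat sequence of N token ids, while the model expects a (batch_size, sequence_length) array, so a single prompt becomes shape (1, N). A tiny illustration with plain numpy; the token ids below are made up:

import numpy as np

token_ids = [15, 8017, 291, 42]             # hypothetical output of tokenizer.encode(...)
arr = np.array(token_ids, dtype=np.int32)   # shape (4,)
batched = arr.reshape((1, -1))              # shape (1, 4): a batch holding one sequence
print(arr.shape, batched.shape)             # -> (4,) (1, 4)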
@@ -121,8 +124,13 @@ def generate_response_stream(prompt, history, max_length, temperature, top_p, top_k):
 
         params = og.GeneratorParams(model)
         params.set_search_options(**search_options)
-
-
+
+        # FIX: Create a dictionary mapping input names to tensors (numpy arrays)
+        # and pass this dictionary to set_inputs.
+        # Assuming the standard input name "input_ids".
+        inputs = {"input_ids": input_tokens}
+        logging.info(f"Setting inputs with keys: {inputs.keys()} and shape for 'input_ids': {inputs['input_ids'].shape}")
+        params.set_inputs(inputs)
 
         start_time = time.time()
         # Create generator AFTER setting parameters including inputs
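Taken together with the previous hunk, the setup sequence the patch arrives at is: encode the prompt, batch it as int32, configure the search options, then hand the named input tensor to the generator parameters. A condensed sketch of that sequence, assuming "input_ids" is the correct input name (the added comment itself flags this as an assumption) and passing model and tokenizer in explicitly instead of using the app's globals:

import numpy as np
import onnxruntime_genai as og

def build_generator_params(model, tokenizer, full_prompt,
                           max_length, temperature, top_p, top_k):
    # encode and batch the prompt as a (1, N) int32 array
    tokens = np.array(tokenizer.encode(full_prompt), dtype=np.int32).reshape((1, -1))

    params = og.GeneratorParams(model)
    params.set_search_options(max_length=max_length, temperature=temperature,
                              top_p=top_p, top_k=top_k)
    # named input tensors go in as a dict, per the fix in this commit
    params.set_inputs({"input_ids": tokens})
    return params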
@@ -134,22 +142,27 @@ def generate_response_stream(prompt, history, max_length, temperature, top_p, top_k):
         token_count = 0
         # Rely primarily on generator.is_done()
         while not generator.is_done():
-
-
-
-
+            try:
+                generator.compute_logits()
+                generator.generate_next_token()
+                if first_token_time is None:
+                    first_token_time = time.time() # Record time to first token
 
-
+                next_token = generator.get_next_tokens()[0]
 
-
-
+                decoded_chunk = tokenizer.decode([next_token])
+                token_count += 1
 
-
-
-
-
+                # Secondary check: Stop if the model explicitly generates the <|end|> string literal.
+                if decoded_chunk == "<|end|>":
+                    logging.info("Assistant explicitly generated <|end|> token string.")
+                    break
 
-
+                yield decoded_chunk # Yield just the text chunk
+            except Exception as loop_error:
+                logging.error(f"Error inside generation loop: {loop_error}", exc_info=True)
+                yield f"\n\nError during token generation: {loop_error}"
+                break # Exit loop on error
 
         end_time = time.time()
         ttft = (first_token_time - start_time) * 1000 if first_token_time else -1
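The rewritten loop is the core streaming pattern: advance the generator by one token, decode that token on its own, and yield the text fragment immediately so the UI can render it. A condensed sketch of just that pattern; the og.Generator(model, params) constructor is implied by the "Create generator AFTER setting parameters" context line but not visible in this hunk, so treat it as an assumption:

import onnxruntime_genai as og

def stream_tokens(model, tokenizer, params):
    generator = og.Generator(model, params)   # assumed constructor from the elided line
    while not generator.is_done():
        generator.compute_logits()            # run the model for the next position
        generator.generate_next_token()       # sample using the configured search options
        next_token = generator.get_next_tokens()[0]
        chunk = tokenizer.decode([next_token])
        if chunk == "<|end|>":                # stop if the end marker leaks out as text
            break
        yield chunk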
@@ -159,6 +172,12 @@ def generate_response_stream(prompt, history, max_length, temperature, top_p, top_k):
         logging.info(f"Generation complete. Tokens: {token_count}, Total Time: {total_time:.2f}s, TTFT: {ttft:.2f}ms, TPS: {tps:.2f}")
         model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})" # Reset status
 
+    except TypeError as te:
+        # Catch type errors specifically during setup if the input format is still wrong
+        logging.error(f"TypeError during generation setup: {te}", exc_info=True)
+        logging.error("Check if the input format {'input_ids': token_array} is correct.")
+        model_status = f"Generation Setup TypeError: {te}"
+        yield f"\n\nSorry, a TypeError occurred setting up generation: {te}"
     except AttributeError as ae:
         # Catch potential future API changes or issues during generation setup
         logging.error(f"AttributeError during generation setup: {ae}", exc_info=True)
@@ -176,11 +195,7 @@ def add_user_message(user_message, history):
 def add_user_message(user_message, history):
     """Adds the user's message to the chat history for display."""
     if not user_message:
-        # Returning original history prevents adding empty message
-        # Use gr.Warning or gr.Info for user feedback? Or raise gr.Error?
-        # gr.Warning("Please enter a message.") # Shows warning toast
         return "", history # Clear input, return unchanged history
-    # raise gr.Error("Please enter a message.") # Stops execution, shows error
     history = history + [[user_message, None]] # Append user message, leave bot response None
     return "", history # Clear input textbox, return updated history
 
@@ -188,20 +203,15 @@ def add_user_message(user_message, history):
 def generate_bot_response(history, max_length, temperature, top_p, top_k):
     """Generates the bot's response based on the history and streams it."""
     if not history or history[-1][1] is not None:
-        # This case means user submitted empty message or something went wrong
-        # No need to generate if the last turn isn't user's pending turn
        return history
 
     user_prompt = history[-1][0] # Get the latest user prompt
-    # Prepare history for the model
-    model_history = history[:-1]
+    model_history = history[:-1] # Prepare history for the model
 
-    # Get the generator stream
     response_stream = generate_response_stream(
         user_prompt, model_history, max_length, temperature, top_p, top_k
     )
 
-    # Stream the response chunks back to Gradio
     history[-1][1] = "" # Initialize the bot response string in the history
     for chunk in response_stream:
         history[-1][1] += chunk # Append the chunk to the bot's message in history
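generate_bot_response is written as a Gradio generator callback: each chunk is appended to the last bot turn, and the updated history is presumably yielded just past this hunk so the Chatbot re-renders as tokens arrive. A minimal, self-contained sketch of that streaming shape with a fake token source; every name here is hypothetical, not the app's code:

import time

def fake_stream(prompt):
    for word in ("this", "is", "a", "streamed", "reply"):
        time.sleep(0.1)
        yield word + " "

def respond(history):
    history[-1][1] = ""                       # start an empty bot turn
    for chunk in fake_stream(history[-1][0]):
        history[-1][1] += chunk
        yield history                         # each yield refreshes the Chatbot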
@@ -210,12 +220,9 @@ def generate_bot_response(history, max_length, temperature, top_p, top_k):
 # 3. Function to clear chat
 def clear_chat():
     """Clears the chat history and input."""
-    global model_status
-    # Reset status only if it was showing an error from generation maybe?
-    # Or just always reset to Ready if model is loaded.
+    global model_status
     if model and tokenizer and not model_status.startswith("Error") and not model_status.startswith("FATAL"):
         model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})"
-    # Keep the original error if init failed, otherwise show ready status
     return None, [], model_status # Clear Textbox, Chatbot history, and update status display
 
 
@@ -224,13 +231,11 @@ try:
     initialize_model()
 except Exception as e:
     print(f"FATAL: Model initialization failed: {e}")
-    # model_status is already set inside initialize_model on error
 
 
 # --- Gradio Interface ---
 logging.info("Creating Gradio Interface...")
 
-# Select a theme
 theme = gr.themes.Soft(
     primary_hue="blue",
     secondary_hue="sky",
@@ -249,11 +254,9 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
             """)
         with gr.Column(scale=1, min_width=150):
             gr.Image(HF_LOGO_URL, elem_id="hf-logo", show_label=False, show_download_button=False, container=False, height=50)
-            # Use the global model_status variable for the initial value
             model_status_text = gr.Textbox(value=model_status, label="Model Status", interactive=False, max_lines=2)
 
-
-    # Main Layout (Chat on Left, Settings on Right)
+    # Main Layout
     with gr.Row():
         # Chat Column
         with gr.Column(scale=3):
@@ -262,57 +265,47 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
                 height=600,
                 layout="bubble",
                 bubble_full_width=False,
-                avatar_images=(None, PHI_LOGO_URL)
+                avatar_images=(None, PHI_LOGO_URL)
             )
             with gr.Row():
                 prompt_input = gr.Textbox(
                     label="Your Message",
                     placeholder="<|user|>\nType your message here...\n<|end|>",
                     lines=4,
-                    scale=9
+                    scale=9
                 )
-                # Combine Send and Clear Buttons Vertically? Or keep side-by-side? Side-by-side looks better
                 with gr.Column(scale=1, min_width=120):
                     submit_button = gr.Button("Send", variant="primary", size="lg")
                     clear_button = gr.Button("🗑️ Clear Chat", variant="secondary")
 
-
         # Settings Column
         with gr.Column(scale=1, min_width=250):
             gr.Markdown("### ⚙️ Generation Settings")
-            with gr.Group():
+            with gr.Group():
                 max_length = gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max Length", info="Max tokens in response.")
                 temperature = gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature", info="0.0 = deterministic\n>1.0 = more random")
                 top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-P", info="Nucleus sampling probability.")
                 top_k = gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top-K", info="Limit to K most likely tokens (0=disable).")
-
-            gr.Markdown("---") # Separator
+            gr.Markdown("---")
             gr.Markdown("ℹ️ **Note:** Uses Phi-4 instruction format: \n`<|user|>\nPROMPT<|end|>\n<|assistant|>`")
             gr.Markdown(f"Running on **{EXECUTION_PROVIDER.upper()}**.")
 
-
-    # Event Listeners (Connecting UI components to functions)
-
-    # Define inputs for the bot response generator
+    # Event Listeners
    bot_response_inputs = [chatbot, max_length, temperature, top_p, top_k]
 
-    # Chain actions:
-    # 1. User presses Enter or clicks Send
-    # 2. `add_user_message` updates history, clears input
-    # 3. `generate_bot_response` streams bot reply into history
     submit_event = prompt_input.submit(
         fn=add_user_message,
         inputs=[prompt_input, chatbot],
-        outputs=[prompt_input, chatbot],
-        queue=False,
+        outputs=[prompt_input, chatbot],
+        queue=False,
     ).then(
-        fn=generate_bot_response,
-        inputs=bot_response_inputs,
-        outputs=[chatbot],
-        api_name="chat"
+        fn=generate_bot_response,
+        inputs=bot_response_inputs,
+        outputs=[chatbot],
+        api_name="chat"
     )
 
-    submit_button.click(
+    submit_button.click(
         fn=add_user_message,
         inputs=[prompt_input, chatbot],
         outputs=[prompt_input, chatbot],
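The event wiring follows the usual two-step chain: a quick, unqueued step moves the user message into the history and clears the textbox, then a queued step streams the model's reply into the same Chatbot. A stripped-down sketch of the same chaining with hypothetical component and callback names (launching and queuing are handled in the final hunk below):

import gradio as gr

def take_message(msg, history):
    return "", history + [[msg, None]]        # clear the box, append a pending turn

def answer(history):
    history[-1][1] = "echo: " + history[-1][0]
    return history

with gr.Blocks() as demo:
    chat = gr.Chatbot()
    box = gr.Textbox()
    box.submit(fn=take_message, inputs=[box, chat], outputs=[box, chat], queue=False) \
       .then(fn=answer, inputs=chat, outputs=chat)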
@@ -321,18 +314,17 @@ with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
         fn=generate_bot_response,
         inputs=bot_response_inputs,
         outputs=[chatbot],
-        api_name=False
+        api_name=False
     )
 
-    # Clear button action
    clear_button.click(
         fn=clear_chat,
         inputs=None,
-        outputs=[prompt_input, chatbot, model_status_text],
-        queue=False
+        outputs=[prompt_input, chatbot, model_status_text],
+        queue=False
     )
 
 # Launch the Gradio app
 logging.info("Launching Gradio App...")
-demo.queue(max_size=20)
+demo.queue(max_size=20)
 demo.launch(show_error=True, max_threads=40)
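The clear_button wiring also illustrates the output-mapping convention: clear_chat returns three values (None, [], model_status), and Gradio assigns them positionally to the three components listed in outputs. A tiny sketch of that convention with hypothetical components:

import gradio as gr

def reset():
    # the three returned values map positionally onto the outputs list below
    return None, [], "Ready"

with gr.Blocks() as demo:
    box = gr.Textbox()
    chat = gr.Chatbot()
    status = gr.Textbox(label="Status")
    gr.Button("Clear").click(fn=reset, inputs=None, outputs=[box, chat, status], queue=False)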