Athspi committed
Commit 21cab83 · verified · 1 Parent(s): cd6a828

Update app.py

Files changed (1)
  1. app.py +186 -306
app.py CHANGED
@@ -1,330 +1,210 @@
 
  import gradio as gr
- import onnxruntime_genai as og
- import time
- import os
- from huggingface_hub import snapshot_download
- import argparse
- import logging
- import numpy as np # Import numpy

- # --- Logging Setup ---
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

  # --- Configuration ---
- MODEL_REPO = "microsoft/Phi-4-mini-instruct-onnx"
-
- # --- Defaulting to CPU INT4 for Hugging Face Spaces ---
- EXECUTION_PROVIDER = "cpu" # Corresponds to installing 'onnxruntime-genai'
- MODEL_VARIANT_GLOB = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*"
- # --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
-
- # --- (Optional) Alternative GPU Configuration ---
- # EXECUTION_PROVIDER = "cuda" # Corresponds to installing 'onnxruntime-genai-cuda'
- # MODEL_VARIANT_GLOB = "gpu/gpu-int4-rtn-block-32/*"
- # --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
-
- LOCAL_MODEL_DIR = "./phi4-mini-onnx-model" # Directory within the Space
- HF_LOGO_URL = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
- HF_MODEL_URL = f"https://huggingface.co/{MODEL_REPO}"
- ORT_GENAI_URL = "https://github.com/microsoft/onnxruntime-genai"
- PHI_LOGO_URL = "https://microsoft.github.io/phi/assets/img/logo-final.png" # Phi logo for bot avatar
-
- # Global variables for model and tokenizer
- model = None
- tokenizer = None
- model_variant_name = os.path.basename(os.path.dirname(MODEL_VARIANT_GLOB)) # For display
- model_status = "Initializing..."
-
- # --- Model Download and Load ---
- def initialize_model():
-     """Downloads and loads the ONNX model and tokenizer."""
-     global model, tokenizer, model_status
-     logging.info("--- Initializing ONNX Runtime GenAI ---")
-     model_status = "Downloading model..."
-     logging.info(model_status)
-
-     # --- Download ---
-     model_variant_dir = os.path.join(LOCAL_MODEL_DIR, os.path.dirname(MODEL_VARIANT_GLOB))
-     if os.path.exists(model_variant_dir) and os.listdir(model_variant_dir):
-         logging.info(f"Model variant found in {model_variant_dir}. Skipping download.")
-         model_path = model_variant_dir
-     else:
-         logging.info(f"Downloading model variant '{MODEL_VARIANT_GLOB}' from {MODEL_REPO}...")
-         try:
-             snapshot_download(
-                 MODEL_REPO,
-                 allow_patterns=[MODEL_VARIANT_GLOB],
-                 local_dir=LOCAL_MODEL_DIR,
-                 local_dir_use_symlinks=False
-             )
-             model_path = model_variant_dir
-             logging.info(f"Model downloaded to: {model_path}")
-         except Exception as e:
-             logging.error(f"Error downloading model: {e}", exc_info=True)
-             model_status = f"Error downloading model: {e}"
-             raise RuntimeError(f"Failed to download model: {e}")
-
-     # --- Load ---
-     model_status = f"Loading model ({EXECUTION_PROVIDER.upper()})..."
-     logging.info(model_status)
      try:
-         # The simple constructor often works by detecting the installed ORT package.
-         logging.info(f"Using provider based on installed package (expecting: {EXECUTION_PROVIDER})")
-         model = og.Model(model_path) # Simplified model loading
-         tokenizer = og.Tokenizer(model)
-         model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})"
-         logging.info("Model and Tokenizer loaded successfully.")
-     except AttributeError as ae:
-         logging.error(f"AttributeError during model/tokenizer init: {ae}", exc_info=True)
-         logging.error("This might indicate an installation issue or version incompatibility with onnxruntime_genai.")
-         model_status = f"Init Error: {ae}"
-         raise RuntimeError(f"Failed to initialize model/tokenizer: {ae}")
      except Exception as e:
-         logging.error(f"Error loading model or tokenizer: {e}", exc_info=True)
-         model_status = f"Error loading model: {e}"
-         raise RuntimeError(f"Failed to load model: {e}")
-
- # --- Generation Function (Core Logic) ---
- def generate_response_stream(prompt, history, max_length, temperature, top_p, top_k):
-     """Generates a response using the Phi-4 ONNX model, yielding text chunks."""
-     global model_status
-     if not model or not tokenizer:
-         model_status = "Error: Model not initialized!"
-         yield "Error: Model not initialized. Please check logs."
-         return
-
-     # --- Prepare the prompt using the Phi-4 instruct format ---
-     full_prompt = ""
-     # History format is [[user1, bot1], [user2, bot2], ...]
-     for user_msg, assistant_msg in history: # history here is *before* the current prompt
-         full_prompt += f"<|user|>\n{user_msg}<|end|>\n"
-         if assistant_msg: # Append assistant message only if it exists
-             full_prompt += f"<|assistant|>\n{assistant_msg}<|end|>\n"
-
-     # Add the current user prompt and the trigger for the assistant's response
-     full_prompt += f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
-
-     logging.info(f"Generating response (MaxL: {max_length}, Temp: {temperature}, TopP: {top_p}, TopK: {top_k})")
-
-     try:
-         input_tokens_list = tokenizer.encode(full_prompt) # Encode returns a list/array
-         # Ensure input_tokens is a numpy array of the correct type (int32 is common)
-         input_tokens_np = np.array(input_tokens_list, dtype=np.int32)
-         # Reshape to (batch_size, sequence_length), which is (1, N) for single prompt
-         input_tokens_np = input_tokens_np.reshape((1, -1))
-         logging.info(f"Prepared input_tokens shape: {input_tokens_np.shape}, dtype: {input_tokens_np.dtype}")
-
-
-         search_options = {
-             "max_length": max_length,
-             "temperature": temperature,
-             "top_p": top_p,
-             "top_k": top_k,
-             "do_sample": True,
-         }
-
-         params = og.GeneratorParams(model)
-         params.set_search_options(**search_options)
-
-         # FIX: Reverting to direct assignment based on official examples,
-         # ensuring the numpy array is correctly shaped *before* assignment.
-         logging.info("Attempting direct assignment: params.input_ids = input_tokens_np")
-         params.input_ids = input_tokens_np # Use the reshaped numpy array
-
-         start_time = time.time()
-         # Create generator AFTER setting parameters including input_ids
-         generator = og.Generator(model, params)
-         model_status = "Generating..." # Update status indicator
-         logging.info("Streaming response...")
-
-         first_token_time = None
-         token_count = 0
-         # Rely primarily on generator.is_done()
-         while not generator.is_done():
-             try:
-                 generator.compute_logits()
-                 generator.generate_next_token()
-                 if first_token_time is None:
-                     first_token_time = time.time() # Record time to first token
-
-                 next_token = generator.get_next_tokens()[0]
-
-                 decoded_chunk = tokenizer.decode([next_token])
-                 token_count += 1
-
-                 # Secondary check: Stop if the model explicitly generates the <|end|> string literal.
-                 if decoded_chunk == "<|end|>":
-                     logging.info("Assistant explicitly generated <|end|> token string.")
-                     break
-
-                 yield decoded_chunk # Yield just the text chunk
-             except Exception as loop_error:
-                 logging.error(f"Error inside generation loop: {loop_error}", exc_info=True)
-                 yield f"\n\nError during token generation: {loop_error}"
-                 break # Exit loop on error
-
-         end_time = time.time()
-         ttft = (first_token_time - start_time) * 1000 if first_token_time else -1
-         total_time = end_time - start_time
-         tps = (token_count / total_time) if total_time > 0 else 0
-
-         logging.info(f"Generation complete. Tokens: {token_count}, Total Time: {total_time:.2f}s, TTFT: {ttft:.2f}ms, TPS: {tps:.2f}")
-         model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})" # Reset status
-
-     except AttributeError as ae:
-         # Catching this specifically again after trying direct assignment
-         logging.error(f"AttributeError during generation setup (using params.input_ids): {ae}", exc_info=True)
-         logging.error("This suggests the 'input_ids' attribute is not available in this version, despite examples.")
-         model_status = f"Generation Setup AttributeError: {ae}"
-         yield f"\n\nSorry, an AttributeError occurred setting up generation: {ae}"
-     except TypeError as te:
-         # Catch type errors specifically during setup if the input format is still wrong
-         logging.error(f"TypeError during generation setup: {te}", exc_info=True)
-         logging.error("Check input data types and shapes if this occurs.")
-         model_status = f"Generation Setup TypeError: {te}"
-         yield f"\n\nSorry, a TypeError occurred setting up generation: {te}"
-     except Exception as e:
-         logging.error(f"Error during generation: {e}", exc_info=True)
-         model_status = f"Error during generation: {e}"
-         yield f"\n\nSorry, an error occurred during generation: {e}" # Yield error message
-
-
- # --- Gradio Interface Functions ---
-
- # 1. Function to add user message to chat history
- def add_user_message(user_message, history):
-     """Adds the user's message to the chat history for display."""
-     if not user_message:
-         return "", history # Clear input, return unchanged history
-     history = history + [[user_message, None]] # Append user message, leave bot response None
-     return "", history # Clear input textbox, return updated history
-
- # 2. Function to handle bot response generation and streaming
- def generate_bot_response(history, max_length, temperature, top_p, top_k):
-     """Generates the bot's response based on the history and streams it."""
-     if not history or history[-1][1] is not None:
-         return history
-
-     user_prompt = history[-1][0] # Get the latest user prompt
-     model_history = history[:-1] # Prepare history for the model
-
-     response_stream = generate_response_stream(
-         user_prompt, model_history, max_length, temperature, top_p, top_k
      )

-     history[-1][1] = "" # Initialize the bot response string in the history
-     for chunk in response_stream:
-         history[-1][1] += chunk # Append the chunk to the bot's message in history
-         yield history # Yield the *entire updated history* back to Chatbot

- # 3. Function to clear chat
- def clear_chat():
-     """Clears the chat history and input."""
-     global model_status
-     if model and tokenizer and not model_status.startswith("Error") and not model_status.startswith("FATAL"):
-         model_status = f"Model Ready ({EXECUTION_PROVIDER.upper()} / {model_variant_name})"
-     return None, [], model_status # Clear Textbox, Chatbot history, and update status display


- # --- Initialize Model on App Start ---
- try:
-     initialize_model()
- except Exception as e:
-     print(f"FATAL: Model initialization failed: {e}")


  # --- Gradio Interface ---
- logging.info("Creating Gradio Interface...")
-
- theme = gr.themes.Soft(
-     primary_hue="blue",
-     secondary_hue="sky",
-     neutral_hue="slate",
-     font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
- )
-
- with gr.Blocks(theme=theme, title="Phi-4 Mini ONNX Chat") as demo:
-     # Header Section
-     with gr.Row(equal_height=False):
-         with gr.Column(scale=3):
-             gr.Markdown(f"""
-             # Phi-4 Mini Instruct ONNX Chat 🤖
-             Interact with the quantized `{model_variant_name}` version of [`{MODEL_REPO}`]({HF_MODEL_URL})
-             running efficiently via [`onnxruntime-genai`]({ORT_GENAI_URL}) ({EXECUTION_PROVIDER.upper()}).
-             """)
-         with gr.Column(scale=1, min_width=150):
-             gr.Image(HF_LOGO_URL, elem_id="hf-logo", show_label=False, show_download_button=False, container=False, height=50)
-             model_status_text = gr.Textbox(value=model_status, label="Model Status", interactive=False, max_lines=2)

-     # Main Layout
      with gr.Row():
-         # Chat Column
-         with gr.Column(scale=3):
-             chatbot = gr.Chatbot(
-                 label="Conversation",
-                 height=600,
-                 layout="bubble",
-                 bubble_full_width=False,
-                 avatar_images=(None, PHI_LOGO_URL)
              )
-             with gr.Row():
-                 prompt_input = gr.Textbox(
-                     label="Your Message",
-                     placeholder="<|user|>\nType your message here...\n<|end|>",
-                     lines=4,
-                     scale=9
-                 )
-                 with gr.Column(scale=1, min_width=120):
-                     submit_button = gr.Button("Send", variant="primary", size="lg")
-                     clear_button = gr.Button("🗑️ Clear Chat", variant="secondary")
-
-         # Settings Column
-         with gr.Column(scale=1, min_width=250):
-             gr.Markdown("### ⚙️ Generation Settings")
-             with gr.Group():
-                 max_length = gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max Length", info="Max tokens in response.")
-                 temperature = gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.05, label="Temperature", info="0.0 = deterministic\n>1.0 = more random")
-                 top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-P", info="Nucleus sampling probability.")
-                 top_k = gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top-K", info="Limit to K most likely tokens (0=disable).")
-             gr.Markdown("---")
-             gr.Markdown("ℹ️ **Note:** Uses Phi-4 instruction format: \n`<|user|>\nPROMPT<|end|>\n<|assistant|>`")
-             gr.Markdown(f"Running on **{EXECUTION_PROVIDER.upper()}**.")
-
-     # Event Listeners
-     bot_response_inputs = [chatbot, max_length, temperature, top_p, top_k]

-     submit_event = prompt_input.submit(
-         fn=add_user_message,
-         inputs=[prompt_input, chatbot],
-         outputs=[prompt_input, chatbot],
-         queue=False,
-     ).then(
-         fn=generate_bot_response,
-         inputs=bot_response_inputs,
-         outputs=[chatbot],
-         api_name="chat"
-     )

      submit_button.click(
-         fn=add_user_message,
-         inputs=[prompt_input, chatbot],
-         outputs=[prompt_input, chatbot],
-         queue=False,
-     ).then(
-         fn=generate_bot_response,
-         inputs=bot_response_inputs,
-         outputs=[chatbot],
-         api_name=False
      )

-     clear_button.click(
-         fn=clear_chat,
-         inputs=None,
-         outputs=[prompt_input, chatbot, model_status_text],
-         queue=False
      )

- # Launch the Gradio app
- logging.info("Launching Gradio App...")
- demo.queue(max_size=20)
- demo.launch(show_error=True, max_threads=40)

+ import torch
  import gradio as gr
+ from unsloth import FastLanguageModel
+ from transformers import TextStreamer, GenerationConfig
+ import warnings

+ # Suppress specific warnings if needed (optional)
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*padding_mask.*")

  # --- Configuration ---
+ MODEL_NAME = "unsloth/gemma-3-1b-it"
+ MAX_SEQ_LENGTH = 4096 # Choose based on model's capabilities and your VRAM
+ DTYPE = None # None for auto detection, or torch.float16, torch.bfloat16
+ LOAD_IN_4BIT = True # Use 4-bit quantization for lower memory usage
+
+ # --- Load Model and Tokenizer ---
+ print(f"Loading model: {MODEL_NAME}")
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name=MODEL_NAME,
+     max_seq_length=MAX_SEQ_LENGTH,
+     dtype=DTYPE,
+     load_in_4bit=LOAD_IN_4BIT,
+     # token = "hf_...", # Add your Hugging Face token if needed (for gated models)
+ )
+ print("Model and tokenizer loaded successfully.")
+
+ # Optimize for inference
+ FastLanguageModel.for_inference(model)
+ print("Model optimized for inference.")
+
+ # --- Generation Function ---
+ def generate_response(
+     prompt,
+     max_new_tokens=512,
+     temperature=0.7,
+     top_p=0.9,
+     do_sample=True,
+     system_prompt=None # Optional system prompt
+ ):
+     """
+     Generates a response from the model given a prompt and parameters.
+     """
+     messages = []
+     if system_prompt and system_prompt.strip():
+         messages.append({"role": "system", "content": system_prompt})
+     messages.append({"role": "user", "content": prompt})
+
+     # Apply chat template
      try:
+         inputs = tokenizer.apply_chat_template(
+             messages,
+             tokenize=True,
+             add_generation_prompt=True, # Ensures the '<start_of_turn>model' token is added
+             return_tensors="pt",
+         ).to(model.device) # Ensure inputs are on the same device as the model
      except Exception as e:
+         print(f"Error applying chat template: {e}")
+         # Fallback or simple concatenation if template fails (less ideal)
+         formatted_prompt = f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
+         inputs = tokenizer(formatted_prompt, return_tensors="pt").input_ids.to(model.device) # Keep a plain tensor so inputs.shape and generate() below still work
+
+
+     # --- Use a TextStreamer for Gradio ---
+     # While streaming works well in terminals, Gradio updates per yield.
+     # For a simpler Gradio experience, we'll generate the full response at once.
+     # If you want streaming in Gradio, it requires more complex handling
+     # with gr.Textbox(interactive=False) and yielding chunks.
+
+     # --- Generate Full Response ---
+     generation_config = GenerationConfig(
+         max_new_tokens=max_new_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         do_sample=do_sample,
+         pad_token_id=tokenizer.eos_token_id, # Use EOS token for padding
+         eos_token_id=tokenizer.eos_token_id,
      )

+     print("\nGenerating response...")
+     print(f" Prompt length: {inputs.shape[1]} tokens")
+     print(f" Max new tokens: {max_new_tokens}")
+     print(f" Temperature: {temperature}")
+     print(f" Top-P: {top_p}")
+     print(f" Do Sample: {do_sample}")

+     with torch.inference_mode(): # Ensure no gradients are computed
+         outputs = model.generate(
+             input_ids=inputs,
+             attention_mask=torch.ones_like(inputs), # Provide attention mask
+             generation_config=generation_config,
+         )

+     # Decode the generated tokens, skipping the prompt part
+     # outputs[0] contains the full sequence (prompt + response)
+     input_length = inputs.shape[1]
+     response_tokens = outputs[0][input_length:]
+     response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)
+     print(f" Response length: {len(response_tokens)} tokens")
+     print("Generation complete.")

+     return response_text.strip()


  # --- Gradio Interface ---
+ print("Creating Gradio interface...")
+
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown(
+         f"""
+         # Unsloth Gemma 3 1B-IT Chat Interface
+         Interact with the `{MODEL_NAME}` model optimized with Unsloth.
+         Enter your prompt below and adjust the generation parameters.
+         *Note: Running on {'GPU' if torch.cuda.is_available() else 'CPU'}. 4-bit quantization is {'enabled' if LOAD_IN_4BIT else 'disabled'}*.
+         """
+     )

      with gr.Row():
+         with gr.Column(scale=2):
+             prompt_input = gr.Textbox(
+                 label="Your Prompt",
+                 placeholder="Ask me anything...",
+                 lines=4,
+                 show_copy_button=True,
+             )
+             system_prompt_input = gr.Textbox(
+                 label="System Prompt (Optional)",
+                 placeholder="Example: You are a helpful assistant.",
+                 lines=2
+             )
+             submit_button = gr.Button("Generate Response", variant="primary")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### Generation Parameters")
+             max_new_tokens_slider = gr.Slider(
+                 minimum=32,
+                 maximum=2048, # Adjust max based on VRAM and needs
+                 value=512,
+                 step=32,
+                 label="Max New Tokens",
+                 info="Maximum number of tokens to generate."
+             )
+             temperature_slider = gr.Slider(
+                 minimum=0.1,
+                 maximum=1.5,
+                 value=0.6, # Adjusted default temperature slightly lower
+                 step=0.05,
+                 label="Temperature",
+                 info="Controls randomness. Lower values are more deterministic."
+             )
+             top_p_slider = gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.9,
+                 step=0.05,
+                 label="Top-P (Nucleus Sampling)",
+                 info="Considers only the most probable tokens with cumulative probability P."
+             )
+             do_sample_checkbox = gr.Checkbox(
+                 value=True,
+                 label="Use Sampling",
+                 info="If unchecked, uses greedy decoding (picks the most likely token)."
              )

+     output_textbox = gr.Markdown(label="Model Response", value="*Response will appear here...*")

+     # --- Connect Components ---
      submit_button.click(
+         fn=generate_response,
+         inputs=[
+             prompt_input,
+             max_new_tokens_slider,
+             temperature_slider,
+             top_p_slider,
+             do_sample_checkbox,
+             system_prompt_input,
+         ],
+         outputs=output_textbox,
+         api_name="generate" # Allows API access if needed
      )

+     # --- Examples ---
+     gr.Examples(
+         examples=[
+             ["Explain the concept of Large Language Models (LLMs) in simple terms.", 512, 0.7, 0.9, True, ""],
+             ["Write a short story about a robot exploring a futuristic city.", 768, 0.8, 0.95, True, ""],
+             ["Provide 5 ideas for a healthy breakfast.", 256, 0.6, 0.9, True, ""],
+             ["Translate 'Hello, how are you?' to French.", 64, 0.5, 0.9, False, ""],
+             ["What is the capital of Australia?", 64, 0.3, 0.9, False, "You are a factual answering bot."]
+         ],
+         inputs=[
+             prompt_input,
+             max_new_tokens_slider,
+             temperature_slider,
+             top_p_slider,
+             do_sample_checkbox,
+             system_prompt_input,
+         ],
+         outputs=output_textbox, # Output examples to the main output area
+         fn=generate_response, # Make examples clickable
+         cache_examples=False, # Recalculate examples on click if needed, or True to cache
      )

+ # --- Launch the Interface ---
+ if __name__ == "__main__":
+     print("Launching Gradio interface...")
+     # share=True creates a public link (use with caution)
+     demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
+     # Use server_name="0.0.0.0" to make it accessible on your local network
+     # Use server_port=7860 (or another) to specify the port
+     print("Gradio interface launched. Access it at http://<your-ip-address>:7860 (or http://127.0.0.1:7860 locally)")