Spaces:

Avinash109
/

qwen2.5

Sleeping

App Files Files Community

Avinash109 committed on Nov 12, 2024

Commit

088f906

verified ·

1 Parent(s): 4897d60

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -128

app.py CHANGED Viewed

@@ -1,165 +1,166 @@
 import streamlit as st
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 import datetime
-# Set Streamlit page configuration
 st.set_page_config(
     page_title="Qwen2.5-Coder Chat",
     page_icon="💬",
-    layout="wide",
 )
-# Title of the app
-st.title("💬 Qwen2.5-Coder Chat Interface")
-# Initialize session state for messages (store conversation history)
-st.session_state.setdefault('messages', [])
-# Load the model and tokenizer
 @st.cache_resource
-def load_model():
-    model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"  # Replace with the correct model path
-    # Define BitsAndBytesConfig for 8-bit quantization
-    quantization_config = BitsAndBytesConfig(
-        load_in_8bit=True,                        # Enable 8-bit loading
-        llm_int8_enable_fp32_cpu_offload=True     # Optional: Enables offloading to CPU
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        quantization_config=quantization_config,
         torch_dtype=torch.float16,
-        device_map="auto"
     )
     return tokenizer, model
-# Load tokenizer and model with error handling
-try:
-    with st.spinner("Loading model... This may take a while..."):
-        tokenizer, model = load_model()
-except Exception as e:
-    st.error(f"Error loading model: {e}")
-    st.stop()
-# Function to generate model response
-def generate_response(messages, tokenizer, model, max_tokens=150, temperature=0.7, top_p=0.9):
-    """
-    Generates a response from the model based on the conversation history.
-    Args:
-        messages (list): List of message dictionaries containing 'role' and 'content'.
-        tokenizer: The tokenizer instance.
-        model: The language model instance.
-        max_tokens (int): Maximum number of tokens for the response.
-        temperature (float): Sampling temperature.
-        top_p (float): Nucleus sampling probability.
-    Returns:
-        str: The generated response text.
-    """
-    # Concatenate all previous messages
-    conversation = ""
-    for message in messages:
-        role = "You" if message['role'] == 'user' else "Qwen2.5-Coder"
-        conversation += f"**{role}:** {message['content']}\n"
-    # Append the latest user input
-    conversation += f"**You:** {messages[-1]['content']}\n**Qwen2.5-Coder:**"
-    # Tokenize the conversation
-    inputs = tokenizer.encode(conversation, return_tensors="pt").to(model.device)
-    # Generate a response
-    with torch.no_grad():
-        outputs = model.generate(
-            inputs,
-            max_length=inputs.shape[1] + max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            do_sample=True,
-            num_return_sequences=1,
-            pad_token_id=tokenizer.eos_token_id
-        )
-    # Decode the response
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Extract the generated response after the conversation
-    generated_response = response.split("Qwen2.5-Coder:")[-1].strip()
-    return generated_response
-# Layout: Two columns for the main chat and sidebar
-chat_col, sidebar_col = st.columns([4, 1])
-with chat_col:
-    st.markdown("### Chat")
-    chat_container = st.container()
-    with chat_container:
-        for message in st.session_state['messages']:
-            time = message.get('timestamp', '')
-            if message['role'] == 'user':
-                st.markdown(f"**You:** {message['content']} _({time})_")
-            else:
-                st.markdown(f"**Qwen2.5-Coder:** {message['content']} _({time})_")
-    # Input area for user message
-    with st.form(key='chat_form', clear_on_submit=True):
-        user_input = st.text_area("You:", height=100)
-        submit_button = st.form_submit_button(label='Send')
-    if submit_button and user_input.strip():
-        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        # Append the user's message to the chat history
-        st.session_state['messages'].append({'role': 'user', 'content': user_input, 'timestamp': timestamp})
-        # Generate and append the model's response
-        try:
-            with st.spinner("Qwen2.5-Coder is typing..."):
-                response = generate_response(
-                    st.session_state['messages'],
-                    tokenizer,
-                    model,
-                    max_tokens=max_tokens,
-                    temperature=temperature,
-                    top_p=top_p
-                )
-            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-            st.session_state['messages'].append({'role': 'assistant', 'content': response, 'timestamp': timestamp})
-        except Exception as e:
-            st.error(f"Error generating response: {e}")
-with sidebar_col:
-    st.sidebar.header("Settings")
-    max_tokens = st.sidebar.slider(
-        "Maximum Tokens",
-        min_value=50,
         max_value=4096,
         value=512,
-        step=256,
-        help="Set the maximum number of tokens for the model's response."
     )
-    temperature = st.sidebar.slider(
         "Temperature",
         min_value=0.1,
-        max_value=1.0,
         value=0.7,
         step=0.1,
-        help="Controls the randomness of the model's output."
     )
-    top_p = st.sidebar.slider(
-        "Top-p (Nucleus Sampling)",
         min_value=0.1,
         max_value=1.0,
         value=0.9,
         step=0.1,
-        help="Controls the diversity of the model's output."
     )
-    if st.sidebar.button("Clear Chat"):
-        st.session_state['messages'] = []
-        st.success("Chat history cleared.")

 import streamlit as st
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import datetime
+# Browser-tab title and icon, plus a full-width page layout.
+st.set_page_config(page_title="Qwen2.5-Coder Chat", page_icon="💬", layout="wide")
+# Create the conversation history on the first run of a session; later
+# reruns keep whatever list is already stored.
+st.session_state.setdefault("messages", [])
+# Cache the model loading
 @st.cache_resource
+def load_model_and_tokenizer():
+    """Load the Qwen2.5-Coder tokenizer and its 8-bit quantized model.
+
+    The function is wrapped in ``st.cache_resource`` so the expensive load
+    is done once and reused on later script reruns.
+
+    Returns:
+        tuple: ``(tokenizer, model)``.
+    """
+    model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"
+    # 8-bit quantization. The bnb_4bit_* options only take effect together
+    # with load_in_4bit=True, so they are omitted instead of being passed
+    # and silently ignored.
+    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+    # The Qwen2 architecture is implemented in transformers itself, so
+    # trust_remote_code (which permits running code shipped in the Hub
+    # repository) is not needed and is left at its safe default.
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
+        quantization_config=bnb_config,
         torch_dtype=torch.float16,
+        device_map="auto",
     )
     return tokenizer, model
+# Page heading
+st.title("💬 Qwen2.5-Coder Chat")
+# Generation controls live in the sidebar; the chat handler further down
+# reads max_length, temperature and top_p.
+st.sidebar.header("Settings")
+max_length = st.sidebar.slider(
+    "Maximum Length", min_value=64, max_value=4096, value=512, step=64,
+    help="Maximum number of tokens to generate",
+)
+temperature = st.sidebar.slider(
+    "Temperature", min_value=0.1, max_value=2.0, value=0.7, step=0.1,
+    help="Higher values make output more random, lower values more deterministic",
+)
+top_p = st.sidebar.slider(
+    "Top P", min_value=0.1, max_value=1.0, value=0.9, step=0.1,
+    help="Nucleus sampling: higher values consider more tokens, lower values are more focused",
+)
+# Wipe the stored history and rerun at once so the page redraws empty.
+clear_requested = st.sidebar.button("Clear Conversation")
+if clear_requested:
+    st.session_state.messages = []
+    st.rerun()
+# Fetch the tokenizer and model behind a spinner; if loading fails, report
+# the error and halt this script run so nothing below touches a missing model.
+try:
+    with st.spinner("Loading model... Please wait..."):
+        loaded = load_model_and_tokenizer()
+    tokenizer, model = loaded
+except Exception as load_error:
+    st.error(f"Error loading model: {load_error}")
+    st.stop()
+def generate_response(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
+    """Generate a completion for ``prompt`` with the module-level model.
+
+    Args:
+        prompt: Fully formatted prompt text to feed to the model.
+        max_new_tokens: Upper bound on the number of newly generated tokens.
+        temperature: Sampling temperature passed to ``model.generate``.
+        top_p: Nucleus-sampling threshold passed to ``model.generate``.
+
+    Returns:
+        The generated text with surrounding whitespace stripped, or ``None``
+        if generation failed (the error is reported through ``st.error``).
+    """
+    try:
+        # Tokenize input and remember how many tokens belong to the prompt.
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        prompt_length = inputs["input_ids"].shape[-1]
+        # Use EOS for padding when the tokenizer defines no pad token, rather
+        # than handing generate() an explicit pad_token_id=None.
+        pad_token_id = tokenizer.pad_token_id
+        if pad_token_id is None:
+            pad_token_id = tokenizer.eos_token_id
+        # Generate response
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=True,
+                pad_token_id=pad_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+            )
+        # Keep only the newly generated tokens. Cutting the decoded string at
+        # len(prompt) is unreliable: decoding drops special tokens and may
+        # normalise whitespace, so the decoded prompt need not match `prompt`.
+        new_tokens = outputs[0][prompt_length:]
+        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
+    except Exception as e:
+        # UI boundary: surface the failure to the user instead of crashing
+        # the script run; the caller treats None as "no reply".
+        st.error(f"Error generating response: {str(e)}")
+        return None
+# Display chat history
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.write(f"{message['content']}\n\n_{message['timestamp']}_")
+# Chat input
+if prompt := st.chat_input("Ask me anything about coding..."):
+    # Add user message to chat
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    st.session_state.messages.append({
+        "role": "user",
+        "content": prompt,
+        "timestamp": timestamp
+    })
+    # Display user message
+    with st.chat_message("user"):
+        st.write(f"{prompt}\n\n_{timestamp}_")
+    # Generate and display response
+    with st.chat_message("assistant"):
+        with st.spinner("Thinking..."):
+            # Build the prompt with the tokenizer's own chat template rather
+            # than an ad-hoc "Human:/Assistant:" transcript: the Instruct
+            # checkpoint is trained on that format, and the template ends the
+            # prompt with the marker that opens the assistant's turn.
+            # Only role and content are forwarded; timestamps are UI metadata.
+            chat_history = [
+                {"role": msg["role"], "content": msg["content"]}
+                for msg in st.session_state.messages
+            ]
+            conversation = tokenizer.apply_chat_template(
+                chat_history,
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            response = generate_response(
+                conversation,
+                max_new_tokens=max_length,
+                temperature=temperature,
+                top_p=top_p
+            )
+            # None (generation failed) or an empty reply is not displayed
+            # or stored.
+            if response:
+                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                st.write(f"{response}\n\n_{timestamp}_")
+                # Add assistant response to chat history
+                st.session_state.messages.append({
+                    "role": "assistant",
+                    "content": response,
+                    "timestamp": timestamp
+                })