Avinash109 committed
Commit e8d7bea · verified · 1 Parent(s): ed64278

Update app.py

Files changed (1)
  1. app.py +76 -73
app.py CHANGED
@@ -2,129 +2,130 @@ import streamlit as st
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import datetime
-import gc
-import os
-
-# Enable memory efficient options
-os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
 
 # Set page configuration
 st.set_page_config(
     page_title="Qwen2.5-Coder Chat",
     page_icon="💬",
-    layout="wide",
+    layout="wide"
 )
 
 # Initialize session state
 if 'messages' not in st.session_state:
     st.session_state.messages = []
-if 'model_loaded' not in st.session_state:
-    st.session_state.model_loaded = False
 
-@st.cache_resource(show_spinner=False)
+@st.cache_resource
 def load_model_and_tokenizer():
     try:
-        model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
-
-        with st.spinner("🔄 Loading tokenizer..."):
+        # Display loading message
+        with st.spinner("🔄 Loading model and tokenizer... This might take a few minutes..."):
+            model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
+
             # Load tokenizer first
             tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 trust_remote_code=True
             )
-
-        with st.spinner("🔄 Loading model... (this may take a few minutes on CPU)"):
-            # Load model with 8-bit quantization for CPU
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map={"": "cpu"},
-                trust_remote_code=True,
-                low_cpu_mem_usage=True,
-                torch_dtype=torch.float32,
-                load_in_8bit=True  # Enable 8-bit quantization
-            )
 
-        # Force CPU mode and eval mode
-        model = model.to("cpu").eval()
+            # Determine device and display info
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            st.info(f"💻 Using device: {device}")
 
-        # Clear memory after loading
-        gc.collect()
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+            # Load model with appropriate settings
+            if device == "cuda":
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    torch_dtype=torch.float16,  # Use float16 for GPU
+                    device_map="auto",
+                    trust_remote_code=True
+                ).eval()  # Set to evaluation mode
+            else:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    device_map={"": device},
+                    trust_remote_code=True,
+                    low_cpu_mem_usage=True
+                ).eval()  # Set to evaluation mode
 
-        st.session_state.model_loaded = True
         return tokenizer, model
-
     except Exception as e:
         st.error(f"❌ Error loading model: {str(e)}")
-        return None, None
+        raise e
 
-def generate_response(prompt, model, tokenizer, max_length=256):
+def generate_response(prompt, model, tokenizer, max_new_tokens=512, temperature=0.7, top_p=0.9):
+    """Generate response from the model with better error handling"""
     try:
-        # Clear memory before generation
-        gc.collect()
+        # Tokenize input
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
-        # Tokenize with shorter maximum length
-        inputs = tokenizer(
-            prompt,
-            return_tensors="pt",
-            max_length=512,
-            truncation=True
-        ).to("cpu")
-
-        # Generate with minimal parameters for CPU
-        with torch.no_grad(), st.spinner("🤔 Thinking... (please be patient)"):
+        # Generate response with progress bar
+        with torch.no_grad(), st.spinner("🤔 Thinking..."):
             outputs = model.generate(
                 **inputs,
-                max_new_tokens=max_length,
-                temperature=0.7,
-                top_p=0.9,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
                 do_sample=True,
                 pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=tokenizer.eos_token_id,
-                num_beams=1,  # Disable beam search
-                early_stopping=True
+                repetition_penalty=1.1,
+                no_repeat_ngram_size=3
             )
 
-        # Clear memory after generation
-        gc.collect()
-
+        # Decode and return response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response[len(prompt):].strip()
-
+
     except torch.cuda.OutOfMemoryError:
-        st.error("💾 Memory exceeded. Try reducing the maximum length.")
+        st.error("💾 GPU memory exceeded. Try reducing the maximum length or clearing the conversation.")
         return None
     except Exception as e:
-        st.error(f"❌ Error: {str(e)}")
+        st.error(f"❌ Error generating response: {str(e)}")
         return None
 
 # Main UI
 st.title("💬 Qwen2.5-Coder Chat")
 
-# Sidebar with minimal settings
+# Sidebar settings
 with st.sidebar:
     st.header("⚙️ Settings")
 
+    # Model settings
     max_length = st.slider(
-        "Response Length 📝",
+        "Maximum Length 📝",
         min_value=64,
-        max_value=512,
-        value=256,
-        step=64,
-        help="Shorter lengths are recommended for CPU"
+        max_value=2048,
+        value=512,
+        step=64
+    )
+
+    temperature = st.slider(
+        "Temperature 🌡️",
+        min_value=0.1,
+        max_value=2.0,
+        value=0.7,
+        step=0.1
     )
 
+    top_p = st.slider(
+        "Top P 📊",
+        min_value=0.1,
+        max_value=1.0,
+        value=0.9,
+        step=0.1
+    )
+
+    # Clear conversation button
     if st.button("🗑️ Clear Conversation"):
         st.session_state.messages = []
         st.rerun()
 
 # Load model
-if not st.session_state.model_loaded:
-    tokenizer, model = load_model_and_tokenizer()
-    if model is None:
-        st.stop()
-else:
+try:
     tokenizer, model = load_model_and_tokenizer()
+except Exception as e:
+    st.error("❌ Failed to load model. Please check the logs and refresh the page.")
+    st.stop()
 
 # Display conversation history
 for message in st.session_state.messages:
@@ -147,14 +148,19 @@ if prompt := st.chat_input("💭 Ask me anything about coding..."):
 
     # Generate and display response
     with st.chat_message("assistant"):
-        # Keep only last message for context to reduce memory usage
-        conversation = f"Human: {prompt}\nAssistant:"
+        # Prepare conversation context (limit to last 3 messages to prevent context overflow)
+        conversation = "\n".join(
+            f"{'Human' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
+            for msg in st.session_state.messages[-3:]
+        ) + "\nAssistant:"
 
         response = generate_response(
             conversation,
            model,
            tokenizer,
-            max_length=max_length
+            max_new_tokens=max_length,
+            temperature=temperature,
+            top_p=top_p
        )
 
        if response:
@@ -168,7 +174,4 @@ if prompt := st.chat_input("💭 Ask me anything about coding..."):
                "timestamp": timestamp
            })
        else:
-            st.error("❌ Failed to generate response. Please try again with a shorter length.")
-
-            # Clear memory after response
-            gc.collect()
+            st.error("❌ Failed to generate response. Please try again with different settings.")