Update app.py
app.py
CHANGED
@@ -1,96 +1,85 @@
import gradio as gr
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import os
import gc
import psutil
-import time

# --- Configuration ---
-
IM_START = "<|im_start|>"
IM_END = "<|im_end|>"
ASSISTANT_TAG = f"{IM_START}assistant\n"

def load_model():
-    """Loads the fine-tuned model and tokenizer
    print(f"Loading model from: {MODEL_DIR}")
-
    # Force garbage collection before loading model
    gc.collect()
-
-
-
-    #
-    #
-    #
-
-
-    # Option 1: 8-bit quantization
-    # quantization_config = BitsAndBytesConfig(
-    #     load_in_8bit=True,
-    #     bnb_8bit_quant_type="int8", # Standard 8-bit
-    #     bnb_8bit_compute_dtype=torch.float32, # Compute in float32 on CPU
-    # )
-
-    # Option 2: 4-bit quantization (more memory saving, potentially slower on CPU)
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4", # NormalFloat 4-bit
-        bnb_4bit_use_double_quant=True, # Double quantization for slightly better accuracy
-        bnb_4bit_compute_dtype=torch.float32, # Compute in float32 on CPU
-        # bnb_4bit_quant_storage=torch.uint8, # Generally okay, sometimes bfloat16 might be slightly better if CPU supports it
-    )
-
    # --- Loading ---
    try:
        # Load just the tokenizer first
        tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
-
        # Check memory before loading model
        process = psutil.Process(os.getpid())
        print(f"Memory usage before model load: {process.memory_info().rss / (1024 * 1024):.2f} MB")
-
        start_time = time.time()
        print("Starting model loading...")

-        # Load model
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_DIR,
            trust_remote_code=True,
-            #
-
-
-
-
-
        )
-
        end_time = time.time()
        print(f"Model loading took {end_time - start_time:.2f} seconds.")

-        # Model
-        #
-        # and could potentially cause a memory spike if device_map put anything elsewhere.
-        # Let's remove the explicit .to("cpu") after loading with device_map="cpu"
-        # model = model.to("cpu") # <-- REMOVED
-
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache() # Clean up any potential residual GPU memory

        # Set model to evaluation mode
        model.eval()
-
        # Add special tokens if needed
-        # Get current vocab size BEFORE adding tokens
        original_vocab_size = len(tokenizer)
        special_tokens = [IM_START, IM_END]
        added_tokens_dict = {"additional_special_tokens": []}
-
        for token in special_tokens:
-            if token
-
-
        if added_tokens_dict["additional_special_tokens"]:
            num_added = tokenizer.add_special_tokens(added_tokens_dict)
            print(f"Added {num_added} special tokens: {added_tokens_dict['additional_special_tokens']}")
@@ -105,7 +94,7 @@ def load_model():

        # Verify stop token
        im_end_id = tokenizer.convert_tokens_to_ids(IM_END)
-        if im_end_id
            print(f"Warning: '{IM_END}' not recognized by tokenizer. Using EOS token ({tokenizer.eos_token}) as stop sequence (ID: {tokenizer.eos_token_id}).")
            stop_token_id = tokenizer.eos_token_id
        else:
@@ -120,35 +109,37 @@ def load_model():
        print(f"Number of model parameters: {model.num_parameters():,}") # Print parameter count

        return model, tokenizer, stop_token_id
-
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Attempting to print traceback:")
        import traceback
        traceback.print_exc()
        print("-" * 20)
-        print("Troubleshooting Steps:")
-        print("1. Verify the model path/name is correct.")
-        print("2.
-        print("3.
-        print("4.
-        print("5.
-        print("6.
        return None, None, None

# Initialize model as None and load lazily
model, tokenizer, stop_token_id = None, None, None

-# Rest of your code (generate_response, Gradio interface, __main__ block)
-# remains largely the same. Only the load_model function needs significant changes.
-
-# --- Rest of your code (copy/paste from your original script) ---

def lazy_load_model():
    """Lazily load model only when needed"""
    global model, tokenizer, stop_token_id
    if model is None:
        model, tokenizer, stop_token_id = load_model()
    return model is not None

def generate_response(
@@ -161,36 +152,60 @@ def generate_response(
    # Lazily load model on first request
    if not lazy_load_model():
        return "Model loading failed. Check server logs for details."
-
    # Build conversation history
    context = []
    for user_msg, bot_msg in history:
        context.append(f"{IM_START}user\n{user_msg}\n{IM_END}")
        if bot_msg:
-
-
    # Add current input
    context.append(f"{IM_START}user\n{user_input}\n{IM_END}")
    context.append(ASSISTANT_TAG)
-
    # Tokenize with efficient settings
    input_text = "\n".join(context)
-
    # Get max length from model config if available, default otherwise
-
-
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
-        truncation=True,
-        max_length=
        padding=False, # Avoid unnecessary padding
    ).to(model.device) # Use model.device - will be 'cpu' due to device_map

-    # Generate response with optimized settings
    with torch.no_grad():
        try:
-            # Use more memory-efficient generation
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
@@ -198,24 +213,31 @@
                top_p=top_p,
                do_sample=True,
                eos_token_id=stop_token_id, # Use the verified stop_token_id
-
                repetition_penalty=1.2,
-                use_cache=True, # Enable KV caching
-                #
-                #
            )
        except Exception as e:
            print(f"Error during generation: {e}")
            # Fallback to simpler generation settings if needed
            try:
-                print("Attempting simplified generation...")
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
-                    do_sample=False, # Force greedy decoding
                    use_cache=True,
                    eos_token_id=stop_token_id,
                    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
                )
            except Exception as e2:
                print(f"Simplified generation also failed: {e2}")
@@ -224,74 +246,60 @@

    # Force garbage collection after generation
    # This can be helpful to free up memory used during generation
-    del inputs
-    if '
-
-
-
-        # del outputs # Delete if you decode immediately after
-        pass # Keep for decoding below
-    elif isinstance(outputs, dict) and 'sequences' in outputs:
-        outputs['sequences'] = outputs['sequences'].cpu()
-        pass # Keep for decoding below
-

    gc.collect()
-
-        torch.cuda.empty_cache() # Just in case

    # Decode and clean response
    # Ensure outputs is a tensor before decoding
    if isinstance(outputs, torch.Tensor) and outputs.ndim == 2 and outputs.shape[0] > 0:
        full_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    else:
-        print("Warning: Generation output was not a tensor. Cannot decode.")
        print(f"Output type: {type(outputs)}")
        print(f"Output value: {outputs}")
        return "Error: Failed to generate valid output."

    # --- Parsing Logic (Keep your existing logic, it looks reasonable) ---
-
-
-
    else:
-
-
-
-
-
-
-
-        # If even the full sequence isn't there, maybe just return everything after the *last* IM_END
-        last_im_end = full_text.rfind(IM_END)
-        if last_im_end != -1:
-            response_start = last_im_end + len(IM_END)
-        else:
-            # Last resort, return the whole thing or an error
-            return "Could not parse response."
-
-
-    response_end = len(full_text) # Default end is the end of the generated text
-
-    # Look for stop sequences *after* the assistant tag/start
-    stop_sequences = [IM_END, f"{IM_START}user"] # Add others if needed
-
-    earliest_stop = -1
-    for stop_seq in stop_sequences:
-        idx = full_text.find(stop_seq, response_start)
-        if idx != -1:
-            if earliest_stop == -1 or idx < earliest_stop:
-                earliest_stop = idx
-
-    if earliest_stop != -1:
-        response_end = earliest_stop
-    # --- End Parsing Logic ---
-
-    # Extract the response
-    response = full_text[response_start:response_end].strip()
-
-    # Clean up potential trailing tokens if parsing wasn't perfect
-    response = response.replace(IM_START, "").replace(IM_END, "").replace("user\n", "").replace("assistant\n", "").strip()


    return response
@@ -304,17 +312,21 @@ demo = gr.ChatInterface(
        gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p")
    ],
-    title="Code Reasoning Assistant",
-    description="Fine-tuned coding assistant specialized in code reasoning and generation",
    theme="soft"
)

if __name__ == "__main__":
    # Create offload folder if it doesn't exist
-
-
    # Use lazy loading - only load model when first query arrives
    print("Starting server with lazy model loading...")
-    print("Ensure '
-    print("
    demo.queue().launch()
@@ -1,96 +1,85 @@
import gradio as gr
import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig # Removed BitsAndBytesConfig
import os
import gc
import psutil
+import time

# --- Configuration ---
+# Your HF Hub model
+# This model seems to be 7B parameters (based on typical naming for Llama/Mistral fine-tunes)
+# A 7B model in float32 is approx 28GB. Loading this on 18GB RAM WILL require disk offloading.
+MODEL_DIR = "ErenalpCet/E-Model-Reasoning-Coder-V1"
IM_START = "<|im_start|>"
IM_END = "<|im_end|>"
ASSISTANT_TAG = f"{IM_START}assistant\n"

def load_model():
+    """Loads the fine-tuned model and tokenizer for CPU using offloading."""
    print(f"Loading model from: {MODEL_DIR}")
+
    # Force garbage collection before loading model
    gc.collect()
+    # No need for cuda empty cache as we are on CPU
+
+    # --- Configuration (Removed Quantization config) ---
+    # BitsAndBytes requires CUDA for its quantization methods (load_in_4bit, load_in_8bit)
+    # Since you only have CPU, we remove the BitsAndBytesConfig.
+    # We will rely on device_map="cpu" and offloading to handle memory.
+
    # --- Loading ---
    try:
        # Load just the tokenizer first
        tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+
        # Check memory before loading model
        process = psutil.Process(os.getpid())
        print(f"Memory usage before model load: {process.memory_info().rss / (1024 * 1024):.2f} MB")
+
        start_time = time.time()
        print("Starting model loading...")

+        # Load model using device_map="cpu" and offloading for memory management
+        # This will load the model weights, likely in float32 (approx 28GB for 7B),
+        # splitting parts of it between RAM and disk ('offload_folder').
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_DIR,
            trust_remote_code=True,
+            # We are on CPU, so we don't need torch_dtype=torch.float16/bfloat16
+            # Float32 is the default and standard for CPU compute.
+            # Quantization via BitsAndBytesConfig is removed.
+            low_cpu_mem_usage=True, # Very important for large models on CPU
+            offload_folder="offload_folder", # Required with low_cpu_mem_usage if model > RAM
+            offload_state_dict=True, # Offload state dict during loading
+            device_map="cpu" # Explicitly set to CPU
+            # You can also use device_map="auto" with max_memory={0: "18GB", "cpu": "auto"}
+            # but device_map="cpu" is simpler if you know you only have CPU.
+            # Let's stick to the explicit "cpu" as requested.
        )
+
        end_time = time.time()
        print(f"Model loading took {end_time - start_time:.2f} seconds.")

+        # Model is already on CPU due to device_map="cpu"
+        # model = model.to("cpu") # Redundant with device_map="cpu"

        # Set model to evaluation mode
        model.eval()
+
        # Add special tokens if needed
        original_vocab_size = len(tokenizer)
        special_tokens = [IM_START, IM_END]
        added_tokens_dict = {"additional_special_tokens": []}
+
        for token in special_tokens:
+            # Check if token is already in the main vocab or added special tokens
+            # Use tokenizer.convert_tokens_to_ids and check against unknown token ID
+            token_id = tokenizer.convert_tokens_to_ids(token)
+            if token_id is None or token_id == tokenizer.unk_token_id:
+                added_tokens_dict["additional_special_tokens"].append(token)
+
+
        if added_tokens_dict["additional_special_tokens"]:
            num_added = tokenizer.add_special_tokens(added_tokens_dict)
            print(f"Added {num_added} special tokens: {added_tokens_dict['additional_special_tokens']}")
@@ -105,7 +94,7 @@ def load_model():

        # Verify stop token
        im_end_id = tokenizer.convert_tokens_to_ids(IM_END)
+        if im_end_id is None or im_end_id == tokenizer.unk_token_id:
            print(f"Warning: '{IM_END}' not recognized by tokenizer. Using EOS token ({tokenizer.eos_token}) as stop sequence (ID: {tokenizer.eos_token_id}).")
            stop_token_id = tokenizer.eos_token_id
        else:
@@ -120,35 +109,37 @@ def load_model():
        print(f"Number of model parameters: {model.num_parameters():,}") # Print parameter count

        return model, tokenizer, stop_token_id
+
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Attempting to print traceback:")
        import traceback
        traceback.print_exc()
        print("-" * 20)
+        print("Troubleshooting Steps (for CPU loading without BitsAndBytes):")
+        print("1. Verify the model path/name is correct on Hugging Face Hub.")
+        print(f"2. Ensure you have sufficient *disk space* in the '{os.path.abspath('offload_folder')}' for offloading (likely tens of GB needed).")
+        print("3. Ensure you have the 'accelerate' library installed (`pip install accelerate`). This is crucial for low_cpu_mem_usage and offloading.")
+        print("4. The model might still be too large even with offloading if total system memory (RAM + swap) is insufficient or disk I/O is a bottleneck during loading.")
+        print("5. Check system logs for any out-of-memory errors or disk full errors.")
+        print("6. Consider if your system has sufficient swap space configured, as offloading might rely on it.")
        return None, None, None

# Initialize model as None and load lazily
model, tokenizer, stop_token_id = None, None, None

+# --- Rest of your code (generate_response, Gradio interface, __main__ block) ---

def lazy_load_model():
    """Lazily load model only when needed"""
    global model, tokenizer, stop_token_id
    if model is None:
+        print("Model not loaded, attempting to load now...")
        model, tokenizer, stop_token_id = load_model()
+        if model is None:
+            print("Model loading failed.")
+        else:
+            print("Model loaded successfully.")
    return model is not None

def generate_response(
@@ -161,36 +152,60 @@ def generate_response(
    # Lazily load model on first request
    if not lazy_load_model():
        return "Model loading failed. Check server logs for details."
+
    # Build conversation history
    context = []
    for user_msg, bot_msg in history:
        context.append(f"{IM_START}user\n{user_msg}\n{IM_END}")
        if bot_msg:
+            # Ensure bot_msg is not None or empty before adding
+            if bot_msg.strip():
+                context.append(f"{ASSISTANT_TAG}{bot_msg}\n{IM_END}")
+
    # Add current input
    context.append(f"{IM_START}user\n{user_input}\n{IM_END}")
    context.append(ASSISTANT_TAG)
+
    # Tokenize with efficient settings
    input_text = "\n".join(context)
+
    # Get max length from model config if available, default otherwise
+    # Some models might have a different config attribute for context length
+    max_model_input_length = getattr(model.config, "max_position_embeddings", None)
+    if max_model_input_length is None:
+        # Fallback for models without max_position_embeddings (e.g., some Llama configs use hidden_size or similar indirectly)
+        # A common safe value for many 7B models is 4096 or 8192
+        max_model_input_length = 4096 # Or check model card/config.json
+        print(f"Warning: model.config.max_position_embeddings not found. Using default max_length: {max_model_input_length}")
+
+    # Ensure we don't truncate the input so much that there's no space for output
+    # A better calculation might consider the token size of the prompt vs total length
+    # A safer approach is to limit input length if it gets too long,
+    # but let's keep the current logic which reserves space for max_tokens output.
+    # However, the truncation setting itself `max_length` applies to the *input*.
+    # If input exceeds max_length, it's truncated. Need enough length for prompt + max_tokens
+    effective_max_length = max_model_input_length
+    if effective_max_length - max_tokens < len(tokenizer.encode(input_text)):
+        # If the required input length *plus* desired output length exceeds model capacity,
+        # something is wrong or the input is too long. Let's just truncate input firmly.
+        # A common pattern is to limit conversation history or simply let truncation handle it.
+        # Let's rely on truncation below.
+        pass # No specific action here, tokenizer(..., max_length=...) handles truncation
+
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
+        truncation=True, # Explicitly enable truncation if input is too long
+        max_length=effective_max_length, # Truncate input if it exceeds model capacity
        padding=False, # Avoid unnecessary padding
    ).to(model.device) # Use model.device - will be 'cpu' due to device_map

+    # Generate response with optimized settings for CPU
    with torch.no_grad():
        try:
+            # Use more memory-efficient generation settings for CPU if possible
+            # num_beams > 1 increases memory significantly, stick to 1
+            # Early stopping can save computation
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
@@ -198,24 +213,31 @@
                top_p=top_p,
                do_sample=True,
                eos_token_id=stop_token_id, # Use the verified stop_token_id
+                # Use pad_token_id if available, otherwise eos_token_id is common fallback
+                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
                repetition_penalty=1.2,
+                use_cache=True, # Enable KV caching (still beneficial on CPU)
+                num_beams=1, # Keep beam search off for lower memory
+                # Add stop sequences if needed for multi-token stops not covered by eos_token_id
+                # For chat, frequently [IM_END, f"{IM_START}user"] are good stops
+                # You can provide list of token IDs or list of strings
+                # Example using strings (requires generate to handle them, which it usually does)
+                # stopping_criteria=transformers.StoppingCriteriaList([transformers.TextGenerationStopCriteria(...)]) # More complex
+                # Simple approach: rely on eos_token_id and post-processing
            )
        except Exception as e:
            print(f"Error during generation: {e}")
            # Fallback to simpler generation settings if needed
            try:
+                print("Attempting simplified generation (greedy decoding)...")
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
+                    do_sample=False, # Force greedy decoding - less memory, potentially faster
                    use_cache=True,
                    eos_token_id=stop_token_id,
                    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
+                    num_beams=1,
                )
            except Exception as e2:
                print(f"Simplified generation also failed: {e2}")
@@ -224,74 +246,60 @@

    # Force garbage collection after generation
    # This can be helpful to free up memory used during generation
+    del inputs # Delete input tensors
+    # Move output tensors to CPU if they aren't already (should be with device_map="cpu")
+    if 'outputs' in locals() and isinstance(outputs, torch.Tensor):
+        outputs = outputs.cpu()
+    # Don't delete outputs yet, we need it for decoding

    gc.collect()
+    # No need for cuda empty cache on CPU

    # Decode and clean response
    # Ensure outputs is a tensor before decoding
    if isinstance(outputs, torch.Tensor) and outputs.ndim == 2 and outputs.shape[0] > 0:
+        # Decode the generated tokens, stopping at the end of the sequence if it includes the stop token
+        # Decode the whole sequence first, then parse
        full_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    else:
+        print("Warning: Generation output was not a tensor or was empty. Cannot decode.")
        print(f"Output type: {type(outputs)}")
        print(f"Output value: {outputs}")
        return "Error: Failed to generate valid output."

    # --- Parsing Logic (Keep your existing logic, it looks reasonable) ---
+    # Find the start of the assistant's response tag
+    response_start_marker = f"{ASSISTANT_TAG}" # Look for the exact tag
+    response_start_idx = full_text.rfind(response_start_marker)
+
+    response = "Error: Could not parse response." # Default error message
+
+    if response_start_idx != -1:
+        response_start = response_start_idx + len(response_start_marker)
+        # Look for stop sequences *after* the assistant tag
+        stop_sequences = [IM_END, f"{IM_START}user"] # Common chat stops
+
+        earliest_stop = len(full_text) # Default end is the end of the generated text
+
+        for stop_seq in stop_sequences:
+            idx = full_text.find(stop_seq, response_start)
+            if idx != -1:
+                if earliest_stop == len(full_text) or idx < earliest_stop:
+                    earliest_stop = idx
+
+        response = full_text[response_start:earliest_stop].strip()
+
+        # Clean up potential trailing tokens if parsing wasn't perfect
+        response = response.replace(IM_START, "").replace(IM_END, "").replace("user\n", "").replace("assistant\n", "").strip()
+
    else:
+        # Fallback if the assistant tag wasn't found in the output
+        print(f"Warning: Assistant tag '{ASSISTANT_TAG}' not found in generated text. Generated text:")
+        print(full_text)
+        # Attempt a simpler cleanup, maybe just removing special tokens
+        response = full_text.replace(IM_START, "").replace(IM_END, "").replace("user\n", "").replace("assistant\n", "").strip()
+        if not response:
+            response = "Warning: Could not find assistant response tag in generated text."


    return response
@@ -304,17 +312,21 @@ demo = gr.ChatInterface(
        gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p")
    ],
+    title="Code Reasoning Assistant (CPU)",
+    description="Fine-tuned coding assistant specialized in code reasoning and generation (Running on CPU)",
    theme="soft"
)

if __name__ == "__main__":
    # Create offload folder if it doesn't exist
+    OFFLOAD_DIR = "offload_folder"
+    os.makedirs(OFFLOAD_DIR, exist_ok=True)
+    print(f"Offload folder '{os.path.abspath(OFFLOAD_DIR)}' created or already exists.")
+
+
    # Use lazy loading - only load model when first query arrives
    print("Starting server with lazy model loading...")
+    print(f"Ensure '{os.path.abspath(OFFLOAD_DIR)}' has sufficient disk space for model offloading.")
+    print("Performance will be limited by CPU and disk speed.")
+    print("Monitoring memory (RAM + swap) and disk usage during the first query is highly recommended.")
    demo.queue().launch()
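The new load_model() leans on accelerate's CPU offloading, and its troubleshooting prints reduce to three environment checks: accelerate installed, enough free disk under the offload folder, and enough RAM plus swap for whatever stays resident. The sketch below is not part of this commit; it is a minimal preflight helper written under those assumptions. The 28 GB figure simply mirrors the float32 estimate in the comments above, and check_offload_environment is a hypothetical name.

import importlib.util
import os
import shutil

import psutil

# Rough float32 footprint of a 7B model, per the estimate in the comments above (not a measurement).
ESTIMATED_MODEL_BYTES = 28 * 1024**3

def check_offload_environment(offload_dir: str = "offload_folder") -> bool:
    """Hypothetical preflight check mirroring the troubleshooting prints in load_model()."""
    ok = True

    # 1. accelerate is required for low_cpu_mem_usage=True and offload_folder to take effect.
    if importlib.util.find_spec("accelerate") is None:
        print("Missing 'accelerate' (pip install accelerate).")
        ok = False

    # 2. The offload folder needs enough free disk for the offloaded weights.
    os.makedirs(offload_dir, exist_ok=True)
    free_disk = shutil.disk_usage(offload_dir).free
    if free_disk < ESTIMATED_MODEL_BYTES:
        print(f"Only {free_disk / 1024**3:.1f} GB free for '{offload_dir}'; offloading may fail.")
        ok = False

    # 3. Report RAM + swap, which bounds the portion that cannot be offloaded to disk.
    total_mem = psutil.virtual_memory().total + psutil.swap_memory().total
    print(f"RAM + swap available: {total_mem / 1024**3:.1f} GB")

    return ok

Calling such a check before demo.queue().launch() would surface a missing dependency or a full disk at startup rather than on the first chat request.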