ErenalpCet committed
Commit f46605a · verified · 1 Parent(s): e6107ee

Update app.py

Files changed (1)
  1. app.py +155 -143
app.py CHANGED
@@ -1,96 +1,85 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig
 import os
 import gc
 import psutil
-import time # Added for timing loading

 # --- Configuration ---
-MODEL_DIR = "ErenalpCet/E-Model-Reasoning-Coder-V1" # Your HF Hub model
 IM_START = "<|im_start|>"
 IM_END = "<|im_end|>"
 ASSISTANT_TAG = f"{IM_START}assistant\n"

 def load_model():
-    """Loads the fine-tuned model and tokenizer using standard HF Transformers with optimized memory usage"""
     print(f"Loading model from: {MODEL_DIR}")
-
     # Force garbage collection before loading model
     gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache() # Still good practice just in case
-
-    # --- Configure Quantization ---
-    # For CPU, 8-bit is generally better supported and faster than 4-bit,
-    # but 4-bit saves more RAM. Start with 8-bit if 18GB is tight but maybe possible.
-    # If 8-bit still fails, try 4-bit.
-
-    # Option 1: 8-bit quantization
-    # quantization_config = BitsAndBytesConfig(
-    #     load_in_8bit=True,
-    #     bnb_8bit_quant_type="int8", # Standard 8-bit
-    #     bnb_8bit_compute_dtype=torch.float32, # Compute in float32 on CPU
-    # )
-
-    # Option 2: 4-bit quantization (more memory saving, potentially slower on CPU)
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4", # NormalFloat 4-bit
-        bnb_4bit_use_double_quant=True, # Double quantization for slightly better accuracy
-        bnb_4bit_compute_dtype=torch.float32, # Compute in float32 on CPU
-        # bnb_4bit_quant_storage=torch.uint8, # Generally okay, sometimes bfloat16 might be slightly better if CPU supports it
-    )
-
     # --- Loading ---
     try:
         # Load just the tokenizer first
         tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
-
         # Check memory before loading model
         process = psutil.Process(os.getpid())
         print(f"Memory usage before model load: {process.memory_info().rss / (1024 * 1024):.2f} MB")
-
         start_time = time.time()
         print("Starting model loading...")

-        # Load model with quantization and offload settings
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_DIR,
             trust_remote_code=True,
-            # torch_dtype=torch.float16, # Not needed when using quantization on CPU
-            quantization_config=quantization_config, # <<--- ADDED QUANTIZATION
-            low_cpu_mem_usage=True, # Helps during loading
-            offload_folder="offload_folder", # Enable disk offloading (still useful)
-            offload_state_dict=True, # Offload state dict during loading
-            device_map="cpu" # <<--- EXPLICITLY SET TO CPU
         )
-
         end_time = time.time()
         print(f"Model loading took {end_time - start_time:.2f} seconds.")

-        # Model should already be on CPU due to device_map="cpu"
-        # Explicit .to("cpu") is redundant here if device_map="cpu" is used,
-        # and could potentially cause a memory spike if device_map put anything elsewhere.
-        # Let's remove the explicit .to("cpu") after loading with device_map="cpu"
-        # model = model.to("cpu") # <-- REMOVED
-
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache() # Clean up any potential residual GPU memory

         # Set model to evaluation mode
         model.eval()
-
         # Add special tokens if needed
-        # Get current vocab size BEFORE adding tokens
         original_vocab_size = len(tokenizer)
         special_tokens = [IM_START, IM_END]
         added_tokens_dict = {"additional_special_tokens": []}
-
         for token in special_tokens:
-            if token not in tokenizer.get_vocab():
-                added_tokens_dict["additional_special_tokens"].append(token)
-
         if added_tokens_dict["additional_special_tokens"]:
             num_added = tokenizer.add_special_tokens(added_tokens_dict)
             print(f"Added {num_added} special tokens: {added_tokens_dict['additional_special_tokens']}")
@@ -105,7 +94,7 @@ def load_model():

         # Verify stop token
         im_end_id = tokenizer.convert_tokens_to_ids(IM_END)
-        if im_end_id == tokenizer.unk_token_id or im_end_id is None:
             print(f"Warning: '{IM_END}' not recognized by tokenizer. Using EOS token ({tokenizer.eos_token}) as stop sequence (ID: {tokenizer.eos_token_id}).")
             stop_token_id = tokenizer.eos_token_id
         else:
@@ -120,35 +109,37 @@
         print(f"Number of model parameters: {model.num_parameters():,}") # Print parameter count

         return model, tokenizer, stop_token_id
-
     except Exception as e:
         print(f"Error loading model: {e}")
         print("Attempting to print traceback:")
         import traceback
         traceback.print_exc()
         print("-" * 20)
-        print("Troubleshooting Steps:")
-        print("1. Verify the model path/name is correct.")
-        print("2. Check if you have enough disk space for the 'offload_folder'.")
-        print("3. Try switching between 8-bit and 4-bit quantization in the code.")
-        print("4. Ensure you have the 'accelerate' library installed (`pip install accelerate`).")
-        print("5. Ensure you have the 'bitsandbytes' library installed (`pip install bitsandbytes`). On Windows, specific installation might be needed (see bitsandbytes docs).")
-        print("6. The model might simply be too large even with 4-bit and offloading for 18GB RAM.")
         return None, None, None

 # Initialize model as None and load lazily
 model, tokenizer, stop_token_id = None, None, None

-# Rest of your code (generate_response, Gradio interface, __main__ block)
-# remains largely the same. Only the load_model function needs significant changes.
-
-# --- Rest of your code (copy/paste from your original script) ---

 def lazy_load_model():
     """Lazily load model only when needed"""
     global model, tokenizer, stop_token_id
     if model is None:
         model, tokenizer, stop_token_id = load_model()
     return model is not None

 def generate_response(
@@ -161,36 +152,60 @@ def generate_response(
     # Lazily load model on first request
     if not lazy_load_model():
         return "Model loading failed. Check server logs for details."
-
     # Build conversation history
     context = []
     for user_msg, bot_msg in history:
         context.append(f"{IM_START}user\n{user_msg}\n{IM_END}")
         if bot_msg:
-            context.append(f"{ASSISTANT_TAG}{bot_msg}\n{IM_END}")
-
     # Add current input
     context.append(f"{IM_START}user\n{user_input}\n{IM_END}")
     context.append(ASSISTANT_TAG)
-
     # Tokenize with efficient settings
     input_text = "\n".join(context)
-
     # Get max length from model config if available, default otherwise
-    max_model_input_length = getattr(model.config, "max_position_embeddings", 2048)
-
     inputs = tokenizer(
         input_text,
         return_tensors="pt",
-        truncation=True,
-        max_length=max_model_input_length - max_tokens, # Ensure space for generation
         padding=False, # Avoid unnecessary padding
     ).to(model.device) # Use model.device - will be 'cpu' due to device_map

-    # Generate response with optimized settings
     with torch.no_grad():
         try:
-            # Use more memory-efficient generation
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
@@ -198,24 +213,31 @@
                top_p=top_p,
                do_sample=True,
                eos_token_id=stop_token_id, # Use the verified stop_token_id
-                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id, # Use pad or eos
                repetition_penalty=1.2,
-                use_cache=True, # Enable KV caching for efficiency
-                # Add stop sequences for multi-token stopping if needed, but eos_token_id is usually sufficient
-                # num_beams=1, # Generally use num_beams=1 for chat to avoid high memory usage
            )
        except Exception as e:
            print(f"Error during generation: {e}")
            # Fallback to simpler generation settings if needed
            try:
-                print("Attempting simplified generation...")
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
-                    do_sample=False, # Force greedy decoding
                    use_cache=True,
                    eos_token_id=stop_token_id,
                    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
                )
            except Exception as e2:
                print(f"Simplified generation also failed: {e2}")
@@ -224,74 +246,60 @@

     # Force garbage collection after generation
     # This can be helpful to free up memory used during generation
-    del inputs
-    if 'outputs' in locals():
-        # You might want to keep outputs if decoding later, but free tensors
-        if isinstance(outputs, torch.Tensor):
-            outputs = outputs.cpu() # Move to CPU before potential deletion
-            # del outputs # Delete if you decode immediately after
-            pass # Keep for decoding below
-        elif isinstance(outputs, dict) and 'sequences' in outputs:
-            outputs['sequences'] = outputs['sequences'].cpu()
-            pass # Keep for decoding below
-

     gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache() # Just in case

     # Decode and clean response
     # Ensure outputs is a tensor before decoding
     if isinstance(outputs, torch.Tensor) and outputs.ndim == 2 and outputs.shape[0] > 0:
         full_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
     else:
-        print("Warning: Generation output was not a tensor. Cannot decode.")
         print(f"Output type: {type(outputs)}")
         print(f"Output value: {outputs}")
         return "Error: Failed to generate valid output."

     # --- Parsing Logic (Keep your existing logic, it looks reasonable) ---
-    response_start = full_text.rfind(ASSISTANT_TAG)
-    if response_start != -1:
-        response_start += len(ASSISTANT_TAG)
     else:
-        # Fallback if the tag wasn't in the output for some reason
-        print("Warning: Assistant tag not found in generated text. Attempting simple cleanup.")
-        # Find the end of the input prompt
-        input_end_idx = full_text.rfind(f"{IM_END}\n{ASSISTANT_TAG}")
-        if input_end_idx != -1:
-            response_start = input_end_idx + len(f"{IM_END}\n{ASSISTANT_TAG}")
-        else:
-            # If even the full sequence isn't there, maybe just return everything after the *last* IM_END
-            last_im_end = full_text.rfind(IM_END)
-            if last_im_end != -1:
-                response_start = last_im_end + len(IM_END)
-            else:
-                # Last resort, return the whole thing or an error
-                return "Could not parse response."
-
-
-    response_end = len(full_text) # Default end is the end of the generated text
-
-    # Look for stop sequences *after* the assistant tag/start
-    stop_sequences = [IM_END, f"{IM_START}user"] # Add others if needed
-
-    earliest_stop = -1
-    for stop_seq in stop_sequences:
-        idx = full_text.find(stop_seq, response_start)
-        if idx != -1:
-            if earliest_stop == -1 or idx < earliest_stop:
-                earliest_stop = idx
-
-    if earliest_stop != -1:
-        response_end = earliest_stop
-    # --- End Parsing Logic ---
-
-    # Extract the response
-    response = full_text[response_start:response_end].strip()
-
-    # Clean up potential trailing tokens if parsing wasn't perfect
-    response = response.replace(IM_START, "").replace(IM_END, "").replace("user\n", "").replace("assistant\n", "").strip()


     return response
@@ -304,17 +312,21 @@ demo = gr.ChatInterface(
         gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature"),
         gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p")
     ],
-    title="Code Reasoning Assistant",
-    description="Fine-tuned coding assistant specialized in code reasoning and generation",
     theme="soft"
 )

 if __name__ == "__main__":
     # Create offload folder if it doesn't exist
-    os.makedirs("offload_folder", exist_ok=True)
-
     # Use lazy loading - only load model when first query arrives
     print("Starting server with lazy model loading...")
-    print("Ensure 'offload_folder' exists and you have sufficient disk space.")
-    print("Monitoring memory during the first query is recommended.")
     demo.queue().launch()
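The __main__ block warns about disk space for 'offload_folder' but leaves the check to the operator. A minimal sketch of making that check concrete with shutil.disk_usage; the ~30 GB threshold is an assumption (roughly a float32 7B checkpoint), not a figure taken from the code:

import shutil

def check_offload_disk_space(path: str = "offload_folder", required_gb: float = 30.0) -> float:
    """Warn when the filesystem holding the offload folder has less free space than required_gb."""
    free_gb = shutil.disk_usage(path).free / (1024 ** 3)  # path must already exist (os.makedirs runs first)
    if free_gb < required_gb:
        print(f"Warning: only {free_gb:.1f} GB free for '{path}'; "
              f"offloading a float32 7B model may need around {required_gb:.0f} GB.")
    return free_gb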
 
 import gradio as gr
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig # Removed BitsAndBytesConfig
 import os
 import gc
 import psutil
+import time

 # --- Configuration ---
+# Your HF Hub model.
+# This model appears to be around 7B parameters (based on typical naming for Llama/Mistral fine-tunes).
+# A 7B model in float32 is approx. 28 GB, so loading it on 18 GB of RAM WILL require disk offloading.
+MODEL_DIR = "ErenalpCet/E-Model-Reasoning-Coder-V1"
 IM_START = "<|im_start|>"
 IM_END = "<|im_end|>"
 ASSISTANT_TAG = f"{IM_START}assistant\n"

 def load_model():
+    """Loads the fine-tuned model and tokenizer for CPU using offloading."""
     print(f"Loading model from: {MODEL_DIR}")
+
     # Force garbage collection before loading model
     gc.collect()
+    # No need for torch.cuda.empty_cache() since we are on CPU.
+
+    # --- Configuration (quantization config removed) ---
+    # bitsandbytes requires CUDA for its quantization methods (load_in_4bit, load_in_8bit).
+    # Since only CPU is available, the BitsAndBytesConfig is removed and we rely on
+    # device_map="cpu" and offloading to handle memory.
+
     # --- Loading ---
     try:
         # Load just the tokenizer first
         tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+
         # Check memory before loading model
         process = psutil.Process(os.getpid())
         print(f"Memory usage before model load: {process.memory_info().rss / (1024 * 1024):.2f} MB")
+
         start_time = time.time()
         print("Starting model loading...")

+        # Load the model with device_map="cpu" and offloading for memory management.
+        # The weights load in float32 (approx. 28 GB for 7B), with parts split
+        # between RAM and disk ('offload_folder') as needed.
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_DIR,
             trust_remote_code=True,
+            # On CPU there is no need for torch_dtype=torch.float16/bfloat16;
+            # float32 is the default and standard for CPU compute.
+            # Quantization via BitsAndBytesConfig is removed.
+            low_cpu_mem_usage=True, # Very important for large models on CPU
+            offload_folder="offload_folder", # Required with low_cpu_mem_usage if model > RAM
+            offload_state_dict=True, # Offload state dict during loading
+            device_map="cpu" # Explicitly set to CPU
+            # device_map="auto" with a max_memory budget (e.g. {"cpu": "18GiB"} on a CPU-only
+            # machine; integer keys refer to GPU indices) is an alternative, but
+            # device_map="cpu" is simpler if you know you only have CPU, so we keep it explicit.
         )
+
         end_time = time.time()
         print(f"Model loading took {end_time - start_time:.2f} seconds.")

+        # Model is already on CPU due to device_map="cpu"
+        # model = model.to("cpu") # Redundant with device_map="cpu"

         # Set model to evaluation mode
         model.eval()
+
         # Add special tokens if needed
         original_vocab_size = len(tokenizer)
         special_tokens = [IM_START, IM_END]
         added_tokens_dict = {"additional_special_tokens": []}
+
         for token in special_tokens:
+            # Check whether the token is already known to the tokenizer:
+            # convert it to an ID and compare against the unknown-token ID.
+            token_id = tokenizer.convert_tokens_to_ids(token)
+            if token_id is None or token_id == tokenizer.unk_token_id:
+                added_tokens_dict["additional_special_tokens"].append(token)
+
         if added_tokens_dict["additional_special_tokens"]:
             num_added = tokenizer.add_special_tokens(added_tokens_dict)
             print(f"Added {num_added} special tokens: {added_tokens_dict['additional_special_tokens']}")
 

         # Verify stop token
         im_end_id = tokenizer.convert_tokens_to_ids(IM_END)
+        if im_end_id is None or im_end_id == tokenizer.unk_token_id:
             print(f"Warning: '{IM_END}' not recognized by tokenizer. Using EOS token ({tokenizer.eos_token}) as stop sequence (ID: {tokenizer.eos_token_id}).")
             stop_token_id = tokenizer.eos_token_id
         else:
 
         print(f"Number of model parameters: {model.num_parameters():,}") # Print parameter count

         return model, tokenizer, stop_token_id
+
     except Exception as e:
         print(f"Error loading model: {e}")
         print("Attempting to print traceback:")
         import traceback
         traceback.print_exc()
         print("-" * 20)
+        print("Troubleshooting Steps (for CPU loading without BitsAndBytes):")
+        print("1. Verify the model path/name is correct on Hugging Face Hub.")
+        print(f"2. Ensure you have sufficient *disk space* in '{os.path.abspath('offload_folder')}' for offloading (likely tens of GB needed).")
+        print("3. Ensure you have the 'accelerate' library installed (`pip install accelerate`). This is crucial for low_cpu_mem_usage and offloading.")
+        print("4. The model might still be too large even with offloading if total system memory (RAM + swap) is insufficient or disk I/O is a bottleneck during loading.")
+        print("5. Check system logs for any out-of-memory or disk-full errors.")
+        print("6. Consider whether your system has sufficient swap space configured, as offloading might rely on it.")
         return None, None, None

 # Initialize model as None and load lazily
 model, tokenizer, stop_token_id = None, None, None

+# --- Rest of the code (generate_response, Gradio interface, __main__ block) ---

 def lazy_load_model():
     """Lazily load model only when needed"""
     global model, tokenizer, stop_token_id
     if model is None:
+        print("Model not loaded, attempting to load now...")
         model, tokenizer, stop_token_id = load_model()
+        if model is None:
+            print("Model loading failed.")
+        else:
+            print("Model loaded successfully.")
     return model is not None

 def generate_response(
 
     # Lazily load model on first request
     if not lazy_load_model():
         return "Model loading failed. Check server logs for details."
+
     # Build conversation history
     context = []
     for user_msg, bot_msg in history:
         context.append(f"{IM_START}user\n{user_msg}\n{IM_END}")
         if bot_msg:
+            # Ensure bot_msg is not None or empty before adding
+            if bot_msg.strip():
+                context.append(f"{ASSISTANT_TAG}{bot_msg}\n{IM_END}")
+
     # Add current input
     context.append(f"{IM_START}user\n{user_input}\n{IM_END}")
     context.append(ASSISTANT_TAG)
+
     # Tokenize with efficient settings
     input_text = "\n".join(context)
+
     # Get max length from model config if available, default otherwise
+    # Some models use a different config attribute for context length
+    max_model_input_length = getattr(model.config, "max_position_embeddings", None)
+    if max_model_input_length is None:
+        # Fallback for configs without max_position_embeddings;
+        # a common safe value for many 7B models is 4096 or 8192
+        max_model_input_length = 4096 # Or check the model card/config.json
+        print(f"Warning: model.config.max_position_embeddings not found. Using default max_length: {max_model_input_length}")
+
+    # Note: max_length below truncates the *input* only. With effective_max_length set to the
+    # full context window, a prompt that fills the window leaves no room for generation;
+    # a stricter budget would subtract max_tokens from max_length (as the previous version did)
+    # or trim older history. Here we simply rely on input truncation.
+    effective_max_length = max_model_input_length
+    if effective_max_length - max_tokens < len(tokenizer.encode(input_text)):
+        # Prompt plus requested output exceeds the model's capacity; rely on truncation below.
+        pass
+
     inputs = tokenizer(
         input_text,
         return_tensors="pt",
+        truncation=True, # Explicitly enable truncation if input is too long
+        max_length=effective_max_length, # Truncate input if it exceeds model capacity
         padding=False, # Avoid unnecessary padding
     ).to(model.device) # Use model.device - will be 'cpu' due to device_map

+    # Generate response with optimized settings for CPU
     with torch.no_grad():
         try:
+            # Use more memory-efficient generation settings for CPU where possible:
+            # num_beams > 1 increases memory significantly, so stick to 1.
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
 
                top_p=top_p,
                do_sample=True,
                eos_token_id=stop_token_id, # Use the verified stop_token_id
+                # Use pad_token_id if available; otherwise eos_token_id is a common fallback
+                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
                repetition_penalty=1.2,
+                use_cache=True, # Enable KV caching (still beneficial on CPU)
+                num_beams=1, # Keep beam search off for lower memory
+                # Multi-token stops not covered by eos_token_id (for chat, IM_END and
+                # f"{IM_START}user" are common) would need a custom StoppingCriteria passed via
+                # stopping_criteria=StoppingCriteriaList([...]); see the sketch below.
+                # Simple approach here: rely on eos_token_id and post-processing.
            )
        except Exception as e:
            print(f"Error during generation: {e}")
            # Fallback to simpler generation settings if needed
            try:
+                print("Attempting simplified generation (greedy decoding)...")
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
+                    do_sample=False, # Force greedy decoding - less memory, potentially faster
                    use_cache=True,
                    eos_token_id=stop_token_id,
                    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
+                    num_beams=1,
                )
            except Exception as e2:
                print(f"Simplified generation also failed: {e2}")
 

     # Force garbage collection after generation
     # This can be helpful to free up memory used during generation
+    del inputs # Delete input tensors
+    # Move output tensors to CPU if they aren't already (should be with device_map="cpu")
+    if 'outputs' in locals() and isinstance(outputs, torch.Tensor):
+        outputs = outputs.cpu()
+        # Don't delete outputs yet, we need it for decoding

     gc.collect()
+    # No need for cuda empty cache on CPU

     # Decode and clean response
     # Ensure outputs is a tensor before decoding
     if isinstance(outputs, torch.Tensor) and outputs.ndim == 2 and outputs.shape[0] > 0:
+        # Decode the whole generated sequence first (including special tokens), then parse
         full_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
     else:
+        print("Warning: Generation output was not a tensor or was empty. Cannot decode.")
         print(f"Output type: {type(outputs)}")
         print(f"Output value: {outputs}")
         return "Error: Failed to generate valid output."

     # --- Parsing Logic (Keep your existing logic, it looks reasonable) ---
+    # Find the start of the assistant's response tag
+    response_start_marker = f"{ASSISTANT_TAG}" # Look for the exact tag
+    response_start_idx = full_text.rfind(response_start_marker)
+
+    response = "Error: Could not parse response." # Default error message
+
+    if response_start_idx != -1:
+        response_start = response_start_idx + len(response_start_marker)
+        # Look for stop sequences *after* the assistant tag
+        stop_sequences = [IM_END, f"{IM_START}user"] # Common chat stops
+
+        earliest_stop = len(full_text) # Default end is the end of the generated text
+
+        for stop_seq in stop_sequences:
+            idx = full_text.find(stop_seq, response_start)
+            if idx != -1:
+                if earliest_stop == len(full_text) or idx < earliest_stop:
+                    earliest_stop = idx
+
+        response = full_text[response_start:earliest_stop].strip()
+
+        # Clean up potential trailing tokens if parsing wasn't perfect
+        response = response.replace(IM_START, "").replace(IM_END, "").replace("user\n", "").replace("assistant\n", "").strip()
+
     else:
+        # Fallback if the assistant tag wasn't found in the output
+        print(f"Warning: Assistant tag '{ASSISTANT_TAG}' not found in generated text. Generated text:")
+        print(full_text)
+        # Attempt a simpler cleanup, just removing special tokens
+        response = full_text.replace(IM_START, "").replace(IM_END, "").replace("user\n", "").replace("assistant\n", "").strip()
+        if not response:
+            response = "Warning: Could not find assistant response tag in generated text."


     return response
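The parsing above searches the full decoded text for the assistant tag because the prompt is decoded together with the completion. A simpler alternative, sketched under the assumption that the inputs tensor is still in scope (app.py deletes it before decoding) and that the module-level IM_START/IM_END constants are available, is to decode only the newly generated tokens:

def extract_reply(outputs, inputs, tokenizer):
    """Sketch: decode only the tokens generated after the prompt, so no tag-parsing is needed."""
    prompt_length = inputs["input_ids"].shape[1]   # number of prompt tokens
    new_tokens = outputs[0][prompt_length:]        # the completion only
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    # Trim at the first chat stop marker, mirroring the stop_sequences list above.
    for stop_seq in (IM_END, f"{IM_START}user"):
        cut = reply.find(stop_seq)
        if cut != -1:
            reply = reply[:cut].strip()
    return reply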
 
         gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature"),
         gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p")
     ],
+    title="Code Reasoning Assistant (CPU)",
+    description="Fine-tuned coding assistant specialized in code reasoning and generation (Running on CPU)",
     theme="soft"
 )

 if __name__ == "__main__":
     # Create offload folder if it doesn't exist
+    OFFLOAD_DIR = "offload_folder"
+    os.makedirs(OFFLOAD_DIR, exist_ok=True)
+    print(f"Offload folder '{os.path.abspath(OFFLOAD_DIR)}' created or already exists.")
+
     # Use lazy loading - only load model when first query arrives
     print("Starting server with lazy model loading...")
+    print(f"Ensure '{os.path.abspath(OFFLOAD_DIR)}' has sufficient disk space for model offloading.")
+    print("Performance will be limited by CPU and disk speed.")
+    print("Monitoring memory (RAM + swap) and disk usage during the first query is highly recommended.")
     demo.queue().launch()
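The startup messages above recommend monitoring RAM and swap during the first query. A small helper along those lines, sketched with psutil (already imported in app.py); the call sites are hypothetical:

import psutil

def log_memory_status(tag: str = "") -> None:
    """Print current RAM and swap usage, as suggested by the startup messages."""
    vm = psutil.virtual_memory()
    sw = psutil.swap_memory()
    print(f"[{tag}] RAM used: {vm.used / 2**30:.1f} GiB / {vm.total / 2**30:.1f} GiB ({vm.percent}%), "
          f"swap used: {sw.used / 2**30:.1f} GiB / {sw.total / 2**30:.1f} GiB")

# Hypothetical usage: call before and after the first query to see the peak cost.
# log_memory_status("before first query")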