johnpaulbin committed
Commit cedd7b9 · verified · 1 Parent(s): 42b5300

Update app.py

Files changed (1):
  1. app.py +200 -137
app.py CHANGED
@@ -6,6 +6,7 @@ from huggingface_hub import hf_hub_download
 import threading
 import queue
 import multiprocessing
+from functools import lru_cache
 
 # First check if GPU is available for maximum speed
 has_gpu = torch.cuda.is_available()
@@ -26,37 +27,22 @@ adapter_path = get_model_path(
     "articulate-V1-q8_0.gguf"
 )
 
-# Set up optimized environment variables for llama-cpp-python
-os.environ["LLAMA_CUBLAS"] = "1" if has_gpu else "0"
+# Optimize environment variables for CPU performance
+os.environ["LLAMA_CUBLAS"] = "0" # Disable CUDA since we're CPU only
 os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
-# For CPU: Use AVX2/AVX512/AVX-VNNI instruction sets if available
-os.environ["LLAMA_AVX"] = "1"
-os.environ["LLAMA_AVX2"] = "1"
-os.environ["LLAMA_F16"] = "1" # Use FP16 where available
+os.environ["LLAMA_AVX"] = "1" # Enable AVX
+os.environ["LLAMA_AVX2"] = "1" # Enable AVX2
+os.environ["LLAMA_F16"] = "1" # Use FP16 where available
 
-# Determine the most optimized backend
-if has_gpu:
-    try:
-        from llama_cpp_python.llama_cpp.llama import Llama as GPULlama
-        LlamaClass = GPULlama
-        print("Using GPU-accelerated llama-cpp-python")
-        n_gpu_layers = -1 # Use all layers on GPU
-    except ImportError:
-        from llama_cpp import Llama
-        LlamaClass = Llama
-        print("Using standard llama-cpp-python with GPU acceleration")
-        n_gpu_layers = -1 # Use all layers on GPU
-else:
-    from llama_cpp import Llama
-    LlamaClass = Llama
-    print("Using CPU-only llama-cpp-python")
-    n_gpu_layers = 0
+# Import the right module
+from llama_cpp import Llama
+print("Using CPU-optimized llama-cpp-python")
 
 # Cache for translations
 translation_cache = {}
-MAX_CACHE_SIZE = 1000
+MAX_CACHE_SIZE = 5000 # Increased cache size
 
-# Pre-compute common translations
+# Common phrases for pre-loading
 COMMON_PHRASES = {
     "English to Spanish": [
         "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
@@ -76,46 +62,59 @@ COMMON_PHRASES = {
     ]
 }
 
-# Background worker for model loading and inference
-class ModelWorker:
-    def __init__(self):
-        self.model = None
+# Implement LRU cache for better performance
+@lru_cache(maxsize=100)
+def get_cached_translation(direction, text):
+    """LRU cache for translations"""
+    return None # This gets bypassed when there's a cache hit
+
+# Create a worker pool for parallel translation
+class ModelWorkerPool:
+    def __init__(self, num_workers=1):
+        self.num_workers = num_workers
         self.request_queue = queue.Queue()
         self.response_queue = queue.Queue()
-        self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
-        self.worker_thread.start()
-
-    def _worker_loop(self):
-        # Initialize model in the worker thread
-        print("Initializing model in background thread...")
+        self.workers = []
+        self.initialized = False
+
+        # Create shared model instance with optimized settings
+        print("Initializing model with CPU optimizations...")
+        start_time = time.time()
 
-        # CPU optimization settings
+        # CPU optimization settings - use fewer threads for Q8 model
        cpu_count = multiprocessing.cpu_count()
-        optimal_threads = max(4, cpu_count - 2) # Leave two cores free
+        optimal_threads = max(1, min(4, cpu_count - 1)) # Use fewer threads for better performance
 
-        # Initialize with the most optimized settings
-        start_time = time.time()
-        self.model = LlamaClass(
+        # Create a smaller context size for faster inference
+        self.model = Llama(
             model_path=base_model_path,
             lora_path=adapter_path,
-            n_ctx=512, # Larger context for longer translations
+            n_ctx=256, # Reduced context for faster processing
             n_threads=optimal_threads, # Optimized thread count
-            n_batch=1024, # Large batch for parallel processing
-            use_mmap=True, # Efficient memory mapping
-            n_gpu_layers=n_gpu_layers, # GPU acceleration if available
-            seed=42, # Consistent results
-            verbose=False, # Reduce overhead
-            main_gpu=0, # Primary GPU
-            tensor_split=None, # Auto-distribute across GPUs if multiple
-            rope_freq_base=10000, # Optimized attention parameters
+            n_batch=512, # Reduced batch size for CPU
+            use_mmap=True, # Efficient memory mapping
+            n_gpu_layers=0, # CPU only
+            seed=42, # Consistent results
+            verbose=False, # Reduce overhead
+            rope_freq_base=10000, # Default attention parameters
             rope_freq_scale=1.0,
         )
         print(f"Model loaded in {time.time() - start_time:.2f} seconds")
 
-        # Pre-warm the model with common phrases
-        self._prewarm_model()
+        # Start worker threads
+        for i in range(num_workers):
+            worker = threading.Thread(target=self._worker_loop, daemon=True)
+            worker.start()
+            self.workers.append(worker)
 
-        # Process requests
+        self.initialized = True
+
+        # Pre-warm in background thread to not block startup
+        warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
+        warming_thread.start()
+
+    def _worker_loop(self):
+        """Worker thread that processes translation requests"""
         while True:
             try:
                 request = self.request_queue.get()
@@ -123,31 +122,51 @@ class ModelWorker:
                     break
 
                 direction, text, callback_id = request
-                result = self._process_translation(direction, text)
+
+                # Check LRU cache first
+                cached = get_cached_translation(direction, text)
+                if cached is not None:
+                    self.response_queue.put((callback_id, cached))
+                    self.request_queue.task_done()
+                    continue
+
+                # Check regular cache
+                cache_key = f"{direction}:{text}"
+                if cache_key in translation_cache:
+                    result = translation_cache[cache_key]
+                else:
+                    # Process new translation
+                    result = self._process_translation(direction, text)
+                    # Store in regular cache
+                    if len(translation_cache) >= MAX_CACHE_SIZE:
+                        translation_cache.pop(next(iter(translation_cache)))
+                    translation_cache[cache_key] = result
+
                 self.response_queue.put((callback_id, result))
+                self.request_queue.task_done()
             except Exception as e:
                 print(f"Error in worker thread: {e}")
                 self.response_queue.put((callback_id, f"Error: {str(e)}"))
+                self.request_queue.task_done()
 
     def _prewarm_model(self):
-        """Pre-compute common translations to warm up the model"""
-        print("Pre-warming model with common phrases...")
+        """Pre-compute common translations to warm up the model - minimal to save time"""
+        print("Pre-warming model with essential phrases (truncated for speed)...")
         start = time.time()
+
+        # Just warm up with one phrase per direction to speed up startup
         for direction, phrases in COMMON_PHRASES.items():
-            for phrase in phrases[:3]: # Just do a few to warm up
-                self._process_translation(direction, phrase)
-        print(f"Model pre-warming completed in {time.time() - start:.2f} seconds")
+            self._process_translation(direction, phrases[0])
+            # Only do the most common phrase to save startup time
+
+        print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")
 
     def _process_translation(self, direction, text):
+        """Optimized translation function"""
         # Skip empty inputs
         if not text or not text.strip():
             return ""
 
-        # Check cache first for faster response
-        cache_key = f"{direction}:{text}"
-        if cache_key in translation_cache:
-            return translation_cache[cache_key]
-
         # Start timing for performance tracking
         start_time = time.time()
 
@@ -164,147 +183,191 @@ class ModelWorker:
 
         source_lang, target_lang = lang_map[direction]
 
+        # Truncate long inputs for faster processing
+        max_input_length = 100 # Limit input length
+        if len(text) > max_input_length:
+            text = text[:max_input_length] + "..."
+
         # Efficient prompt format
         prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
 
-        # Estimate appropriate token length based on input
+        # Reduce max tokens for faster inference
         input_tokens = len(text.split())
-        max_tokens = min(200, max(50, int(input_tokens * 1.5)))
-
-        # Generate translation with optimized settings
-        response = self.model.create_completion(
-            prompt,
-            max_tokens=max_tokens,
-            temperature=0.0, # Deterministic for faster inference
-            top_k=1, # Only consider most likely token
-            top_p=1.0, # No sampling
-            repeat_penalty=1.0, # No repeat penalty
-            stream=False # Get complete response at once
-        )
-
-        translation = response['choices'][0]['text'].strip()
+        max_tokens = min(50, max(20, int(input_tokens * 1.2)))
 
-        # Cache result
-        if len(translation_cache) >= MAX_CACHE_SIZE:
-            # Remove oldest entry (first key)
-            translation_cache.pop(next(iter(translation_cache)))
-        translation_cache[cache_key] = translation
-
-        # Log performance
-        inference_time = time.time() - start_time
-        tokens_per_second = (input_tokens + len(translation.split())) / inference_time
-        print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
-
-        return translation
+        # Generate translation with aggressive performance optimizations
+        try:
+            response = self.model.create_completion(
+                prompt,
+                max_tokens=max_tokens,
+                temperature=0.0, # Deterministic for faster inference
+                top_k=1, # Only consider most likely token
+                top_p=1.0, # No sampling
+                repeat_penalty=1.0, # No repeat penalty
+                stream=False, # Get complete response at once
+                stop=["[/", "\n\n"], # Stop early if possible
+            )
+
+            translation = response['choices'][0]['text'].strip()
+
+            # Log performance
+            inference_time = time.time() - start_time
+            tokens_per_second = (input_tokens + len(translation.split())) / inference_time
+            print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
+
+            return translation
+        except Exception as e:
+            print(f"Translation error: {e}")
+            return f"Error: Could not translate text. Try shorter input."
 
     def request_translation(self, direction, text, callback_id):
         """Queue a translation request"""
         self.request_queue.put((direction, text, callback_id))
 
-# Create worker instance
-worker = ModelWorker()
+# Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
+worker_pool = ModelWorkerPool(num_workers=1)
 
 # Counter for request IDs
 next_request_id = 0
 
+# Fast similarity check function for finding close matches in cache
+def find_similar_cached(direction, text, threshold=0.8):
+    """Find similar translations in cache based on prefix matching"""
+    if len(text) < 5: # For very short inputs, look for exact matches
+        return None
+
+    text_lower = text.lower()
+    best_match = None
+    best_score = 0
+
+    for cached_key in list(translation_cache.keys()):
+        cached_dir, cached_text = cached_key.split(":", 1)
+        if cached_dir != direction:
+            continue
+
+        # Simple similarity - prefix matching
+        if cached_text.lower().startswith(text_lower[:5]):
+            similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
+            if similarity > best_score and similarity > threshold:
+                best_score = similarity
+                best_match = translation_cache[cached_key]
+
+    return best_match
+
 # Gradio interface functions
 def translate(direction, text, progress=gr.Progress()):
-    """Queue translation request and wait for result"""
+    """Queue translation request and wait for result - optimized version"""
     global next_request_id
 
-    # Check cache first for immediate response
+    # Trim whitespace for better cache hits
+    text = text.strip()
+
+    # Skip empty inputs
+    if not text:
+        return ""
+
+    # Check LRU cache first
+    cached = get_cached_translation(direction, text)
+    if cached is not None:
+        return cached
+
+    # Check main cache
     cache_key = f"{direction}:{text}"
     if cache_key in translation_cache:
         return translation_cache[cache_key]
 
-    # If input is very short, check if we have a similar cached phrase
+    # For short inputs, try to find similar cached
     if len(text) < 20:
-        for cached_key in translation_cache:
-            cached_dir, cached_text = cached_key.split(":", 1)
-            if cached_dir == direction and cached_text.lower().startswith(text.lower()):
-                return translation_cache[cached_key]
+        similar = find_similar_cached(direction, text)
+        if similar:
+            return similar
 
     # Generate unique request ID
     request_id = next_request_id
     next_request_id += 1
 
     # Queue the request
-    worker.request_translation(direction, text, request_id)
+    worker_pool.request_translation(direction, text, request_id)
 
-    # Wait for the response (with progress feedback)
+    # Wait for the response with reasonable timeout
     progress(0, desc="Translating...")
-    max_wait = 30 # Maximum wait time in seconds
+    max_wait = 20 # Reduced maximum wait time
     start_time = time.time()
 
+    # Show progress while waiting
     while time.time() - start_time < max_wait:
         progress((time.time() - start_time) / max_wait)
 
        # Check for our response
        try:
-            while not worker.response_queue.empty():
-                resp_id, result = worker.response_queue.get_nowait()
+            while not worker_pool.response_queue.empty():
+                resp_id, result = worker_pool.response_queue.get_nowait()
                 if resp_id == request_id:
+                    # Update LRU cache
+                    get_cached_translation.__wrapped__.__defaults__ = (result,)
                    progress(1.0)
                    return result
        except queue.Empty:
            pass
 
-        # Small sleep to prevent CPU hogging
-        time.sleep(0.05)
+        # Small sleep to prevent CPU hogging - reduced for faster response
+        time.sleep(0.01)
 
     progress(1.0)
-    return "Translation timed out. Please try again."
+    return "Translation timed out. Please try a shorter text."
 
-# Create Gradio interface
-with gr.Blocks(title="Ultra-Fast Translation App") as iface:
+# Create Gradio interface with simplified UI for performance
+with gr.Blocks(title="Fast CPU Translation App") as iface:
     gr.Markdown(f"""
-    ## Ultra-Fast Translation App
-    Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only'}
+    ## Fast CPU Translation App
+    Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only - Optimized'}
+    **For best performance, use short sentences or phrases.**
     """)
 
     with gr.Row():
-        direction = gr.Dropdown(
-            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
+        direction = gr.Radio(
+            choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
             label="Translation Direction",
             value="English to Spanish"
        )
 
     with gr.Row():
-        input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
-        output_text = gr.Textbox(lines=5, label="Translation")
+        input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
+        output_text = gr.Textbox(lines=3, label="Translation")
 
     # Add translate button
     translate_btn = gr.Button("Translate")
     translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
 
-    # Optimization options
-    with gr.Accordion("Advanced Options", open=False):
-        gr.Markdown("""
-        ### Performance Tips
-        - Short sentences translate faster than long paragraphs
-        - Common phrases may be cached for instant results
-        - First translation might be slower as the model warms up
-        """)
-
-    # Add examples with preloaded common phrases
+    # Add examples with common short phrases for quick results
     gr.Examples(
         examples=[
-            ["English to Spanish", "Hello, how are you today?"],
-            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
-            ["English to Korean", "The weather is nice today."],
-            ["Korean to English", "안녕하세요, 만나서 반갑습니다."]
+            ["English to Spanish", "Hello"],
+            ["Spanish to English", "Hola"],
+            ["English to Korean", "Thank you"],
+            ["Korean to English", "감사합니다"]
        ],
        inputs=[direction, input_text],
        fn=translate,
        outputs=output_text
     )
+
+    # Add performance tips
+    gr.Markdown("""
+    ### Performance Tips
+    - Keep text under 50 characters for fastest results
+    - Common phrases are pre-cached
+    - First translation may be slow, subsequent ones faster
+    - Frequently used phrases use an LRU cache for speed
+    """)
 
 # Launch with optimized settings
-iface.launch(
-    debug=False,
-    show_error=True,
-    share=False, # Don't share publicly by default
-    quiet=True, # Reduce console output
-    server_name="0.0.0.0",
-    server_port=7860
-)
+if __name__ == "__main__":
+    iface.launch(
+        debug=False,
+        show_error=True,
+        share=False,
+        quiet=True,
+        server_name="0.0.0.0",
+        server_port=7860
+    )