johnpaulbin committed on
Commit 3ca3b8c · verified · 1 Parent(s): e73b223

Update app.py

Files changed (1)
  1. app.py +350 -233
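The diff below replaces the old in-process ModelWorkerPool with a single ModelWorker that owns the llama.cpp model in a background thread and talks to the Gradio handler through a request/response queue pair. As a reading aid, here is a minimal, self-contained sketch of that queue-based pattern; the fake_translate stand-in is illustrative only and is not part of the commit.

import queue
import threading
import time

def fake_translate(direction, text):
    # Stand-in for the llama.cpp call made inside ModelWorker._process_translation.
    time.sleep(0.1)
    return f"[{direction}] {text}"

request_q, response_q = queue.Queue(), queue.Queue()

def worker_loop():
    # Background thread owns the expensive resource and serves queued requests.
    while True:
        item = request_q.get()
        if item is None:  # shutdown signal
            break
        direction, text, req_id = item
        response_q.put((req_id, fake_translate(direction, text)))

threading.Thread(target=worker_loop, daemon=True).start()

# Caller side: enqueue a request, then poll for the matching id with a timeout,
# which is what the Gradio translate() handler in the diff does.
request_q.put(("English to Spanish", "Hello", 0))
deadline = time.time() + 5
while time.time() < deadline:
    try:
        req_id, result = response_q.get(timeout=0.05)
    except queue.Empty:
        continue
    if req_id == 0:
        print(result)
        break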
app.py CHANGED
@@ -1,23 +1,72 @@
1
  import os
2
  import time
3
- import torch
4
- import gradio as gr
5
- from huggingface_hub import hf_hub_download
6
  import threading
7
  import queue
8
  import multiprocessing
9
- from functools import lru_cache
10
 
11
- # First check if GPU is available for maximum speed
12
  has_gpu = torch.cuda.is_available()
13
  gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
14
  print(f"GPU available: {has_gpu} - {gpu_name}")
15
 
16
  # Download model files
17
  def get_model_path(repo_id, filename):
18
  print(f"Obtaining {filename}...")
19
- return hf_hub_download(repo_id=repo_id, filename=filename)
20
 
 
21
  base_model_path = get_model_path(
22
  "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
23
  "articulate-11-expspanish-base-merged-q8_0.gguf"
@@ -27,146 +76,120 @@ adapter_path = get_model_path(
27
  "articulate-V1-q8_0.gguf"
28
  )
29
 
30
- # Optimize environment variables for CPU performance
31
- os.environ["LLAMA_CUBLAS"] = "0" # Disable CUDA since we're CPU only
32
- os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
33
- os.environ["LLAMA_AVX"] = "1" # Enable AVX
34
- os.environ["LLAMA_AVX2"] = "1" # Enable AVX2
35
- os.environ["LLAMA_F16"] = "1" # Use FP16 where available
36
 
37
- # Import the right module
38
  from llama_cpp import Llama
39
- print("Using CPU-optimized llama-cpp-python")
40
 
41
- # Cache for translations
42
  translation_cache = {}
43
- MAX_CACHE_SIZE = 5000 # Increased cache size
44
-
45
- # Common phrases for pre-loading
46
- COMMON_PHRASES = {
47
- "English to Spanish": [
48
- "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
49
- "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
50
- ],
51
- "Spanish to English": [
52
- "Hola", "Gracias", "Buenos dรญas", "ยฟCรณmo estรกs?", "ยฟCรณmo te llamas?",
53
- "No entiendo", "Por favor", "Lo siento", "Sรญ", "No", "Dรณnde estรก"
54
- ],
55
- "English to Korean": [
56
- "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
57
- "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
58
- ],
59
- "Korean to English": [
60
- "์•ˆ๋…•ํ•˜์„ธ์š”", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค", "์ข‹์€ ์•„์นจ์ž…๋‹ˆ๋‹ค", "์–ด๋–ป๊ฒŒ ์ง€๋‚ด์„ธ์š”?", "์ด๋ฆ„์ด ๋ญ์˜ˆ์š”?",
61
- "์ดํ•ด๊ฐ€ ์•ˆ ๋ผ์š”", "์ œ๋ฐœ", "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค", "๋„ค", "์•„๋‹ˆ์š”", "์–ด๋””์— ์žˆ์–ด์š”"
62
- ]
63
- }
64
-
65
- # Implement LRU cache for better performance
66
- @lru_cache(maxsize=100)
67
- def get_cached_translation(direction, text):
68
- """LRU cache for translations"""
69
- return None # This gets bypassed when there's a cache hit
70
 
71
- # Create a worker pool for parallel translation
72
- class ModelWorkerPool:
73
- def __init__(self, num_workers=1):
74
- self.num_workers = num_workers
75
  self.request_queue = queue.Queue()
76
  self.response_queue = queue.Queue()
77
- self.workers = []
78
- self.initialized = False
79
-
80
- # Create shared model instance with optimized settings
81
- print("Initializing model with CPU optimizations...")
82
- start_time = time.time()
83
-
84
- # CPU optimization settings - use fewer threads for Q8 model
85
- cpu_count = multiprocessing.cpu_count()
86
- optimal_threads = max(1, min(4, cpu_count - 1)) # Use fewer threads for better performance
87
-
88
- # Create a smaller context size for faster inference
89
- self.model = Llama(
90
- model_path=base_model_path,
91
- lora_path=adapter_path,
92
- n_ctx=256, # Reduced context for faster processing
93
- n_threads=optimal_threads, # Optimized thread count
94
- n_batch=512, # Reduced batch size for CPU
95
- use_mmap=True, # Efficient memory mapping
96
- n_gpu_layers=0, # CPU only
97
- seed=42, # Consistent results
98
- verbose=False, # Reduce overhead
99
- rope_freq_base=10000, # Default attention parameters
100
- rope_freq_scale=1.0,
101
- )
102
- print(f"Model loaded in {time.time() - start_time:.2f} seconds")
103
-
104
- # Start worker threads
105
- for i in range(num_workers):
106
- worker = threading.Thread(target=self._worker_loop, daemon=True)
107
- worker.start()
108
- self.workers.append(worker)
109
-
110
- self.initialized = True
111
-
112
- # Pre-warm in background thread to not block startup
113
- warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
114
- warming_thread.start()
115
 
116
- def _worker_loop(self):
117
- """Worker thread that processes translation requests"""
118
  while True:
119
  try:
 
120
  request = self.request_queue.get()
121
- if request is None: # Shutdown signal
122
  break
123
-
124
- direction, text, callback_id = request
125
 
126
- # Check LRU cache first
127
- cached = get_cached_translation(direction, text)
128
- if cached is not None:
129
- self.response_queue.put((callback_id, cached))
130
- self.request_queue.task_done()
131
- continue
132
 
133
- # Check regular cache
134
- cache_key = f"{direction}:{text}"
135
- if cache_key in translation_cache:
136
- result = translation_cache[cache_key]
137
- else:
138
- # Process new translation
139
- result = self._process_translation(direction, text)
140
- # Store in regular cache
141
- if len(translation_cache) >= MAX_CACHE_SIZE:
142
- translation_cache.pop(next(iter(translation_cache)))
143
- translation_cache[cache_key] = result
144
 
145
- self.response_queue.put((callback_id, result))
146
- self.request_queue.task_done()
147
  except Exception as e:
148
- print(f"Error in worker thread: {e}")
149
- self.response_queue.put((callback_id, f"Error: {str(e)}"))
150
- self.request_queue.task_done()
151
 
152
- def _prewarm_model(self):
153
- """Pre-compute common translations to warm up the model - minimal to save time"""
154
- print("Pre-warming model with essential phrases (truncated for speed)...")
155
- start = time.time()
156
-
157
- # Just warm up with one phrase per direction to speed up startup
158
- for direction, phrases in COMMON_PHRASES.items():
159
- self._process_translation(direction, phrases[0])
160
- # Only do the most common phrase to save startup time
161
 
162
- print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")
163
 
164
  def _process_translation(self, direction, text):
165
- """Optimized translation function"""
166
- # Skip empty inputs
167
  if not text or not text.strip():
168
  return ""
169
 
 
170
  # Start timing for performance tracking
171
  start_time = time.time()
172
 
@@ -183,185 +206,279 @@ class ModelWorkerPool:
183
 
184
  source_lang, target_lang = lang_map[direction]
185
 
186
- # Truncate long inputs for faster processing
187
- max_input_length = 100 # Limit input length
188
- if len(text) > max_input_length:
189
- text = text[:max_input_length] + "..."
190
-
191
  # Efficient prompt format
192
  prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
193
 
194
- # Reduce max tokens for faster inference
195
- input_tokens = len(text.split())
196
- max_tokens = min(50, max(20, int(input_tokens * 1.2)))
197
 
198
- # Generate translation with aggressive performance optimizations
199
- try:
200
- response = self.model.create_completion(
201
- prompt,
202
- max_tokens=max_tokens,
203
- temperature=0.0, # Deterministic for faster inference
204
- top_k=1, # Only consider most likely token
205
- top_p=1.0, # No sampling
206
- repeat_penalty=1.0, # No repeat penalty
207
- stream=False, # Get complete response at once
208
- stop=["[/", "\n\n"], # Stop early if possible
209
- )
210
-
211
- translation = response['choices'][0]['text'].strip()
212
-
213
- # Log performance
214
- inference_time = time.time() - start_time
215
- tokens_per_second = (input_tokens + len(translation.split())) / inference_time
216
- print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
217
-
218
- return translation
219
- except Exception as e:
220
- print(f"Translation error: {e}")
221
- return f"Error: Could not translate text. Try shorter input."
 
222
 
223
  def request_translation(self, direction, text, callback_id):
224
  """Queue a translation request"""
225
  self.request_queue.put((direction, text, callback_id))
226
 
227
- # Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
228
- worker_pool = ModelWorkerPool(num_workers=1)
229
 
230
  # Counter for request IDs
231
  next_request_id = 0
232
 
233
- # Fast similarity check function for finding close matches in cache
234
- def find_similar_cached(direction, text, threshold=0.8):
235
- """Find similar translations in cache based on prefix matching"""
236
- if len(text) < 5: # For very short inputs, look for exact matches
237
- return None
238
-
239
- text_lower = text.lower()
240
- best_match = None
241
- best_score = 0
242
 
243
- for cached_key in list(translation_cache.keys()):
244
- cached_dir, cached_text = cached_key.split(":", 1)
245
- if cached_dir != direction:
246
  continue
247
 
248
- # Simple similarity - prefix matching
249
- if cached_text.lower().startswith(text_lower[:5]):
250
- similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
251
- if similarity > best_score and similarity > threshold:
252
- best_score = similarity
253
- best_match = translation_cache[cached_key]
 
 
254
 
255
- return best_match
256
 
257
  # Gradio interface functions
258
  def translate(direction, text, progress=gr.Progress()):
259
- """Queue translation request and wait for result - optimized version"""
260
  global next_request_id
261
 
262
- # Trim whitespace for better cache hits
263
- text = text.strip()
264
-
265
  # Skip empty inputs
266
- if not text:
267
  return ""
268
 
269
- # Check LRU cache first
270
- cached = get_cached_translation(direction, text)
271
- if cached is not None:
272
- return cached
273
-
274
- # Check main cache
275
  cache_key = f"{direction}:{text}"
276
  if cache_key in translation_cache:
277
  return translation_cache[cache_key]
278
 
279
- # For short inputs, try to find similar cached
280
- if len(text) < 20:
281
- similar = find_similar_cached(direction, text)
282
- if similar:
283
- return similar
284
 
285
- # Generate unique request ID
286
  request_id = next_request_id
287
  next_request_id += 1
288
 
289
  # Queue the request
290
- worker_pool.request_translation(direction, text, request_id)
291
 
292
- # Wait for the response with reasonable timeout
293
- progress(0, desc="Translating...")
294
- max_wait = 20 # Reduced maximum wait time
295
  start_time = time.time()
 
296
 
297
- # Show progress while waiting
298
  while time.time() - start_time < max_wait:
299
- progress((time.time() - start_time) / max_wait)
300
 
301
  # Check for our response
302
  try:
303
- while not worker_pool.response_queue.empty():
304
- resp_id, result = worker_pool.response_queue.get_nowait()
305
  if resp_id == request_id:
306
- # Update LRU cache
307
- get_cached_translation.__wrapped__.__defaults__ = (result,)
308
  progress(1.0)
309
  return result
310
  except queue.Empty:
311
  pass
312
 
313
- # Small sleep to prevent CPU hogging - reduced for faster response
314
- time.sleep(0.01)
315
 
316
  progress(1.0)
317
- return "Translation timed out. Please try a shorter text."
318
 
319
- # Create Gradio interface with simplified UI for performance
320
- with gr.Blocks(title="Fast CPU Translation App") as iface:
321
  gr.Markdown(f"""
322
- ## Fast CPU Translation App
323
- Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only - Optimized'}
324
- **For best performance, use short sentences or phrases.**
325
  """)
326
 
327
  with gr.Row():
328
- direction = gr.Radio(
329
- choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
330
  label="Translation Direction",
331
  value="English to Spanish"
332
  )
333
 
334
  with gr.Row():
335
- input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
336
- output_text = gr.Textbox(lines=3, label="Translation")
337
 
338
  # Add translate button
339
  translate_btn = gr.Button("Translate")
340
  translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
341
 
342
- # Add examples with common short phrases for quick results
343
  gr.Examples(
344
  examples=[
345
- ["English to Spanish", "Hello"],
346
- ["Spanish to English", "Hola"],
347
- ["English to Korean", "Thank you"],
348
- ["Korean to English", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค"]
349
  ],
350
  inputs=[direction, input_text],
351
  fn=translate,
352
  outputs=output_text
353
  )
354
-
355
- # Add performance tips
356
- gr.Markdown("""
357
- ### Performance Tips
358
- - Keep text under 50 characters for fastest results
359
- - Common phrases are pre-cached
360
- - First translation may be slow, subsequent ones faster
361
- - Frequently used phrases use an LRU cache for speed
362
- """)
363
-
364
 
365
- iface.launch(
366
- show_error=True,
367
- )
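Both the removed worker code above and the new _process_translation in the rewritten file cap the cache with translation_cache.pop(next(iter(translation_cache))). A small sketch of why that idiom works: dicts preserve insertion order in Python 3.7+, so popping the first key is FIFO eviction of the oldest entry (not true LRU, since lookups do not refresh an entry's position).

cache = {}
MAX_CACHE_SIZE = 3
for key in ["a", "b", "c", "d"]:
    if len(cache) >= MAX_CACHE_SIZE:
        cache.pop(next(iter(cache)))  # evicts "a", the oldest inserted key
    cache[key] = key.upper()
print(list(cache))  # ['b', 'c', 'd']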
 
1
  import os
2
  import time
 
 
 
3
  import threading
4
  import queue
5
  import multiprocessing
6
+ from pathlib import Path
7
+ import torch
8
+ import gradio as gr
9
+ from huggingface_hub import hf_hub_download
10
+ import numpy as np
11
+
12
+ # Set up environment variables for CPU optimization
13
+ os.environ["OMP_NUM_THREADS"] = str(max(1, multiprocessing.cpu_count() - 1)) # Optimal OpenMP threads
14
+ os.environ["MKL_NUM_THREADS"] = str(max(1, multiprocessing.cpu_count() - 1)) # Optimal MKL threads
15
+ os.environ["LLAMA_AVX"] = "1"
16
+ os.environ["LLAMA_AVX2"] = "1"
17
+ os.environ["LLAMA_F16"] = "1"
18
+
19
+ # Cache directories
20
+ CACHE_DIR = Path.home() / ".cache" / "fast_translate"
21
+ MODEL_CACHE = CACHE_DIR / "models"
22
+ QUANTIZED_CACHE = CACHE_DIR / "quantized"
23
+ os.makedirs(MODEL_CACHE, exist_ok=True)
24
+ os.makedirs(QUANTIZED_CACHE, exist_ok=True)
25
 
26
+ # Check if we're running on CPU
27
  has_gpu = torch.cuda.is_available()
28
  gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
29
  print(f"GPU available: {has_gpu} - {gpu_name}")
30
 
31
+ # Configure CPU settings
32
+ cpu_count = multiprocessing.cpu_count()
33
+ optimal_threads = max(4, cpu_count - 1) # Leave one core free
34
+ print(f"Using {optimal_threads} of {cpu_count} CPU cores")
35
+
36
  # Download model files
37
  def get_model_path(repo_id, filename):
38
  print(f"Obtaining {filename}...")
39
+ # Download to our custom cache location
40
+ return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=MODEL_CACHE)
41
+
42
+ # Function to quantize model to int4 or int8
43
+ def quantize_model(input_model_path, output_model_path, quantization_type="q4_0"):
44
+ """Quantize model to lower precision for faster inference on CPU"""
45
+ try:
46
+ from llama_cpp import llama_model_quantize
47
+
48
+ # Check if quantized model already exists
49
+ if os.path.exists(output_model_path):
50
+ print(f"Using existing quantized model: {output_model_path}")
51
+ return output_model_path
52
+
53
+ print(f"Quantizing model to {quantization_type}...")
54
+ start_time = time.time()
55
+
56
+ # Quantize using llama-cpp-python built-in quantization
57
+ llama_model_quantize(
58
+ input_model_path,
59
+ output_model_path,
60
+ quantization_type
61
+ )
62
+
63
+ print(f"Quantization completed in {time.time() - start_time:.2f}s")
64
+ return output_model_path
65
+ except Exception as e:
66
+ print(f"Quantization failed: {e}, using original model")
67
+ return input_model_path
68
 
69
+ # Download models
70
  base_model_path = get_model_path(
71
  "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
72
  "articulate-11-expspanish-base-merged-q8_0.gguf"
 
76
  "articulate-V1-q8_0.gguf"
77
  )
78
 
79
+ # Quantize models (creates int4 versions for faster CPU inference)
80
+ quantized_base_path = str(QUANTIZED_CACHE / "articulate-base-q4_0.gguf")
81
+ quantized_adapter_path = str(QUANTIZED_CACHE / "articulate-adapter-q4_0.gguf")
82
+ base_model_path = quantize_model(base_model_path, quantized_base_path, "q4_0")
83
+ adapter_path = quantize_model(adapter_path, quantized_adapter_path, "q4_0")
 
84
 
85
+ # Import after setting environment variables
86
  from llama_cpp import Llama
 
87
 
88
+ # Translation cache
89
  translation_cache = {}
90
+ MAX_CACHE_SIZE = 1000
91
 
92
+ # Model worker with batching support
93
+ class ModelWorker:
94
+ def __init__(self):
95
+ self.model = None
96
  self.request_queue = queue.Queue()
97
  self.response_queue = queue.Queue()
98
+ self.batch_queue = []
99
+ self.batch_event = threading.Event()
100
+ self.batch_size = 4 # Process up to 4 requests at once
101
+ self.batch_timeout = 0.1 # Wait 100ms max to collect batch
102
+ self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
103
+ self.batch_thread = threading.Thread(target=self._batch_loop, daemon=True)
104
+ self.worker_thread.start()
105
+ self.batch_thread.start()
106
 
107
+ def _batch_loop(self):
108
+ """Collect requests into batches for more efficient processing"""
109
  while True:
110
  try:
111
+ # Get a request
112
  request = self.request_queue.get()
113
+ if request is None:
114
  break
 
 
115
 
116
+ # Add to batch
117
+ self.batch_queue.append(request)
118
+
119
+ # Try to collect more requests for the batch
120
+ batch_start = time.time()
121
+ while (len(self.batch_queue) < self.batch_size and
122
+ time.time() - batch_start < self.batch_timeout):
123
+ try:
124
+ req = self.request_queue.get_nowait()
125
+ if req is None:
126
+ break
127
+ self.batch_queue.append(req)
128
+ except queue.Empty:
129
+ time.sleep(0.01)
130
 
131
+ # Signal worker to process the batch
132
+ current_batch = self.batch_queue.copy()
133
+ self.batch_queue = []
134
+ for req in current_batch:
135
+ self._process_request(req)
136
 
 
 
137
  except Exception as e:
138
+ print(f"Error in batch thread: {e}")
 
 
139
 
140
+ def _worker_loop(self):
141
+ """Initialize model and process requests"""
142
+ try:
143
+ # Initialize model with optimized settings
144
+ print("Initializing model in background thread...")
145
+ start_time = time.time()
146
+
147
+ # Create model context with very optimized settings for CPU
148
+ self.model = Llama(
149
+ model_path=base_model_path,
150
+ lora_path=adapter_path,
151
+ n_ctx=256, # Smaller context for speed
152
+ n_threads=optimal_threads, # Use all but one CPU core
153
+ n_batch=512, # Smaller batch for CPU
154
+ use_mmap=True, # Memory mapping (more efficient)
155
+ n_gpu_layers=0, # Force CPU only
156
+ seed=42, # Consistent results
157
+ rope_freq_base=10000, # Default RoPE settings
158
+ rope_freq_scale=1.0,
159
+ verbose=False # Reduce overhead
160
+ )
161
 
162
+ print(f"Model loaded in {time.time() - start_time:.2f} seconds")
163
+
164
+ # Pre-warm the model with common phrases by running a simple inference
165
+ print("Pre-warming model...")
166
+ self.model.create_completion("[ENGLISH]hello[SPANISH]", max_tokens=8)
167
+ print("Model ready for translation")
168
+
169
+ except Exception as e:
170
+ print(f"Failed to initialize model: {e}")
171
+
172
+ def _process_request(self, request):
173
+ """Process a single translation request"""
174
+ try:
175
+ direction, text, callback_id = request
176
+ result = self._process_translation(direction, text)
177
+ self.response_queue.put((callback_id, result))
178
+ except Exception as e:
179
+ print(f"Error processing request: {e}")
180
+ self.response_queue.put((callback_id, f"Error: {str(e)}"))
181
 
182
  def _process_translation(self, direction, text):
183
+ """Translate text with optimized settings"""
 
184
  if not text or not text.strip():
185
  return ""
186
 
187
+ # Check cache first for faster response
188
+ cache_key = f"{direction}:{text}"
189
+ if cache_key in translation_cache:
190
+ print("Cache hit!")
191
+ return translation_cache[cache_key]
192
+
193
  # Start timing for performance tracking
194
  start_time = time.time()
195
 
 
206
 
207
  source_lang, target_lang = lang_map[direction]
208
 
209
  # Efficient prompt format
210
  prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
211
 
212
+ # Estimate appropriate token length based on input
213
+ input_tokens = min(100, max(10, len(text.split())))
214
+ max_tokens = min(100, max(25, int(input_tokens * 1.3)))
215
 
216
+ # Generate translation with aggressively optimized settings for speed
217
+ response = self.model.create_completion(
218
+ prompt,
219
+ max_tokens=max_tokens,
220
+ temperature=0.0, # Deterministic
221
+ top_k=1, # Most likely token
222
+ top_p=1.0, # No sampling
223
+ repeat_penalty=1.0, # No penalty
224
+ stream=False # Get complete response
225
+ )
226
+
227
+ translation = response['choices'][0]['text'].strip()
228
+
229
+ # Cache result
230
+ if len(translation_cache) >= MAX_CACHE_SIZE:
231
+ # Remove oldest entry (first key)
232
+ translation_cache.pop(next(iter(translation_cache)))
233
+ translation_cache[cache_key] = translation
234
+
235
+ # Log performance
236
+ inference_time = time.time() - start_time
237
+ tokens_per_second = (input_tokens + len(translation.split())) / inference_time
238
+ print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
239
+
240
+ return translation
241
 
242
  def request_translation(self, direction, text, callback_id):
243
  """Queue a translation request"""
244
  self.request_queue.put((direction, text, callback_id))
245
 
246
+ # Model preloading thread that preloads and pre-computes common translations
247
+ def preload_common_phrases(worker):
248
+ # Dictionary of common phrases that will benefit from caching
249
+ common_phrases = {
250
+ "English to Spanish": [
251
+ "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
252
+ "I don't understand", "Please", "Sorry", "Yes", "No", "Where is",
253
+ "How much does it cost?", "What time is it?", "I don't speak Spanish",
254
+ "Where is the bathroom?", "I need help", "Can you help me?"
255
+ ],
256
+ "Spanish to English": [
257
+ "Hola", "Gracias", "Buenos dรญas", "ยฟCรณmo estรกs?", "ยฟCรณmo te llamas?",
258
+ "No entiendo", "Por favor", "Lo siento", "Sรญ", "No", "Dรณnde estรก",
259
+ "ยฟCuรกnto cuesta?", "ยฟQuรฉ hora es?", "No hablo espaรฑol", "ยฟDรณnde estรก el baรฑo?",
260
+ "Necesito ayuda", "ยฟPuedes ayudarme?"
261
+ ],
262
+ "English to Korean": [
263
+ "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
264
+ "I don't understand", "Please", "Sorry", "Yes", "No", "Where is",
265
+ "How much is this?", "What time is it?", "I don't speak Korean"
266
+ ],
267
+ "Korean to English": [
268
+ "์•ˆ๋…•ํ•˜์„ธ์š”", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค", "์ข‹์€ ์•„์นจ์ž…๋‹ˆ๋‹ค", "์–ด๋–ป๊ฒŒ ์ง€๋‚ด์„ธ์š”?", "์ด๋ฆ„์ด ๋ญ์˜ˆ์š”?",
269
+ "์ดํ•ด๊ฐ€ ์•ˆ ๋ผ์š”", "์ œ๋ฐœ", "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค", "๋„ค", "์•„๋‹ˆ์š”", "์–ด๋””์— ์žˆ์–ด์š”",
270
+ "์ด๊ฑฐ ์–ผ๋งˆ์˜ˆ์š”?", "์ง€๊ธˆ ๋ช‡ ์‹œ์˜ˆ์š”?", "ํ•œ๊ตญ์–ด๋ฅผ ๋ชปํ•ด์š”"
271
+ ]
272
+ }
273
+
274
+ preload_requests = []
275
+ for direction, phrases in common_phrases.items():
276
+ for phrase in phrases:
277
+ preload_requests.append((direction, phrase, f"preload_{len(preload_requests)}"))
278
+
279
+ # Process preloading in a separate thread
280
+ def preloader():
281
+ print(f"Preloading {len(preload_requests)} common phrases in background...")
282
+ for request in preload_requests:
283
+ worker.request_translation(*request)
284
+ # Small sleep to avoid overwhelming the queue
285
+ time.sleep(0.1)
286
+ print("Preloading complete")
287
+
288
+ thread = threading.Thread(target=preloader, daemon=True)
289
+ thread.start()
290
+ return thread
291
+
292
+ # Create worker instance
293
+ worker = ModelWorker()
294
+
295
+ # Start preloading common phrases in background
296
+ preload_thread = preload_common_phrases(worker)
297
 
298
  # Counter for request IDs
299
  next_request_id = 0
300
 
301
+ # Implementation of a faster sentence splitter for batching
302
+ def split_sentences(text, max_length=50):
303
+ """Split text into manageable chunks for faster translation"""
304
+ if len(text) <= max_length:
305
306
 
307
+ # Split on natural boundaries
308
+ delimiters = ['. ', '! ', '? ', '.\n', '!\n', '?\n', '\n\n']
309
+ chunks = []
310
+ current_chunk = ""
311
+
312
+ lines = text.split('\n')
313
+ for line in lines:
314
+ if not line.strip():
315
+ if current_chunk:
316
+ chunks.append(current_chunk)
317
+ current_chunk = ""
318
  continue
319
 
320
+ words = line.split(' ')
321
+ for word in words:
322
+ test_chunk = f"{current_chunk} {word}".strip()
323
+ if len(test_chunk) > max_length:
324
+ chunks.append(current_chunk)
325
+ current_chunk = word
326
+ else:
327
+ current_chunk = test_chunk
328
 
329
+ # Check for natural breaks
330
+ for delimiter in delimiters:
331
+ if delimiter in current_chunk[-len(delimiter):]:
332
+ chunks.append(current_chunk)
333
+ current_chunk = ""
334
+ break
335
+
336
+ if current_chunk:
337
+ chunks.append(current_chunk)
338
+
339
+ return chunks
340
 
341
  # Gradio interface functions
342
  def translate(direction, text, progress=gr.Progress()):
343
+ """Fast translation with batching and caching"""
344
  global next_request_id
345
346
  # Skip empty inputs
347
+ if not text or not text.strip():
348
  return ""
349
 
350
+ # Check exact cache hit
351
  cache_key = f"{direction}:{text}"
352
  if cache_key in translation_cache:
353
  return translation_cache[cache_key]
354
 
355
+ # For longer texts, split into sentences for faster processing
356
+ if len(text) > 50:
357
+ progress(0.1, desc="Processing text...")
358
+ chunks = split_sentences(text)
359
+ if len(chunks) > 1:
360
+ results = []
361
+ for i, chunk in enumerate(chunks):
362
+ # Check if this chunk is in cache
363
+ chunk_key = f"{direction}:{chunk}"
364
+ if chunk_key in translation_cache:
365
+ results.append(translation_cache[chunk_key])
366
+ continue
367
+
368
+ # Request translation for this chunk
369
+ chunk_id = next_request_id
370
+ next_request_id += 1
371
+ worker.request_translation(direction, chunk, chunk_id)
372
+
373
+ # Wait for response
374
+ chunk_start = time.time()
375
+ while time.time() - chunk_start < 10: # 10 second timeout per chunk
376
+ progress((i + 0.5) / len(chunks), desc=f"Translating part {i+1}/{len(chunks)}")
377
+
378
+ try:
379
+ while not worker.response_queue.empty():
380
+ resp_id, result = worker.response_queue.get_nowait()
381
+ if resp_id == chunk_id:
382
+ results.append(result)
383
+ chunk_found = True
384
+ break
385
+ except queue.Empty:
386
+ pass
387
+
388
+ time.sleep(0.05)
389
+
390
+ if len(results) != i + 1:
391
+ results.append(f"[Translation failed for part {i+1}]")
392
+
393
+ combined = " ".join(results)
394
+ translation_cache[cache_key] = combined
395
+ progress(1.0)
396
+ return combined
397
 
398
+ # For single sentences
399
  request_id = next_request_id
400
  next_request_id += 1
401
 
402
  # Queue the request
403
+ worker.request_translation(direction, text, request_id)
404
 
405
+ # Wait for the response
406
+ progress(0.2, desc="Translating...")
 
407
  start_time = time.time()
408
+ max_wait = 20 # Maximum wait time in seconds
409
 
 
410
  while time.time() - start_time < max_wait:
411
+ progress(0.2 + 0.8 * ((time.time() - start_time) / max_wait), desc="Translating...")
412
 
413
  # Check for our response
414
  try:
415
+ while not worker.response_queue.empty():
416
+ resp_id, result = worker.response_queue.get_nowait()
417
  if resp_id == request_id:
 
 
418
  progress(1.0)
419
  return result
420
  except queue.Empty:
421
  pass
422
 
423
+ # Small sleep to prevent CPU hogging
424
+ time.sleep(0.05)
425
 
426
  progress(1.0)
427
+ return "Translation timed out. Please try again with a shorter text."
428
 
429
+ # Create Gradio interface
430
+ with gr.Blocks(title="Ultra-Fast Translation App (CPU Optimized)") as iface:
431
  gr.Markdown(f"""
432
+ ## Ultra-Fast Translation App (CPU Optimized)
433
+ Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU optimized with int4 quantization'}
 
434
  """)
435
 
436
  with gr.Row():
437
+ direction = gr.Dropdown(
438
+ choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
439
  label="Translation Direction",
440
  value="English to Spanish"
441
  )
442
 
443
  with gr.Row():
444
+ input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
445
+ output_text = gr.Textbox(lines=5, label="Translation")
446
 
447
  # Add translate button
448
  translate_btn = gr.Button("Translate")
449
  translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
450
 
451
+ # Optimization options
452
+ with gr.Accordion("Performance Tips", open=True):
453
+ gr.Markdown("""
454
+ ### Speed Optimization Tips
455
+ - ✅ The model has been quantized to int4 for faster CPU execution
456
+ - ✅ Common phrases are pre-cached for instant results
457
+ - ✅ Long text is automatically split into smaller chunks
458
+ - ✅ First translation will be slower as the model warms up
459
+ - ✅ Short sentences (< 50 chars) translate much faster
460
+ """)
461
+
462
+ # Add examples with preloaded common phrases
463
  gr.Examples(
464
  examples=[
465
+ ["English to Spanish", "Hello, how are you today?"],
466
+ ["Spanish to English", "Hola, ยฟcรณmo estรกs hoy?"],
467
+ ["English to Korean", "The weather is nice today."],
468
+ ["Korean to English", "์•ˆ๋…•ํ•˜์„ธ์š”, ๋งŒ๋‚˜์„œ ๋ฐ˜๊ฐ‘์Šต๋‹ˆ๋‹ค."]
469
  ],
470
  inputs=[direction, input_text],
471
  fn=translate,
472
  outputs=output_text
473
  )
474
 
475
+ # Launch with optimized settings
476
+ if __name__ == "__main__":
477
+ iface.launch(
478
+ debug=False,
479
+ show_error=True,
480
+ share=False,
481
+ quiet=True,
482
+ server_name="0.0.0.0",
483
+ server_port=7860
484
+ )