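"""Fast CPU translation demo.

Loads a GGUF base model plus a LoRA adapter with llama-cpp-python and serves
English/Spanish/Korean translation through a Gradio UI, caching results so that
repeated requests return quickly.
"""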
import os
import time
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import threading
import queue
import multiprocessing

# Check whether a GPU is present (informational only; inference runs on the CPU)
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print(f"GPU available: {has_gpu} - {gpu_name}")

# Download model files
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)

base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF", 
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF", 
    "articulate-V1-q8_0.gguf"
)

# Optimize environment variables for CPU performance
os.environ["LLAMA_CUBLAS"] = "0"  # Disable CUDA since we're CPU only
os.environ["LLAMA_CLBLAST"] = "0"  # Disable OpenCL
os.environ["LLAMA_AVX"] = "1"      # Enable AVX
os.environ["LLAMA_AVX2"] = "1"     # Enable AVX2
os.environ["LLAMA_F16"] = "1"      # Use FP16 where available

# Import the right module
from llama_cpp import Llama
print("Using CPU-optimized llama-cpp-python")

# Cache for translations
translation_cache = {}
MAX_CACHE_SIZE = 5000  # Increased cache size

# Common phrases for pre-loading
COMMON_PHRASES = {
    "English to Spanish": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Spanish to English": [
        "Hola", "Gracias", "Buenos dรญas", "ยฟCรณmo estรกs?", "ยฟCรณmo te llamas?",
        "No entiendo", "Por favor", "Lo siento", "Sรญ", "No", "Dรณnde estรก"
    ],
    "English to Korean": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Korean to English": [
        "์•ˆ๋…•ํ•˜์„ธ์š”", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค", "์ข‹์€ ์•„์นจ์ž…๋‹ˆ๋‹ค", "์–ด๋–ป๊ฒŒ ์ง€๋‚ด์„ธ์š”?", "์ด๋ฆ„์ด ๋ญ์˜ˆ์š”?",
        "์ดํ•ด๊ฐ€ ์•ˆ ๋ผ์š”", "์ œ๋ฐœ", "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค", "๋„ค", "์•„๋‹ˆ์š”", "์–ด๋””์— ์žˆ์–ด์š”"
    ]
}

# Shared-cache lookup helper
def get_cached_translation(direction, text):
    """Return a previously computed translation, or None on a cache miss."""
    return translation_cache.get(f"{direction}:{text}")

# Create a worker pool for parallel translation
class ModelWorkerPool:
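    """Single shared Llama instance served by a small pool of worker threads.

    Translation requests go onto request_queue as (direction, text, callback_id)
    tuples; results come back on response_queue as (callback_id, translation).
    """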
    def __init__(self, num_workers=1):
        self.num_workers = num_workers
        self.request_queue = queue.Queue()
        self.response_queue = queue.Queue()
        self.workers = []
        self.initialized = False
        
        # Create shared model instance with optimized settings
        print("Initializing model with CPU optimizations...")
        start_time = time.time()
        
        # CPU optimization settings - use fewer threads for Q8 model
        cpu_count = multiprocessing.cpu_count()
        optimal_threads = max(1, min(4, cpu_count - 1))  # Use fewer threads for better performance
        
        # Create a smaller context size for faster inference
        self.model = Llama(
            model_path=base_model_path,
            lora_path=adapter_path,
            n_ctx=256,                # Reduced context for faster processing
            n_threads=optimal_threads, # Optimized thread count
            n_batch=512,              # Reduced batch size for CPU
            use_mmap=True,            # Efficient memory mapping
            n_gpu_layers=0,           # CPU only
            seed=42,                  # Consistent results
            verbose=False,            # Reduce overhead
            rope_freq_base=10000,     # Default attention parameters
            rope_freq_scale=1.0,
        )
        print(f"Model loaded in {time.time() - start_time:.2f} seconds")
        
        # Start worker threads
        for i in range(num_workers):
            worker = threading.Thread(target=self._worker_loop, daemon=True)
            worker.start()
            self.workers.append(worker)
        
        self.initialized = True
        
        # Pre-warm in background thread to not block startup
        warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
        warming_thread.start()
    
    def _worker_loop(self):
        """Worker thread that processes translation requests"""
        while True:
            request = self.request_queue.get()
            if request is None:  # Shutdown signal
                break

            direction, text, callback_id = request
            try:
                # Serve from the shared cache when possible
                cache_key = f"{direction}:{text}"
                if cache_key in translation_cache:
                    result = translation_cache[cache_key]
                else:
                    # Process new translation
                    result = self._process_translation(direction, text)
                    # Store in cache, evicting the oldest entry when full
                    if len(translation_cache) >= MAX_CACHE_SIZE:
                        translation_cache.pop(next(iter(translation_cache)))
                    translation_cache[cache_key] = result

                self.response_queue.put((callback_id, result))
            except Exception as e:
                print(f"Error in worker thread: {e}")
                self.response_queue.put((callback_id, f"Error: {str(e)}"))
            finally:
                self.request_queue.task_done()
    
    def _prewarm_model(self):
        """Pre-compute common translations to warm up the model - minimal to save time"""
        print("Pre-warming model with essential phrases (truncated for speed)...")
        start = time.time()
        
        # Just warm up with one phrase per direction to speed up startup
        for direction, phrases in COMMON_PHRASES.items():
            self._process_translation(direction, phrases[0])
            # Only do the most common phrase to save startup time
            
        print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")
    
    def _process_translation(self, direction, text):
        """Optimized translation function"""
        # Skip empty inputs
        if not text or not text.strip():
            return ""
            
        # Start timing for performance tracking
        start_time = time.time()
        
        # Map language directions
        lang_map = {
            "English to Spanish": ("ENGLISH", "SPANISH"),
            "Spanish to English": ("SPANISH", "ENGLISH"),
            "Korean to English": ("KOREAN", "ENGLISH"),
            "English to Korean": ("ENGLISH", "KOREAN")
        }
        
        if direction not in lang_map:
            return "Invalid direction"
        
        source_lang, target_lang = lang_map[direction]
        
        # Truncate long inputs for faster processing
        max_input_length = 100  # Limit input length
        if len(text) > max_input_length:
            text = text[:max_input_length] + "..."
        
        # Build the translation prompt, e.g. [ENGLISH]Hello[SPANISH]
        prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
        
        # Reduce max tokens for faster inference
        input_tokens = len(text.split())
        max_tokens = min(50, max(20, int(input_tokens * 1.2)))
        
        # Generate translation with aggressive performance optimizations
        try:
            response = self.model.create_completion(
                prompt,
                max_tokens=max_tokens,
                temperature=0.0,      # Greedy, deterministic decoding
                top_k=1,              # Only consider most likely token
                top_p=1.0,            # No sampling
                repeat_penalty=1.0,   # No repeat penalty
                stream=False,         # Get complete response at once
                stop=["[/", "\n\n"],  # Stop early if possible
            )
            
            translation = response['choices'][0]['text'].strip()
            
            # Log performance
            inference_time = time.time() - start_time
            tokens_per_second = (input_tokens + len(translation.split())) / inference_time
            print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
            
            return translation
        except Exception as e:
            print(f"Translation error: {e}")
            return f"Error: Could not translate text. Try shorter input."
    
    def request_translation(self, direction, text, callback_id):
        """Queue a translation request"""
        self.request_queue.put((direction, text, callback_id))

# Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
worker_pool = ModelWorkerPool(num_workers=1)

# Counter for request IDs
next_request_id = 0
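# (Gradio can run handlers on several threads; wrapping this counter in a
#  threading.Lock would make request-ID allocation strictly race-free.)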

# Fast similarity check function for finding close matches in cache
def find_similar_cached(direction, text, threshold=0.8):
    """Find similar translations in cache based on prefix matching"""
    if len(text) < 5:  # For very short inputs, look for exact matches
        return None
        
    text_lower = text.lower()
    best_match = None
    best_score = 0
    
    for cached_key in list(translation_cache.keys()):
        cached_dir, cached_text = cached_key.split(":", 1)
        if cached_dir != direction:
            continue
            
        # Simple similarity - prefix matching
        if cached_text.lower().startswith(text_lower[:5]):
            similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
            if similarity > best_score and similarity > threshold:
                best_score = similarity
                best_match = translation_cache[cached_key]
                
    return best_match

# Gradio interface functions
def translate(direction, text, progress=gr.Progress()):
    """Queue translation request and wait for result - optimized version"""
    global next_request_id
    
    # Trim whitespace for better cache hits
    text = text.strip()
    
    # Skip empty inputs
    if not text:
        return ""
    
    # Serve from the cache when possible
    cached = get_cached_translation(direction, text)
    if cached is not None:
        return cached
    
    # For short inputs, try to find similar cached
    if len(text) < 20:
        similar = find_similar_cached(direction, text)
        if similar:
            return similar
    
    # Generate unique request ID
    request_id = next_request_id
    next_request_id += 1
    
    # Queue the request
    worker_pool.request_translation(direction, text, request_id)
    
    # Wait for the response with reasonable timeout
    progress(0, desc="Translating...")
    max_wait = 20  # Reduced maximum wait time
    start_time = time.time()
    
    # Show progress while waiting
    while time.time() - start_time < max_wait:
        progress((time.time() - start_time) / max_wait)
        
        # Check for our response (responses for other request IDs are discarded;
        # this is acceptable for this single-worker, per-request polling setup)
        try:
            while not worker_pool.response_queue.empty():
                resp_id, result = worker_pool.response_queue.get_nowait()
                if resp_id == request_id:
                    # The worker has already stored this result in translation_cache
                    progress(1.0)
                    return result
        except queue.Empty:
            pass
        
        # Small sleep to prevent CPU hogging - reduced for faster response
        time.sleep(0.01)
    
    progress(1.0)
    return "Translation timed out. Please try a shorter text."

# Create Gradio interface with simplified UI for performance
with gr.Blocks(title="Fast CPU Translation App") as iface:
    gr.Markdown(f"""
    ## Fast CPU Translation App
    Running on: CPU (optimized){'; GPU detected but unused: ' + gpu_name if has_gpu else ''}  
    **For best performance, use short sentences or phrases.**
    """)
    
    with gr.Row():
        direction = gr.Radio(
            choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
            label="Translation Direction",
            value="English to Spanish"
        )
    
    with gr.Row():
        input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
        output_text = gr.Textbox(lines=3, label="Translation")
    
    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
    
    # Add examples with common short phrases for quick results
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello"],
            ["Spanish to English", "Hola"],
            ["English to Korean", "Thank you"],
            ["Korean to English", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค"]
        ],
        inputs=[direction, input_text],
        fn=translate,
        outputs=output_text
    )
    
    # Add performance tips
    gr.Markdown("""
    ### Performance Tips
    - Keep text under 50 characters for fastest results
    - A few phrases are used to pre-warm the model at startup
    - The first translation may be slow; subsequent ones are faster
    - Repeated phrases are served from an in-memory cache
    """)

# Launch with optimized settings
if __name__ == "__main__":
    iface.launch(
        debug=False,
        show_error=True,
        share=False,
        quiet=True,
        server_name="0.0.0.0", 
        server_port=7860
    )