johnpaulbin committed
Commit f06b197 · verified · 1 Parent(s): c2b521a

Update app.py

Files changed (1)
  1. app.py +247 -78
app.py CHANGED
@@ -1,16 +1,22 @@
- from huggingface_hub import hf_hub_download
- from llama_cpp import Llama
  import gradio as gr
  import multiprocessing
- import time
- import os

- # Model paths
  def get_model_path(repo_id, filename):
      print(f"Obtaining {filename}...")
      return hf_hub_download(repo_id=repo_id, filename=filename)

- # Get models
  base_model_path = get_model_path(
      "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
      "articulate-11-expspanish-base-merged-q8_0.gguf"
@@ -20,95 +26,241 @@ adapter_path = get_model_path(
      "articulate-V1-q8_0.gguf"
  )

- # Conservative CPU settings to avoid memory corruption
- cpu_count = multiprocessing.cpu_count()
- optimal_threads = max(1, min(8, cpu_count // 2)) # More conservative thread count
- batch_size = 128 # Reduced batch size to prevent memory issues

- print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")
-
- # Initialize model with safer parameters
- start_time = time.time()
- llm = Llama(
-     model_path=base_model_path,
-     lora_path=adapter_path,
-     n_ctx=512,
-     n_threads=optimal_threads,
-     n_batch=batch_size, # Smaller batch size for stability
-     use_mmap=True,
-     n_gpu_layers=0,
-     verbose=False
- )
- print(f"Model loaded in {time.time() - start_time:.2f} seconds")

- # Simple translation cache (limited size)
  translation_cache = {}
- MAX_CACHE_SIZE = 50 # Reduced cache size

- def translate(direction, text):
-     # Validate input
-     if not text or not text.strip():
-         return ""
-
-     text = text.strip()
-
-     # Simple cache lookup
-     cache_key = f"{direction}:{text}"
-     if cache_key in translation_cache:
-         return translation_cache[cache_key]
-
-     # Start timing
-     start_time = time.time()
-
-     # Language mapping
-     lang_map = {
-         "English to Spanish": ("ENGLISH", "SPANISH"),
-         "Spanish to English": ("SPANISH", "ENGLISH"),
-         "Korean to English": ("KOREAN", "ENGLISH"),
-         "English to Korean": ("ENGLISH", "KOREAN")
-     }
-
-     if direction not in lang_map:
-         return "Invalid direction"

-     source_lang, target_lang = lang_map[direction]

-     # Create prompt
-     prompt = f"[{source_lang}]{text}[{target_lang}]"

-     try:
-         # Generate translation with conservative settings
-         response = llm.create_completion(
              prompt,
-             max_tokens=128, # Conservative token limit
-             temperature=0.0, # Deterministic
-             top_k=1, # Most likely token only
-             top_p=1.0, # No sampling
-             repeat_penalty=1.0,
-             stream=False
          )

          translation = response['choices'][0]['text'].strip()

-         # Manage cache size
          if len(translation_cache) >= MAX_CACHE_SIZE:
-             # Remove oldest entry
              translation_cache.pop(next(iter(translation_cache)))
          translation_cache[cache_key] = translation

          # Log performance
          inference_time = time.time() - start_time
-         print(f"Translation completed in {inference_time:.3f}s")

          return translation

-     except Exception as e:
-         print(f"Translation error: {e}")
-         return f"Error during translation: {str(e)}"

  # Create Gradio interface
- with gr.Blocks(title="Translation App") as iface:
-     gr.Markdown("## Fast Translation App")

      with gr.Row():
          direction = gr.Dropdown(
@@ -118,24 +270,41 @@ with gr.Blocks(title="Translation App") as iface:
          )

      with gr.Row():
-         input_text = gr.Textbox(lines=5, label="Input Text")
          output_text = gr.Textbox(lines=5, label="Translation")

      # Add translate button
      translate_btn = gr.Button("Translate")
      translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

-     # Examples WITHOUT caching (to avoid memory issues)
      gr.Examples(
          examples=[
              ["English to Spanish", "Hello, how are you today?"],
              ["Spanish to English", "Hola, ¿cómo estás hoy?"],
              ["English to Korean", "The weather is nice today."],
-             ["Korean to English", "오늘 날씨가 좋습니다."]
          ],
          inputs=[direction, input_text],
-         cache_examples=False # Disabled caching to prevent memory issues
      )

- # Launch with safer settings
- iface.launch(debug=False, show_error=True)

+ import os
+ import time
+ import torch
  import gradio as gr
+ from huggingface_hub import hf_hub_download
+ import threading
+ import queue
  import multiprocessing

+ # First check if GPU is available for maximum speed
+ has_gpu = torch.cuda.is_available()
+ gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
+ print(f"GPU available: {has_gpu} - {gpu_name}")
+
+ # Download model files
  def get_model_path(repo_id, filename):
      print(f"Obtaining {filename}...")
      return hf_hub_download(repo_id=repo_id, filename=filename)

  base_model_path = get_model_path(
      "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
      "articulate-11-expspanish-base-merged-q8_0.gguf"

      "articulate-V1-q8_0.gguf"
  )

+ # Set up optimized environment variables for llama-cpp-python
+ os.environ["LLAMA_CUBLAS"] = "1" if has_gpu else "0"
+ os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
+ # For CPU: Use AVX2/AVX512/AVX-VNNI instruction sets if available
+ os.environ["LLAMA_AVX"] = "1"
+ os.environ["LLAMA_AVX2"] = "1"
+ os.environ["LLAMA_F16"] = "1" # Use FP16 where available

+ # Determine the most optimized backend
+ if has_gpu:
+     try:
+         from llama_cpp_python.llama_cpp.llama import Llama as GPULlama
+         LlamaClass = GPULlama
+         print("Using GPU-accelerated llama-cpp-python")
+         n_gpu_layers = -1 # Use all layers on GPU
+     except ImportError:
+         from llama_cpp import Llama
+         LlamaClass = Llama
+         print("Using standard llama-cpp-python with GPU acceleration")
+         n_gpu_layers = -1 # Use all layers on GPU
+ else:
+     from llama_cpp import Llama
+     LlamaClass = Llama
+     print("Using CPU-only llama-cpp-python")
+     n_gpu_layers = 0

+ # Cache for translations
  translation_cache = {}
+ MAX_CACHE_SIZE = 1000

+ # Pre-compute common translations
+ COMMON_PHRASES = {
+     "English to Spanish": [
+         "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
+         "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
+     ],
+     "Spanish to English": [
+         "Hola", "Gracias", "Buenos días", "¿Cómo estás?", "¿Cómo te llamas?",
+         "No entiendo", "Por favor", "Lo siento", "Sí", "No", "Dónde está"
+     ],
+     "English to Korean": [
+         "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
+         "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
+     ],
+     "Korean to English": [
+         "안녕하세요", "감사합니다", "좋은 아침입니다", "어떻게 지내세요?", "이름이 뭐예요?",
+         "이해가 돼요", "제발", "죄송합니다", "네", "아니요", "어디에 있어요"
+     ]
+ }
+
+ # Background worker for model loading and inference
+ class ModelWorker:
+     def __init__(self):
+         self.model = None
+         self.request_queue = queue.Queue()
+         self.response_queue = queue.Queue()
+         self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
+         self.worker_thread.start()
+
+     def _worker_loop(self):
+         # Initialize model in the worker thread
+         print("Initializing model in background thread...")
+
+         # CPU optimization settings
+         cpu_count = multiprocessing.cpu_count()
+         optimal_threads = max(4, cpu_count - 2) # Leave two cores free
+
+         # Initialize with the most optimized settings
+         start_time = time.time()
+         self.model = LlamaClass(
+             model_path=base_model_path,
+             lora_path=adapter_path,
+             n_ctx=512, # Larger context for longer translations
+             n_threads=optimal_threads, # Optimized thread count
+             n_batch=1024, # Large batch for parallel processing
+             use_mmap=True, # Efficient memory mapping
+             n_gpu_layers=n_gpu_layers, # GPU acceleration if available
+             seed=42, # Consistent results
+             verbose=False, # Reduce overhead
+             main_gpu=0, # Primary GPU
+             tensor_split=None, # Auto-distribute across GPUs if multiple
+             rope_freq_base=10000, # Optimized attention parameters
+             rope_freq_scale=1.0,
+         )
+         print(f"Model loaded in {time.time() - start_time:.2f} seconds")
+
+         # Pre-warm the model with common phrases
+         self._prewarm_model()
+
+         # Process requests
+         while True:
+             try:
+                 request = self.request_queue.get()
+                 if request is None: # Shutdown signal
+                     break
+
+                 direction, text, callback_id = request
+                 result = self._process_translation(direction, text)
+                 self.response_queue.put((callback_id, result))
+             except Exception as e:
+                 print(f"Error in worker thread: {e}")
+                 self.response_queue.put((callback_id, f"Error: {str(e)}"))
+
+     def _prewarm_model(self):
+         """Pre-compute common translations to warm up the model"""
+         print("Pre-warming model with common phrases...")
+         start = time.time()
+         for direction, phrases in COMMON_PHRASES.items():
+             for phrase in phrases[:3]: # Just do a few to warm up
+                 self._process_translation(direction, phrase)
+         print(f"Model pre-warming completed in {time.time() - start:.2f} seconds")
+
+     def _process_translation(self, direction, text):
+         # Skip empty inputs
+         if not text or not text.strip():
+             return ""
+
+         # Check cache first for faster response
+         cache_key = f"{direction}:{text}"
+         if cache_key in translation_cache:
+             return translation_cache[cache_key]
+
+         # Start timing for performance tracking
+         start_time = time.time()
+
+         # Map language directions
+         lang_map = {
+             "English to Spanish": ("ENGLISH", "SPANISH"),
+             "Spanish to English": ("SPANISH", "ENGLISH"),
+             "Korean to English": ("KOREAN", "ENGLISH"),
+             "English to Korean": ("ENGLISH", "KOREAN")
+         }
+
+         if direction not in lang_map:
+             return "Invalid direction"
+
+         source_lang, target_lang = lang_map[direction]
+
+         # Efficient prompt format
+         prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
+
+         # Estimate appropriate token length based on input
+         input_tokens = len(text.split())
+         max_tokens = min(200, max(50, int(input_tokens * 1.5)))
+
+         # Generate translation with optimized settings
+         response = self.model.create_completion(
              prompt,
+             max_tokens=max_tokens,
+             temperature=0.0, # Deterministic for faster inference
+             top_k=1, # Only consider most likely token
+             top_p=1.0, # No sampling
+             repeat_penalty=1.0, # No repeat penalty
+             stream=False # Get complete response at once
          )

          translation = response['choices'][0]['text'].strip()

+         # Cache result
          if len(translation_cache) >= MAX_CACHE_SIZE:
+             # Remove oldest entry (first key)
              translation_cache.pop(next(iter(translation_cache)))
          translation_cache[cache_key] = translation

          # Log performance
          inference_time = time.time() - start_time
+         tokens_per_second = (input_tokens + len(translation.split())) / inference_time
+         print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")

          return translation
+
+     def request_translation(self, direction, text, callback_id):
+         """Queue a translation request"""
+         self.request_queue.put((direction, text, callback_id))
+
+ # Create worker instance
+ worker = ModelWorker()
+
+ # Counter for request IDs
+ next_request_id = 0
+
+ # Gradio interface functions
+ def translate(direction, text, progress=gr.Progress()):
+     """Queue translation request and wait for result"""
+     global next_request_id
+
+     # Check cache first for immediate response
+     cache_key = f"{direction}:{text}"
+     if cache_key in translation_cache:
+         return translation_cache[cache_key]
+
+     # If input is very short, check if we have a similar cached phrase
+     if len(text) < 20:
+         for cached_key in translation_cache:
+             cached_dir, cached_text = cached_key.split(":", 1)
+             if cached_dir == direction and cached_text.lower().startswith(text.lower()):
+                 return translation_cache[cached_key]
+
+     # Generate unique request ID
+     request_id = next_request_id
+     next_request_id += 1
+
+     # Queue the request
+     worker.request_translation(direction, text, request_id)
+
+     # Wait for the response (with progress feedback)
+     progress(0, desc="Translating...")
+     max_wait = 30 # Maximum wait time in seconds
+     start_time = time.time()
+
+     while time.time() - start_time < max_wait:
+         progress((time.time() - start_time) / max_wait)

+         # Check for our response
+         try:
+             while not worker.response_queue.empty():
+                 resp_id, result = worker.response_queue.get_nowait()
+                 if resp_id == request_id:
+                     progress(1.0)
+                     return result
+         except queue.Empty:
+             pass
+
+         # Small sleep to prevent CPU hogging
+         time.sleep(0.05)
+
+     progress(1.0)
+     return "Translation timed out. Please try again."

  # Create Gradio interface
+ with gr.Blocks(title="Ultra-Fast Translation App") as iface:
+     gr.Markdown(f"""
+     ## Ultra-Fast Translation App
+     Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only'}
+     """)

      with gr.Row():
          direction = gr.Dropdown(

          )

      with gr.Row():
+         input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
          output_text = gr.Textbox(lines=5, label="Translation")

      # Add translate button
      translate_btn = gr.Button("Translate")
      translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

+     # Optimization options
+     with gr.Accordion("Advanced Options", open=False):
+         gr.Markdown("""
+         ### Performance Tips
+         - Short sentences translate faster than long paragraphs
+         - Common phrases may be cached for instant results
+         - First translation might be slower as the model warms up
+         """)
+
+     # Add examples with preloaded common phrases
      gr.Examples(
          examples=[
              ["English to Spanish", "Hello, how are you today?"],
              ["Spanish to English", "Hola, ¿cómo estás hoy?"],
              ["English to Korean", "The weather is nice today."],
+             ["Korean to English", "안녕하세요, 만나서 반갑습니다."]
          ],
          inputs=[direction, input_text],
+         fn=translate,
+         outputs=output_text
      )

+ # Launch with optimized settings
+ iface.launch(
+     debug=False,
+     show_error=True,
+     share=False, # Don't share publicly by default
+     quiet=True, # Reduce console output
+     server_name="0.0.0.0",
+     server_port=7860
+ )
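
Note on the pattern introduced above: the rewritten app.py keeps the llama.cpp model on a single background thread (ModelWorker) and hands work back and forth through a request queue and a response queue, with the Gradio handler polling for a matching request ID. The snippet below is a minimal, self-contained sketch of that queue hand-off only; it is not part of the commit, and fake_translate is a hypothetical stand-in for the model's create_completion call.

import queue
import threading
import time

request_queue = queue.Queue()
response_queue = queue.Queue()

def fake_translate(text):
    # Hypothetical stand-in for the llama.cpp call owned by the worker thread.
    time.sleep(0.1)
    return text.upper()

def worker_loop():
    # Single worker thread: all inference requests are serialized here.
    while True:
        request_id, text = request_queue.get()
        response_queue.put((request_id, fake_translate(text)))

threading.Thread(target=worker_loop, daemon=True).start()

def translate(text, request_id, max_wait=5.0):
    # Queue the request, then poll until the response with our ID arrives.
    request_queue.put((request_id, text))
    deadline = time.time() + max_wait
    while time.time() < deadline:
        try:
            resp_id, result = response_queue.get_nowait()
            if resp_id == request_id:
                return result
        except queue.Empty:
            time.sleep(0.05)
    return "Translation timed out."

print(translate("hello, how are you today?", request_id=0))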