Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,6 +6,7 @@ from huggingface_hub import hf_hub_download
|
|
6 |
import threading
|
7 |
import queue
|
8 |
import multiprocessing
|
|
|
9 |
|
10 |
# First check if GPU is available for maximum speed
|
11 |
has_gpu = torch.cuda.is_available()
|
@@ -26,37 +27,22 @@ adapter_path = get_model_path(
|
|
26 |
"articulate-V1-q8_0.gguf"
|
27 |
)
|
28 |
|
29 |
-
#
|
30 |
-
os.environ["LLAMA_CUBLAS"] = "
|
31 |
os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
|
32 |
-
|
33 |
-
os.environ["
|
34 |
-
os.environ["
|
35 |
-
os.environ["LLAMA_F16"] = "1" # Use FP16 where available
|
36 |
|
37 |
-
#
|
38 |
-
|
39 |
-
|
40 |
-
from llama_cpp_python.llama_cpp.llama import Llama as GPULlama
|
41 |
-
LlamaClass = GPULlama
|
42 |
-
print("Using GPU-accelerated llama-cpp-python")
|
43 |
-
n_gpu_layers = -1 # Use all layers on GPU
|
44 |
-
except ImportError:
|
45 |
-
from llama_cpp import Llama
|
46 |
-
LlamaClass = Llama
|
47 |
-
print("Using standard llama-cpp-python with GPU acceleration")
|
48 |
-
n_gpu_layers = -1 # Use all layers on GPU
|
49 |
-
else:
|
50 |
-
from llama_cpp import Llama
|
51 |
-
LlamaClass = Llama
|
52 |
-
print("Using CPU-only llama-cpp-python")
|
53 |
-
n_gpu_layers = 0
|
54 |
|
55 |
# Cache for translations
|
56 |
translation_cache = {}
|
57 |
-
MAX_CACHE_SIZE =
|
58 |
|
59 |
-
#
|
60 |
COMMON_PHRASES = {
|
61 |
"English to Spanish": [
|
62 |
"Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
|
@@ -76,46 +62,59 @@ COMMON_PHRASES = {
|
|
76 |
]
|
77 |
}
|
78 |
|
79 |
-
#
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
self.request_queue = queue.Queue()
|
84 |
self.response_queue = queue.Queue()
|
85 |
-
self.
|
86 |
-
self.
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
|
92 |
-
# CPU optimization settings
|
93 |
cpu_count = multiprocessing.cpu_count()
|
94 |
-
optimal_threads = max(4, cpu_count -
|
95 |
|
96 |
-
#
|
97 |
-
|
98 |
-
self.model = LlamaClass(
|
99 |
model_path=base_model_path,
|
100 |
lora_path=adapter_path,
|
101 |
-
n_ctx=
|
102 |
n_threads=optimal_threads, # Optimized thread count
|
103 |
-
n_batch=
|
104 |
-
use_mmap=True,
|
105 |
-
n_gpu_layers=
|
106 |
-
seed=42,
|
107 |
-
verbose=False,
|
108 |
-
|
109 |
-
tensor_split=None, # Auto-distribute across GPUs if multiple
|
110 |
-
rope_freq_base=10000, # Optimized attention parameters
|
111 |
rope_freq_scale=1.0,
|
112 |
)
|
113 |
print(f"Model loaded in {time.time() - start_time:.2f} seconds")
|
114 |
|
115 |
-
#
|
116 |
-
|
|
|
|
|
|
|
117 |
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
while True:
|
120 |
try:
|
121 |
request = self.request_queue.get()
|
@@ -123,31 +122,51 @@ class ModelWorker:
|
|
123 |
break
|
124 |
|
125 |
direction, text, callback_id = request
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
self.response_queue.put((callback_id, result))
|
|
|
128 |
except Exception as e:
|
129 |
print(f"Error in worker thread: {e}")
|
130 |
self.response_queue.put((callback_id, f"Error: {str(e)}"))
|
|
|
131 |
|
132 |
def _prewarm_model(self):
|
133 |
-
"""Pre-compute common translations to warm up the model"""
|
134 |
-
print("Pre-warming model with
|
135 |
start = time.time()
|
|
|
|
|
136 |
for direction, phrases in COMMON_PHRASES.items():
|
137 |
-
|
138 |
-
|
139 |
-
|
|
|
140 |
|
141 |
def _process_translation(self, direction, text):
|
|
|
142 |
# Skip empty inputs
|
143 |
if not text or not text.strip():
|
144 |
return ""
|
145 |
|
146 |
-
# Check cache first for faster response
|
147 |
-
cache_key = f"{direction}:{text}"
|
148 |
-
if cache_key in translation_cache:
|
149 |
-
return translation_cache[cache_key]
|
150 |
-
|
151 |
# Start timing for performance tracking
|
152 |
start_time = time.time()
|
153 |
|
@@ -164,147 +183,191 @@ class ModelWorker:
|
|
164 |
|
165 |
source_lang, target_lang = lang_map[direction]
|
166 |
|
|
|
|
|
|
|
|
|
|
|
167 |
# Efficient prompt format
|
168 |
prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
|
169 |
|
170 |
-
#
|
171 |
input_tokens = len(text.split())
|
172 |
-
max_tokens = min(
|
173 |
-
|
174 |
-
# Generate translation with optimized settings
|
175 |
-
response = self.model.create_completion(
|
176 |
-
prompt,
|
177 |
-
max_tokens=max_tokens,
|
178 |
-
temperature=0.0, # Deterministic for faster inference
|
179 |
-
top_k=1, # Only consider most likely token
|
180 |
-
top_p=1.0, # No sampling
|
181 |
-
repeat_penalty=1.0, # No repeat penalty
|
182 |
-
stream=False # Get complete response at once
|
183 |
-
)
|
184 |
-
|
185 |
-
translation = response['choices'][0]['text'].strip()
|
186 |
|
187 |
-
#
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
|
200 |
def request_translation(self, direction, text, callback_id):
|
201 |
"""Queue a translation request"""
|
202 |
self.request_queue.put((direction, text, callback_id))
|
203 |
|
204 |
-
# Create worker
|
205 |
-
|
206 |
|
207 |
# Counter for request IDs
|
208 |
next_request_id = 0
|
209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
# Gradio interface functions
|
211 |
def translate(direction, text, progress=gr.Progress()):
|
212 |
-
"""Queue translation request and wait for result"""
|
213 |
global next_request_id
|
214 |
|
215 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
cache_key = f"{direction}:{text}"
|
217 |
if cache_key in translation_cache:
|
218 |
return translation_cache[cache_key]
|
219 |
|
220 |
-
#
|
221 |
if len(text) < 20:
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
return translation_cache[cached_key]
|
226 |
|
227 |
# Generate unique request ID
|
228 |
request_id = next_request_id
|
229 |
next_request_id += 1
|
230 |
|
231 |
# Queue the request
|
232 |
-
|
233 |
|
234 |
-
# Wait for the response
|
235 |
progress(0, desc="Translating...")
|
236 |
-
max_wait =
|
237 |
start_time = time.time()
|
238 |
|
|
|
239 |
while time.time() - start_time < max_wait:
|
240 |
progress((time.time() - start_time) / max_wait)
|
241 |
|
242 |
# Check for our response
|
243 |
try:
|
244 |
-
while not
|
245 |
-
resp_id, result =
|
246 |
if resp_id == request_id:
|
|
|
|
|
247 |
progress(1.0)
|
248 |
return result
|
249 |
except queue.Empty:
|
250 |
pass
|
251 |
|
252 |
-
# Small sleep to prevent CPU hogging
|
253 |
-
time.sleep(0.
|
254 |
|
255 |
progress(1.0)
|
256 |
-
return "Translation timed out. Please try
|
257 |
|
258 |
-
# Create Gradio interface
|
259 |
-
with gr.Blocks(title="
|
260 |
gr.Markdown(f"""
|
261 |
-
##
|
262 |
-
Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only'}
|
|
|
263 |
""")
|
264 |
|
265 |
with gr.Row():
|
266 |
-
direction = gr.
|
267 |
-
choices=["English to Spanish", "Spanish to English", "
|
268 |
label="Translation Direction",
|
269 |
value="English to Spanish"
|
270 |
)
|
271 |
|
272 |
with gr.Row():
|
273 |
-
input_text = gr.Textbox(lines=
|
274 |
-
output_text = gr.Textbox(lines=
|
275 |
|
276 |
# Add translate button
|
277 |
translate_btn = gr.Button("Translate")
|
278 |
translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
|
279 |
|
280 |
-
#
|
281 |
-
with gr.Accordion("Advanced Options", open=False):
|
282 |
-
gr.Markdown("""
|
283 |
-
### Performance Tips
|
284 |
-
- Short sentences translate faster than long paragraphs
|
285 |
-
- Common phrases may be cached for instant results
|
286 |
-
- First translation might be slower as the model warms up
|
287 |
-
""")
|
288 |
-
|
289 |
-
# Add examples with preloaded common phrases
|
290 |
gr.Examples(
|
291 |
examples=[
|
292 |
-
["English to Spanish", "Hello
|
293 |
-
["Spanish to English", "Hola
|
294 |
-
["English to Korean", "
|
295 |
-
["Korean to English", "
|
296 |
],
|
297 |
inputs=[direction, input_text],
|
298 |
fn=translate,
|
299 |
outputs=output_text
|
300 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
|
302 |
# Launch with optimized settings
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
|
|
|
6 |
import threading
|
7 |
import queue
|
8 |
import multiprocessing
|
9 |
+
from functools import lru_cache
|
10 |
|
11 |
# First check if GPU is available for maximum speed
|
12 |
has_gpu = torch.cuda.is_available()
|
|
|
27 |
"articulate-V1-q8_0.gguf"
|
28 |
)
|
29 |
|
30 |
+
# Optimize environment variables for CPU performance
|
31 |
+
os.environ["LLAMA_CUBLAS"] = "0" # Disable CUDA since we're CPU only
|
32 |
os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
|
33 |
+
os.environ["LLAMA_AVX"] = "1" # Enable AVX
|
34 |
+
os.environ["LLAMA_AVX2"] = "1" # Enable AVX2
|
35 |
+
os.environ["LLAMA_F16"] = "1" # Use FP16 where available
|
|
|
36 |
|
37 |
+
# Import the right module
|
38 |
+
from llama_cpp import Llama
|
39 |
+
print("Using CPU-optimized llama-cpp-python")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
# Cache for translations
|
42 |
translation_cache = {}
|
43 |
+
MAX_CACHE_SIZE = 5000 # Increased cache size
|
44 |
|
45 |
+
# Common phrases for pre-loading
|
46 |
COMMON_PHRASES = {
|
47 |
"English to Spanish": [
|
48 |
"Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
|
|
|
62 |
]
|
63 |
}
|
64 |
|
65 |
+
# Implement LRU cache for better performance
|
66 |
+
@lru_cache(maxsize=100)
|
67 |
+
def get_cached_translation(direction, text):
|
68 |
+
"""LRU cache for translations"""
|
69 |
+
return None # This gets bypassed when there's a cache hit
|
70 |
+
|
71 |
+
# Create a worker pool for parallel translation
|
72 |
+
class ModelWorkerPool:
|
73 |
+
def __init__(self, num_workers=1):
|
74 |
+
self.num_workers = num_workers
|
75 |
self.request_queue = queue.Queue()
|
76 |
self.response_queue = queue.Queue()
|
77 |
+
self.workers = []
|
78 |
+
self.initialized = False
|
79 |
+
|
80 |
+
# Create shared model instance with optimized settings
|
81 |
+
print("Initializing model with CPU optimizations...")
|
82 |
+
start_time = time.time()
|
83 |
|
84 |
+
# CPU optimization settings - use fewer threads for Q8 model
|
85 |
cpu_count = multiprocessing.cpu_count()
|
86 |
+
optimal_threads = max(1, min(4, cpu_count - 1)) # Use fewer threads for better performance
|
87 |
|
88 |
+
# Create a smaller context size for faster inference
|
89 |
+
self.model = Llama(
|
|
|
90 |
model_path=base_model_path,
|
91 |
lora_path=adapter_path,
|
92 |
+
n_ctx=256, # Reduced context for faster processing
|
93 |
n_threads=optimal_threads, # Optimized thread count
|
94 |
+
n_batch=512, # Reduced batch size for CPU
|
95 |
+
use_mmap=True, # Efficient memory mapping
|
96 |
+
n_gpu_layers=0, # CPU only
|
97 |
+
seed=42, # Consistent results
|
98 |
+
verbose=False, # Reduce overhead
|
99 |
+
rope_freq_base=10000, # Default attention parameters
|
|
|
|
|
100 |
rope_freq_scale=1.0,
|
101 |
)
|
102 |
print(f"Model loaded in {time.time() - start_time:.2f} seconds")
|
103 |
|
104 |
+
# Start worker threads
|
105 |
+
for i in range(num_workers):
|
106 |
+
worker = threading.Thread(target=self._worker_loop, daemon=True)
|
107 |
+
worker.start()
|
108 |
+
self.workers.append(worker)
|
109 |
|
110 |
+
self.initialized = True
|
111 |
+
|
112 |
+
# Pre-warm in background thread to not block startup
|
113 |
+
warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
|
114 |
+
warming_thread.start()
|
115 |
+
|
116 |
+
def _worker_loop(self):
|
117 |
+
"""Worker thread that processes translation requests"""
|
118 |
while True:
|
119 |
try:
|
120 |
request = self.request_queue.get()
|
|
|
122 |
break
|
123 |
|
124 |
direction, text, callback_id = request
|
125 |
+
|
126 |
+
# Check LRU cache first
|
127 |
+
cached = get_cached_translation(direction, text)
|
128 |
+
if cached is not None:
|
129 |
+
self.response_queue.put((callback_id, cached))
|
130 |
+
self.request_queue.task_done()
|
131 |
+
continue
|
132 |
+
|
133 |
+
# Check regular cache
|
134 |
+
cache_key = f"{direction}:{text}"
|
135 |
+
if cache_key in translation_cache:
|
136 |
+
result = translation_cache[cache_key]
|
137 |
+
else:
|
138 |
+
# Process new translation
|
139 |
+
result = self._process_translation(direction, text)
|
140 |
+
# Store in regular cache
|
141 |
+
if len(translation_cache) >= MAX_CACHE_SIZE:
|
142 |
+
translation_cache.pop(next(iter(translation_cache)))
|
143 |
+
translation_cache[cache_key] = result
|
144 |
+
|
145 |
self.response_queue.put((callback_id, result))
|
146 |
+
self.request_queue.task_done()
|
147 |
except Exception as e:
|
148 |
print(f"Error in worker thread: {e}")
|
149 |
self.response_queue.put((callback_id, f"Error: {str(e)}"))
|
150 |
+
self.request_queue.task_done()
|
151 |
|
152 |
def _prewarm_model(self):
|
153 |
+
"""Pre-compute common translations to warm up the model - minimal to save time"""
|
154 |
+
print("Pre-warming model with essential phrases (truncated for speed)...")
|
155 |
start = time.time()
|
156 |
+
|
157 |
+
# Just warm up with one phrase per direction to speed up startup
|
158 |
for direction, phrases in COMMON_PHRASES.items():
|
159 |
+
self._process_translation(direction, phrases[0])
|
160 |
+
# Only do the most common phrase to save startup time
|
161 |
+
|
162 |
+
print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")
|
163 |
|
164 |
def _process_translation(self, direction, text):
|
165 |
+
"""Optimized translation function"""
|
166 |
# Skip empty inputs
|
167 |
if not text or not text.strip():
|
168 |
return ""
|
169 |
|
|
|
|
|
|
|
|
|
|
|
170 |
# Start timing for performance tracking
|
171 |
start_time = time.time()
|
172 |
|
|
|
183 |
|
184 |
source_lang, target_lang = lang_map[direction]
|
185 |
|
186 |
+
# Truncate long inputs for faster processing
|
187 |
+
max_input_length = 100 # Limit input length
|
188 |
+
if len(text) > max_input_length:
|
189 |
+
text = text[:max_input_length] + "..."
|
190 |
+
|
191 |
# Efficient prompt format
|
192 |
prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
|
193 |
|
194 |
+
# Reduce max tokens for faster inference
|
195 |
input_tokens = len(text.split())
|
196 |
+
max_tokens = min(50, max(20, int(input_tokens * 1.2)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
+
# Generate translation with aggressive performance optimizations
|
199 |
+
try:
|
200 |
+
response = self.model.create_completion(
|
201 |
+
prompt,
|
202 |
+
max_tokens=max_tokens,
|
203 |
+
temperature=0.0, # Deterministic for faster inference
|
204 |
+
top_k=1, # Only consider most likely token
|
205 |
+
top_p=1.0, # No sampling
|
206 |
+
repeat_penalty=1.0, # No repeat penalty
|
207 |
+
stream=False, # Get complete response at once
|
208 |
+
stop=["[/", "\n\n"], # Stop early if possible
|
209 |
+
)
|
210 |
+
|
211 |
+
translation = response['choices'][0]['text'].strip()
|
212 |
+
|
213 |
+
# Log performance
|
214 |
+
inference_time = time.time() - start_time
|
215 |
+
tokens_per_second = (input_tokens + len(translation.split())) / inference_time
|
216 |
+
print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
|
217 |
+
|
218 |
+
return translation
|
219 |
+
except Exception as e:
|
220 |
+
print(f"Translation error: {e}")
|
221 |
+
return f"Error: Could not translate text. Try shorter input."
|
222 |
|
223 |
def request_translation(self, direction, text, callback_id):
|
224 |
"""Queue a translation request"""
|
225 |
self.request_queue.put((direction, text, callback_id))
|
226 |
|
227 |
+
# Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
|
228 |
+
worker_pool = ModelWorkerPool(num_workers=1)
|
229 |
|
230 |
# Counter for request IDs
|
231 |
next_request_id = 0
|
232 |
|
233 |
+
# Fast similarity check function for finding close matches in cache
|
234 |
+
def find_similar_cached(direction, text, threshold=0.8):
|
235 |
+
"""Find similar translations in cache based on prefix matching"""
|
236 |
+
if len(text) < 5: # For very short inputs, look for exact matches
|
237 |
+
return None
|
238 |
+
|
239 |
+
text_lower = text.lower()
|
240 |
+
best_match = None
|
241 |
+
best_score = 0
|
242 |
+
|
243 |
+
for cached_key in list(translation_cache.keys()):
|
244 |
+
cached_dir, cached_text = cached_key.split(":", 1)
|
245 |
+
if cached_dir != direction:
|
246 |
+
continue
|
247 |
+
|
248 |
+
# Simple similarity - prefix matching
|
249 |
+
if cached_text.lower().startswith(text_lower[:5]):
|
250 |
+
similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
|
251 |
+
if similarity > best_score and similarity > threshold:
|
252 |
+
best_score = similarity
|
253 |
+
best_match = translation_cache[cached_key]
|
254 |
+
|
255 |
+
return best_match
|
256 |
+
|
257 |
# Gradio interface functions
|
258 |
def translate(direction, text, progress=gr.Progress()):
|
259 |
+
"""Queue translation request and wait for result - optimized version"""
|
260 |
global next_request_id
|
261 |
|
262 |
+
# Trim whitespace for better cache hits
|
263 |
+
text = text.strip()
|
264 |
+
|
265 |
+
# Skip empty inputs
|
266 |
+
if not text:
|
267 |
+
return ""
|
268 |
+
|
269 |
+
# Check LRU cache first
|
270 |
+
cached = get_cached_translation(direction, text)
|
271 |
+
if cached is not None:
|
272 |
+
return cached
|
273 |
+
|
274 |
+
# Check main cache
|
275 |
cache_key = f"{direction}:{text}"
|
276 |
if cache_key in translation_cache:
|
277 |
return translation_cache[cache_key]
|
278 |
|
279 |
+
# For short inputs, try to find similar cached
|
280 |
if len(text) < 20:
|
281 |
+
similar = find_similar_cached(direction, text)
|
282 |
+
if similar:
|
283 |
+
return similar
|
|
|
284 |
|
285 |
# Generate unique request ID
|
286 |
request_id = next_request_id
|
287 |
next_request_id += 1
|
288 |
|
289 |
# Queue the request
|
290 |
+
worker_pool.request_translation(direction, text, request_id)
|
291 |
|
292 |
+
# Wait for the response with reasonable timeout
|
293 |
progress(0, desc="Translating...")
|
294 |
+
max_wait = 20 # Reduced maximum wait time
|
295 |
start_time = time.time()
|
296 |
|
297 |
+
# Show progress while waiting
|
298 |
while time.time() - start_time < max_wait:
|
299 |
progress((time.time() - start_time) / max_wait)
|
300 |
|
301 |
# Check for our response
|
302 |
try:
|
303 |
+
while not worker_pool.response_queue.empty():
|
304 |
+
resp_id, result = worker_pool.response_queue.get_nowait()
|
305 |
if resp_id == request_id:
|
306 |
+
# Update LRU cache
|
307 |
+
get_cached_translation.__wrapped__.__defaults__ = (result,)
|
308 |
progress(1.0)
|
309 |
return result
|
310 |
except queue.Empty:
|
311 |
pass
|
312 |
|
313 |
+
# Small sleep to prevent CPU hogging - reduced for faster response
|
314 |
+
time.sleep(0.01)
|
315 |
|
316 |
progress(1.0)
|
317 |
+
return "Translation timed out. Please try a shorter text."
|
318 |
|
319 |
+
# Create Gradio interface with simplified UI for performance
|
320 |
+
with gr.Blocks(title="Fast CPU Translation App") as iface:
|
321 |
gr.Markdown(f"""
|
322 |
+
## Fast CPU Translation App
|
323 |
+
Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only - Optimized'}
|
324 |
+
**For best performance, use short sentences or phrases.**
|
325 |
""")
|
326 |
|
327 |
with gr.Row():
|
328 |
+
direction = gr.Radio(
|
329 |
+
choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
|
330 |
label="Translation Direction",
|
331 |
value="English to Spanish"
|
332 |
)
|
333 |
|
334 |
with gr.Row():
|
335 |
+
input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
|
336 |
+
output_text = gr.Textbox(lines=3, label="Translation")
|
337 |
|
338 |
# Add translate button
|
339 |
translate_btn = gr.Button("Translate")
|
340 |
translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
|
341 |
|
342 |
+
# Add examples with common short phrases for quick results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
gr.Examples(
|
344 |
examples=[
|
345 |
+
["English to Spanish", "Hello"],
|
346 |
+
["Spanish to English", "Hola"],
|
347 |
+
["English to Korean", "Thank you"],
|
348 |
+
["Korean to English", "감사합니다"]
|
349 |
],
|
350 |
inputs=[direction, input_text],
|
351 |
fn=translate,
|
352 |
outputs=output_text
|
353 |
)
|
354 |
+
|
355 |
+
# Add performance tips
|
356 |
+
gr.Markdown("""
|
357 |
+
### Performance Tips
|
358 |
+
- Keep text under 50 characters for fastest results
|
359 |
+
- Common phrases are pre-cached
|
360 |
+
- First translation may be slow, subsequent ones faster
|
361 |
+
- Frequently used phrases use an LRU cache for speed
|
362 |
+
""")
|
363 |
|
364 |
# Launch with optimized settings
|
365 |
+
if __name__ == "__main__":
|
366 |
+
iface.launch(
|
367 |
+
debug=False,
|
368 |
+
show_error=True,
|
369 |
+
share=False,
|
370 |
+
quiet=True,
|
371 |
+
server_name="0.0.0.0",
|
372 |
+
server_port=7860
|
373 |
+
)
|