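"""Fast CPU translation app.

Gradio demo that translates between English, Spanish, and Korean using a GGUF
base model plus a LoRA adapter loaded through llama-cpp-python, with a background
worker thread and an in-memory cache of recent translations.
"""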
import os
import time
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import threading
import queue
import multiprocessing
# Check whether a GPU is available (reported for information only; inference below runs on CPU)
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print(f"GPU available: {has_gpu} - {gpu_name}")
# Download model files
def get_model_path(repo_id, filename):
print(f"Obtaining {filename}...")
return hf_hub_download(repo_id=repo_id, filename=filename)
base_model_path = get_model_path(
"johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
"articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
"johnpaulbin/articulate-V1-Q8_0-GGUF",
"articulate-V1-q8_0.gguf"
)
# Environment hints for llama.cpp CPU performance (note: these are typically build-time
# flags for llama-cpp-python, so they may have no effect on a prebuilt wheel)
os.environ["LLAMA_CUBLAS"] = "0" # Disable CUDA since we're CPU only
os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
os.environ["LLAMA_AVX"] = "1" # Enable AVX
os.environ["LLAMA_AVX2"] = "1" # Enable AVX2
os.environ["LLAMA_F16"] = "1" # Use FP16 where available
# Import the llama.cpp Python bindings
from llama_cpp import Llama
print("Using CPU-optimized llama-cpp-python")
# Cache for translations
translation_cache = {}
MAX_CACHE_SIZE = 5000  # Maximum number of cached translations
# Common phrases for pre-loading
COMMON_PHRASES = {
"English to Spanish": [
"Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
"I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
],
"Spanish to English": [
"Hola", "Gracias", "Buenos dรญas", "ยฟCรณmo estรกs?", "ยฟCรณmo te llamas?",
"No entiendo", "Por favor", "Lo siento", "Sรญ", "No", "Dรณnde estรก"
],
"English to Korean": [
"Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
"I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
],
"Korean to English": [
"์•ˆ๋…•ํ•˜์„ธ์š”", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค", "์ข‹์€ ์•„์นจ์ž…๋‹ˆ๋‹ค", "์–ด๋–ป๊ฒŒ ์ง€๋‚ด์„ธ์š”?", "์ด๋ฆ„์ด ๋ญ์˜ˆ์š”?",
"์ดํ•ด๊ฐ€ ์•ˆ ๋ผ์š”", "์ œ๋ฐœ", "์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค", "๋„ค", "์•„๋‹ˆ์š”", "์–ด๋””์— ์žˆ์–ด์š”"
]
}
# Fast lookup into the shared translation cache
def get_cached_translation(direction, text):
    """Return a previously computed translation, or None on a cache miss."""
    return translation_cache.get(f"{direction}:{text}")
# Create a worker pool for parallel translation
class ModelWorkerPool:
def __init__(self, num_workers=1):
self.num_workers = num_workers
self.request_queue = queue.Queue()
self.response_queue = queue.Queue()
self.workers = []
self.initialized = False
# Create shared model instance with optimized settings
print("Initializing model with CPU optimizations...")
start_time = time.time()
# CPU optimization settings - use fewer threads for Q8 model
cpu_count = multiprocessing.cpu_count()
optimal_threads = max(1, min(4, cpu_count - 1)) # Use fewer threads for better performance
# Create a smaller context size for faster inference
self.model = Llama(
model_path=base_model_path,
lora_path=adapter_path,
n_ctx=256, # Reduced context for faster processing
n_threads=optimal_threads, # Optimized thread count
n_batch=512, # Reduced batch size for CPU
use_mmap=True, # Efficient memory mapping
n_gpu_layers=0, # CPU only
seed=42, # Consistent results
verbose=False, # Reduce overhead
rope_freq_base=10000, # Default attention parameters
rope_freq_scale=1.0,
)
print(f"Model loaded in {time.time() - start_time:.2f} seconds")
# Start worker threads
for i in range(num_workers):
worker = threading.Thread(target=self._worker_loop, daemon=True)
worker.start()
self.workers.append(worker)
self.initialized = True
# Pre-warm in background thread to not block startup
warming_thread = threading.Thread(target=self._prewarm_model, daemon=True)
warming_thread.start()
def _worker_loop(self):
"""Worker thread that processes translation requests"""
while True:
try:
request = self.request_queue.get()
if request is None: # Shutdown signal
break
direction, text, callback_id = request
                # Check the shared translation cache first
                cache_key = f"{direction}:{text}"
                cached = get_cached_translation(direction, text)
                if cached is not None:
                    result = cached
                else:
                    # Process a new translation
                    result = self._process_translation(direction, text)
                    # Store in the shared cache, evicting the oldest entry when full
                    if len(translation_cache) >= MAX_CACHE_SIZE:
                        translation_cache.pop(next(iter(translation_cache)))
                    translation_cache[cache_key] = result
self.response_queue.put((callback_id, result))
self.request_queue.task_done()
except Exception as e:
print(f"Error in worker thread: {e}")
self.response_queue.put((callback_id, f"Error: {str(e)}"))
self.request_queue.task_done()
def _prewarm_model(self):
"""Pre-compute common translations to warm up the model - minimal to save time"""
print("Pre-warming model with essential phrases (truncated for speed)...")
start = time.time()
# Just warm up with one phrase per direction to speed up startup
for direction, phrases in COMMON_PHRASES.items():
self._process_translation(direction, phrases[0])
# Only do the most common phrase to save startup time
print(f"Basic model pre-warming completed in {time.time() - start:.2f} seconds")
def _process_translation(self, direction, text):
"""Optimized translation function"""
# Skip empty inputs
if not text or not text.strip():
return ""
# Start timing for performance tracking
start_time = time.time()
# Map language directions
lang_map = {
"English to Spanish": ("ENGLISH", "SPANISH"),
"Spanish to English": ("SPANISH", "ENGLISH"),
"Korean to English": ("KOREAN", "ENGLISH"),
"English to Korean": ("ENGLISH", "KOREAN")
}
if direction not in lang_map:
return "Invalid direction"
source_lang, target_lang = lang_map[direction]
# Truncate long inputs for faster processing
max_input_length = 100 # Limit input length
if len(text) > max_input_length:
text = text[:max_input_length] + "..."
# Efficient prompt format
prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
# Reduce max tokens for faster inference
input_tokens = len(text.split())
max_tokens = min(50, max(20, int(input_tokens * 1.2)))
# Generate translation with aggressive performance optimizations
try:
response = self.model.create_completion(
prompt,
max_tokens=max_tokens,
temperature=0.0, # Deterministic for faster inference
top_k=1, # Only consider most likely token
top_p=1.0, # No sampling
repeat_penalty=1.0, # No repeat penalty
stream=False, # Get complete response at once
stop=["[/", "\n\n"], # Stop early if possible
)
translation = response['choices'][0]['text'].strip()
# Log performance
inference_time = time.time() - start_time
tokens_per_second = (input_tokens + len(translation.split())) / inference_time
print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
return translation
except Exception as e:
print(f"Translation error: {e}")
return f"Error: Could not translate text. Try shorter input."
def request_translation(self, direction, text, callback_id):
"""Queue a translation request"""
self.request_queue.put((direction, text, callback_id))
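# Flow: request_translation() enqueues (direction, text, callback_id); a worker
# thread translates (or serves from cache) and puts (callback_id, result) on
# response_queue, which translate() below polls until it sees its own callback_id.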
# Create optimized worker pool - use just one worker for better performance with Q8 model on CPU
worker_pool = ModelWorkerPool(num_workers=1)
# Counter for request IDs
next_request_id = 0
# Fast similarity check function for finding close matches in cache
def find_similar_cached(direction, text, threshold=0.8):
"""Find similar translations in cache based on prefix matching"""
if len(text) < 5: # For very short inputs, look for exact matches
return None
text_lower = text.lower()
best_match = None
best_score = 0
for cached_key in list(translation_cache.keys()):
cached_dir, cached_text = cached_key.split(":", 1)
if cached_dir != direction:
continue
# Simple similarity - prefix matching
if cached_text.lower().startswith(text_lower[:5]):
similarity = min(1.0, len(text_lower) / max(1, len(cached_text.lower())))
if similarity > best_score and similarity > threshold:
best_score = similarity
best_match = translation_cache[cached_key]
return best_match
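# Illustrative example (hypothetical cache state): if "English to Spanish:Hello" is
# already cached, then find_similar_cached("English to Spanish", "Hello!") matches on
# the "hello" prefix, scores min(1.0, 6/5) = 1.0 > 0.8, and reuses the cached translation.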
# Gradio interface functions
def translate(direction, text, progress=gr.Progress()):
"""Queue translation request and wait for result - optimized version"""
global next_request_id
# Trim whitespace for better cache hits
text = text.strip()
# Skip empty inputs
if not text:
return ""
    # Check the shared translation cache first
    cached = get_cached_translation(direction, text)
    if cached is not None:
        return cached
# For short inputs, try to find similar cached
if len(text) < 20:
similar = find_similar_cached(direction, text)
if similar:
return similar
# Generate unique request ID
request_id = next_request_id
next_request_id += 1
# Queue the request
worker_pool.request_translation(direction, text, request_id)
# Wait for the response with reasonable timeout
progress(0, desc="Translating...")
max_wait = 20 # Reduced maximum wait time
start_time = time.time()
# Show progress while waiting
while time.time() - start_time < max_wait:
progress((time.time() - start_time) / max_wait)
        # Check for our response; keep any results that belong to other requests
        pending = []
        found = None
        while True:
            try:
                resp_id, result = worker_pool.response_queue.get_nowait()
            except queue.Empty:
                break
            if resp_id == request_id:
                found = result
            else:
                pending.append((resp_id, result))
        # Put back responses meant for other callers
        for item in pending:
            worker_pool.response_queue.put(item)
        if found is not None:
            # The worker thread has already stored this result in translation_cache
            progress(1.0)
            return found
# Small sleep to prevent CPU hogging - reduced for faster response
time.sleep(0.01)
progress(1.0)
return "Translation timed out. Please try a shorter text."
# Create Gradio interface with simplified UI for performance
with gr.Blocks(title="Fast CPU Translation App") as iface:
gr.Markdown(f"""
## Fast CPU Translation App
Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only - Optimized'}
**For best performance, use short sentences or phrases.**
""")
with gr.Row():
direction = gr.Radio(
choices=["English to Spanish", "Spanish to English", "English to Korean", "Korean to English"],
label="Translation Direction",
value="English to Spanish"
)
with gr.Row():
input_text = gr.Textbox(lines=3, label="Input Text", placeholder="Enter text to translate (shorter is faster)...")
output_text = gr.Textbox(lines=3, label="Translation")
# Add translate button
translate_btn = gr.Button("Translate")
translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
# Add examples with common short phrases for quick results
gr.Examples(
examples=[
["English to Spanish", "Hello"],
["Spanish to English", "Hola"],
["English to Korean", "Thank you"],
["Korean to English", "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค"]
],
inputs=[direction, input_text],
fn=translate,
outputs=output_text
)
# Add performance tips
gr.Markdown("""
### Performance Tips
- Keep text under 50 characters for fastest results
- Common phrases are pre-cached
    - The first translation may be slow; subsequent ones are faster
    - Repeated phrases are served from an in-memory cache
""")
# Launch with optimized settings
if __name__ == "__main__":
iface.launch(
debug=False,
show_error=True,
share=False,
quiet=True,
server_name="0.0.0.0",
server_port=7860
)
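    # To run locally (assuming torch, gradio, huggingface_hub and llama-cpp-python
    # are installed): `python app.py`, then open http://localhost:7860 in a browser.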