Spaces:

johnpaulbin
/

googoo

Sleeping

App Files Files Community

googoo / app.py

johnpaulbin

Update app.py

b27a850 verified 2 months ago

raw

history blame

4.96 kB

	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	import gradio as gr
	import multiprocessing
	import time
	import os

	# Model paths - download models if not already cached
	def get_model_path(repo_id, filename):
	    """Fetch ``filename`` from the Hub repository ``repo_id``.

	    Logs which file is being obtained, then delegates to
	    ``hf_hub_download`` and returns its result unchanged (presumably the
	    local path of the cached file; see the huggingface_hub docs).
	    """
	    print(f"Obtaining {filename}...")
	    local_path = hf_hub_download(repo_id=repo_id, filename=filename)
	    return local_path

	# Resolve the base model and the adapter through get_model_path;
	# both results are handed to the Llama constructor further down.
	base_model_path = get_model_path(
	    repo_id="johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
	    filename="articulate-11-expspanish-base-merged-q8_0.gguf",
	)
	adapter_path = get_model_path(
	    repo_id="johnpaulbin/articulate-V1-Q8_0-GGUF",
	    filename="articulate-V1-q8_0.gguf",
	)

	# CPU optimization settings
	cpu_count = multiprocessing.cpu_count()
	# Rough estimate of physical cores: assumes two logical CPUs per core.
	physical_cores = max(1, cpu_count // 2)
	# Prefer at least 4 threads and leave one estimated core free on larger
	# hosts, but never ask for more threads than there are logical CPUs:
	# the previous floor of 4 oversubscribed machines with fewer than 4 CPUs.
	optimal_threads = max(1, min(cpu_count, max(4, physical_cores - 1)))
	# Batch size can be overridden through the BATCH_SIZE environment variable.
	batch_size = int(os.environ.get("BATCH_SIZE", "512"))

	print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")

	# Build the shared Llama instance (base model plus adapter) and report
	# how long construction took.
	start_time = time.time()
	llm = Llama(
	    # Weights
	    model_path=base_model_path,
	    lora_path=adapter_path,
	    # Sizing: context length, prompt batch size, worker threads
	    n_ctx=512,
	    n_batch=batch_size,
	    n_threads=optimal_threads,
	    # CPU only, with memory-mapped model loading
	    n_gpu_layers=0,
	    use_mmap=True,
	    # Fixed seed for consistent results; quiet logging
	    seed=42,
	    verbose=False,
	)
	print(f"Model loaded in {time.time() - start_time:.2f} seconds")

	# Translation cache
	# Module-level dict that translate() consults before running the model and
	# fills with each new result, keyed by a string built from the direction
	# and the input text.
	translation_cache = {}
	# Once the cache holds this many entries, translate() drops the first
	# (oldest-inserted) key before adding a new one.
	MAX_CACHE_SIZE = 100 # Limit cache size

	def translate(direction, text):
	    """Translate ``text`` in the given ``direction`` with the shared ``llm``.

	    Args:
	        direction: One of the supported direction labels, e.g.
	            "English to Spanish".
	        text: Text to translate; leading/trailing whitespace is ignored.

	    Returns:
	        The stripped completion text. Returns "" for empty or
	        whitespace-only input and the string "Invalid direction" when
	        ``direction`` is not a supported label.
	    """
	    # Skip empty inputs
	    if not text or not text.strip():
	        return ""

	    # Map language directions to the tags used in the prompt
	    lang_map = {
	        "English to Spanish": ("ENGLISH", "SPANISH"),
	        "Spanish to English": ("SPANISH", "ENGLISH"),
	        "Korean to English": ("KOREAN", "ENGLISH"),
	        "English to Korean": ("ENGLISH", "KOREAN"),
	    }

	    # Validate before touching the cache; invalid directions are never cached.
	    if direction not in lang_map:
	        return "Invalid direction"
	    source_lang, target_lang = lang_map[direction]

	    # The prompt only ever sees the stripped text, so key the cache on the
	    # stripped text too: inputs differing only in surrounding whitespace
	    # then share one entry instead of re-running the model.
	    cleaned = text.strip()
	    cache_key = f"{direction}:{cleaned}"
	    if cache_key in translation_cache:
	        return translation_cache[cache_key]

	    # perf_counter is monotonic and high-resolution, unlike time.time()
	    start_time = time.perf_counter()

	    prompt = f"[{source_lang}]{cleaned}[{target_lang}]"

	    # Size the generation budget from the whitespace-separated word count
	    # (a rough stand-in for tokens), bounded to the range 50..200.
	    input_tokens = len(cleaned.split())
	    max_tokens = min(200, max(50, int(input_tokens * 1.5)))

	    # Greedy decoding: temperature 0 with top_k=1, no repeat penalty
	    response = llm.create_completion(
	        prompt,
	        max_tokens=max_tokens,
	        temperature=0.0,
	        top_k=1,
	        top_p=1.0,
	        repeat_penalty=1.0,
	        stream=False,
	    )

	    translation = response['choices'][0]['text'].strip()

	    # Cache result, evicting the oldest-inserted entry when full
	    if len(translation_cache) >= MAX_CACHE_SIZE:
	        translation_cache.pop(next(iter(translation_cache)))
	    translation_cache[cache_key] = translation

	    # Log performance; guard against a zero elapsed time so logging can
	    # never raise ZeroDivisionError and lose the translation.
	    inference_time = time.perf_counter() - start_time
	    word_total = input_tokens + len(translation.split())
	    tokens_per_second = word_total / inference_time if inference_time > 0 else 0.0
	    print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")

	    return translation

	# Create Gradio interface: a direction dropdown, input/output text boxes,
	# a button wired to translate(), and a set of clickable examples.
	with gr.Blocks(title="Fast Translation App") as iface:
	    gr.Markdown("## Translation App")

	    with gr.Row():
	        direction = gr.Dropdown(
	            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
	            label="Translation Direction",
	            value="English to Spanish",
	        )

	    with gr.Row():
	        input_text = gr.Textbox(lines=5, label="Input Text")
	        output_text = gr.Textbox(lines=5, label="Translation")

	    # Add translate button
	    translate_btn = gr.Button("Translate")
	    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

	    # Add examples for convenience
	    gr.Examples(
	        examples=[
	            ["English to Spanish", "Hello, how are you today?"],
	            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
	            ["English to Korean", "The weather is nice today."],
	            ["Korean to English", "오늘 날씨가 좋습니다."],
	        ],
	        inputs=[direction, input_text],
	        outputs=output_text,
	        # Caching examples requires the function that produces the outputs;
	        # without fn, Gradio cannot pre-compute them.
	        fn=translate,
	        cache_examples=True,  # Pre-compute examples
	    )

	# Launch the app; show_error surfaces handler exceptions in the UI
	iface.launch(debug=False, show_error=True)