Spaces:

johnpaulbin
/

googoo

Sleeping

App Files Files Community

googoo / app.py

johnpaulbin

Update app.py

c2b521a verified 2 months ago

raw

history blame

4.55 kB

	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	import gradio as gr
	import multiprocessing
	import time
	import os

	# Model paths
	def get_model_path(repo_id: str, filename: str) -> str:
	    """Download one file from a Hugging Face Hub repository.

	    Logs the file name to stdout, then delegates to ``hf_hub_download``.

	    Args:
	        repo_id: Hub repository identifier, e.g. ``"user/repo"``.
	        filename: Name of the file inside that repository.

	    Returns:
	        The value returned by ``hf_hub_download``; the rest of this script
	        uses it as the local path handed to ``Llama``.
	    """
	    print(f"Obtaining {filename}...")
	    return hf_hub_download(repo_id=repo_id, filename=filename)

	# Resolve the two GGUF files this app needs: the merged base model and the
	# adapter that is later passed to Llama as ``lora_path``.
	base_model_path = get_model_path(
	    repo_id="johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
	    filename="articulate-11-expspanish-base-merged-q8_0.gguf",
	)
	adapter_path = get_model_path(
	    repo_id="johnpaulbin/articulate-V1-Q8_0-GGUF",
	    filename="articulate-V1-q8_0.gguf",
	)

	# CPU sizing, kept deliberately conservative (the author cites memory
	# problems with larger values): half of the reported cores, clamped to the
	# range 1..8, and a fixed prompt batch of 128.
	batch_size = 128
	cpu_count = multiprocessing.cpu_count()
	optimal_threads = min(8, max(1, cpu_count // 2))

	print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")

	# Build the llama.cpp model (base weights plus adapter) for CPU-only
	# inference and report how long construction took.
	start_time = time.time()
	llm = Llama(
	    model_path=base_model_path,
	    lora_path=adapter_path,
	    n_gpu_layers=0,  # keep every layer on the CPU
	    n_threads=optimal_threads,
	    n_batch=batch_size,  # smaller batch chosen by the author for stability
	    n_ctx=512,
	    use_mmap=True,
	    verbose=False,
	)
	print(f"Model loaded in {time.time() - start_time:.2f} seconds")

	# Simple translation cache (limited size). Keys are "<direction>:<text>".
	# Dict insertion order is what lets the oldest entry be evicted first.
	translation_cache = {}
	MAX_CACHE_SIZE = 50  # Reduced cache size

	# UI direction label -> (source tag, target tag) used to build the prompt.
	# Built once at import time rather than on every translate() call.
	LANG_MAP = {
	    "English to Spanish": ("ENGLISH", "SPANISH"),
	    "Spanish to English": ("SPANISH", "ENGLISH"),
	    "Korean to English": ("KOREAN", "ENGLISH"),
	    "English to Korean": ("ENGLISH", "KOREAN"),
	}


	def translate(direction: str, text: str) -> str:
	    """Translate ``text`` according to ``direction`` using the loaded model.

	    Args:
	        direction: One of the keys of ``LANG_MAP`` (e.g.
	            ``"English to Spanish"``).
	        text: The text to translate; surrounding whitespace is ignored.

	    Returns:
	        - ``""`` when ``text`` is empty, ``None`` or only whitespace;
	        - the cached result when this (direction, text) pair was seen before;
	        - ``"Invalid direction"`` when ``direction`` is not a known label;
	        - otherwise the model's completion, stripped of surrounding
	          whitespace;
	        - ``"Error during translation: ..."`` if anything raises while
	          generating or caching, so the error shows in the output box
	          instead of propagating.
	    """
	    # Validate input
	    if not text or not text.strip():
	        return ""

	    text = text.strip()

	    # Single lookup; cached values are always strings, so None means "miss".
	    cache_key = f"{direction}:{text}"
	    cached = translation_cache.get(cache_key)
	    if cached is not None:
	        return cached

	    # Monotonic clock: only used to measure the elapsed interval below.
	    start_time = time.perf_counter()

	    languages = LANG_MAP.get(direction)
	    if languages is None:
	        return "Invalid direction"
	    source_lang, target_lang = languages

	    # Prompt format: [SOURCE]text[TARGET]; the completion is the translation.
	    prompt = f"[{source_lang}]{text}[{target_lang}]"

	    try:
	        # Greedy decoding: temperature 0 / top_k 1 picks the most likely
	        # token at each step, so results are deterministic and cacheable.
	        response = llm.create_completion(
	            prompt,
	            max_tokens=128,
	            temperature=0.0,
	            top_k=1,
	            top_p=1.0,
	            repeat_penalty=1.0,
	            stream=False,
	        )

	        translation = response['choices'][0]['text'].strip()

	        # Manage cache size: drop the oldest (first-inserted) entry when full.
	        if len(translation_cache) >= MAX_CACHE_SIZE:
	            translation_cache.pop(next(iter(translation_cache)))
	        translation_cache[cache_key] = translation

	        # Log performance
	        inference_time = time.perf_counter() - start_time
	        print(f"Translation completed in {inference_time:.3f}s")

	        return translation

	    except Exception as e:
	        # UI boundary: log and report the failure as the output text.
	        # Failed translations are intentionally not cached.
	        print(f"Translation error: {e}")
	        return f"Error during translation: {str(e)}"

	# Create Gradio interface: a direction dropdown, side-by-side input/output
	# text boxes, a button wired to translate(), and clickable examples.
	with gr.Blocks(title="Translation App") as iface:
	    gr.Markdown("## Fast Translation App")

	    with gr.Row():
	        # These labels must match the direction strings translate() accepts.
	        direction = gr.Dropdown(
	            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
	            label="Translation Direction",
	            value="English to Spanish",
	        )

	    with gr.Row():
	        input_text = gr.Textbox(lines=5, label="Input Text")
	        output_text = gr.Textbox(lines=5, label="Translation")

	    # Add translate button
	    translate_btn = gr.Button("Translate")
	    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

	    # Examples WITHOUT caching (to avoid memory issues): they only fill in
	    # the inputs, since no fn/outputs are given here.
	    gr.Examples(
	        examples=[
	            ["English to Spanish", "Hello, how are you today?"],
	            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
	            ["English to Korean", "The weather is nice today."],
	            ["Korean to English", "오늘 날씨가 좋습니다."],
	        ],
	        inputs=[direction, input_text],
	        cache_examples=False,  # Disabled caching to prevent memory issues
	    )

	# Start serving the UI with debug mode off and error display turned on.
	iface.launch(show_error=True, debug=False)