johnpaulbin committed (verified)
Commit b27a850 · 1 parent: da4aea2

Update app.py

Files changed (1)
  1. app.py +120 -55
app.py CHANGED
@@ -1,77 +1,142 @@
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 import gradio as gr
+import multiprocessing
+import time
+import os
 
-# Download the base model
-base_model_repo = "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF"
-base_model_file = "articulate-11-expspanish-base-merged-q8_0.gguf"
-base_model_path = hf_hub_download(repo_id=base_model_repo, filename=base_model_file)
+# Model paths - download models if not already cached
+def get_model_path(repo_id, filename):
+    print(f"Obtaining {filename}...")
+    return hf_hub_download(repo_id=repo_id, filename=filename)
 
-# Download the LoRA adapter
-adapter_repo = "johnpaulbin/articulate-V1-Q8_0-GGUF"
-adapter_file = "articulate-V1-q8_0.gguf"
-adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)
-import multiprocessing
-# Optimize thread count based on available CPU cores
-# Use half the available cores for better performance with LLMs
+# Get models
+base_model_path = get_model_path(
+    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
+    "articulate-11-expspanish-base-merged-q8_0.gguf"
+)
+adapter_path = get_model_path(
+    "johnpaulbin/articulate-V1-Q8_0-GGUF",
+    "articulate-V1-q8_0.gguf"
+)
+
+# CPU optimization settings
 cpu_count = multiprocessing.cpu_count()
-optimal_threads = max(2, cpu_count // 2)
-print(f"Initializing model with {optimal_threads} threads...")
+physical_cores = max(1, cpu_count // 2)  # Estimate physical cores
+optimal_threads = max(4, physical_cores - 1)  # Leave one core free for system
+batch_size = int(os.environ.get("BATCH_SIZE", "512"))  # Configurable batch size
+
+print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")
 
-# Initialize the Llama model with base model and adapter
+# Initialize model with optimized parameters
+start_time = time.time()
 llm = Llama(
     model_path=base_model_path,
     lora_path=adapter_path,
-    n_ctx=512,  # Context length, set manually since adapter lacks it
-    n_threads=optimal_threads,  # Adjust based on your system
-    use_mmap=True,
-    n_gpu_layers=0  # Set to >0 if GPU acceleration is desired and supported
+    n_ctx=512,  # Context length
+    n_threads=optimal_threads,  # Optimized thread count
+    n_batch=batch_size,  # Process more tokens in parallel
+    use_mmap=True,  # More efficient memory usage
+    n_gpu_layers=0,  # CPU only
+    seed=42,  # Consistent results
+    verbose=False  # Reduce logging overhead
 )
+print(f"Model loaded in {time.time() - start_time:.2f} seconds")
+
+# Translation cache
+translation_cache = {}
+MAX_CACHE_SIZE = 100  # Limit cache size
 
-# Define the translation function
 def translate(direction, text):
-    # Determine source and target languages based on direction
-    if direction == "English to Spanish":
-        source_lang = "ENGLISH"
-        target_lang = "SPANISH"
-    elif direction == "Spanish to English":
-        source_lang = "SPANISH"
-        target_lang = "ENGLISH"
-    elif direction == "Korean to English":
-        source_lang = "KOREAN"
-        target_lang = "ENGLISH"
-    elif direction == "English to Korean":
-        source_lang = "ENGLISH"
-        target_lang = "KOREAN"
-    else:
+    # Skip empty inputs
+    if not text or not text.strip():
+        return ""
+
+    # Check cache first for faster response
+    cache_key = f"{direction}:{text}"
+    if cache_key in translation_cache:
+        return translation_cache[cache_key]
+
+    # Start timing for performance tracking
+    start_time = time.time()
+
+    # Map language directions
+    lang_map = {
+        "English to Spanish": ("ENGLISH", "SPANISH"),
+        "Spanish to English": ("SPANISH", "ENGLISH"),
+        "Korean to English": ("KOREAN", "ENGLISH"),
+        "English to Korean": ("ENGLISH", "KOREAN")
+    }
+
+    if direction not in lang_map:
         return "Invalid direction"
 
-    # Construct the prompt for raw completion
-    prompt = f"[{source_lang}]{text}[{target_lang}]"
+    source_lang, target_lang = lang_map[direction]
+
+    # Efficient prompt format
+    prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
 
-    # Generate completion with deterministic settings (greedy decoding)
+    # Estimate appropriate token length based on input
+    input_tokens = len(text.split())
+    max_tokens = min(200, max(50, int(input_tokens * 1.5)))
+
+    # Generate translation with optimized settings
     response = llm.create_completion(
         prompt,
-        max_tokens=200,  # Limit output length
-        temperature=0,  # Greedy decoding
-        top_k=1  # Select the most probable token
+        max_tokens=max_tokens,
+        temperature=0.0,  # Deterministic for faster inference
+        top_k=1,  # Only consider most likely token
+        top_p=1.0,  # No sampling
+        repeat_penalty=1.0,  # No repeat penalty processing
+        stream=False  # Get complete response at once (faster)
    )
 
-    # Extract and return the generated text
-    return response['choices'][0]['text'].strip()
+    translation = response['choices'][0]['text'].strip()
+
+    # Cache result
+    if len(translation_cache) >= MAX_CACHE_SIZE:
+        # Remove oldest entry (first key)
+        translation_cache.pop(next(iter(translation_cache)))
+    translation_cache[cache_key] = translation
+
+    # Log performance
+    inference_time = time.time() - start_time
+    tokens_per_second = (input_tokens + len(translation.split())) / inference_time
+    print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
+
+    return translation
 
-# Define the Gradio interface
-direction_options = ["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"]
-iface = gr.Interface(
-    fn=translate,
-    inputs=[
-        gr.Dropdown(choices=direction_options, label="Translation Direction"),
-        gr.Textbox(lines=5, label="Input Text")
-    ],
-    outputs=gr.Textbox(lines=5, label="Translation"),
-    title="Translation App",
-    description="Translate text between English and Spanish using the Articulate V1 model."
-)
+# Create Gradio interface with minimal overhead
+with gr.Blocks(title="Fast Translation App") as iface:
+    gr.Markdown("## Translation App")
+
+    with gr.Row():
+        direction = gr.Dropdown(
+            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
+            label="Translation Direction",
+            value="English to Spanish"
+        )
+
+    with gr.Row():
+        input_text = gr.Textbox(lines=5, label="Input Text")
+        output_text = gr.Textbox(lines=5, label="Translation")
+
+    # Add translate button
+    translate_btn = gr.Button("Translate")
+    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
+
+    # Add examples for convenience
+    gr.Examples(
+        examples=[
+            ["English to Spanish", "Hello, how are you today?"],
+            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
+            ["English to Korean", "The weather is nice today."],
+            ["Korean to English", "오늘 날씨가 좋습니다."]
+        ],
+        inputs=[direction, input_text],
+        outputs=output_text,
+        cache_examples=True  # Pre-compute examples
+    )
 
-# Launch the app
-iface.launch(debug=True)
+# Launch with optimized settings
+iface.launch(debug=False, show_error=True)
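
For orientation, a minimal usage sketch of the reworked translate() with app.py's definitions in scope (assumptions: the GGUF downloads succeeded and iface.launch(...) has not yet taken over the process; the asserted string literals come from the diff above, while the actual translation text depends on the model):

    # Illustrative sketch - not part of the commit.
    text = "Hello, how are you today?"

    first = translate("English to Spanish", text)   # full llama.cpp inference
    second = translate("English to Spanish", text)  # served from translation_cache

    assert first == second                          # cache hit returns the same string
    assert translation_cache[f"English to Spanish:{text}"] == first  # key is "direction:text"
    assert translate("English to French", text) == "Invalid direction"  # not in lang_map
    assert translate("English to Spanish", "   ") == ""  # blank input short-circuits

Two behaviors worth noting: eviction pops the oldest inserted key, so the cache is FIFO rather than LRU (this relies on dicts preserving insertion order, guaranteed since Python 3.7); and gr.Examples(..., cache_examples=True) is given no fn=, which Gradio typically requires before it can pre-compute example outputs, so startup may need fn=translate added there.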