from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import gradio as gr
import multiprocessing
import time
import os

# Model paths - download models if not already cached
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)

# Get models
base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF", 
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF", 
    "articulate-V1-q8_0.gguf"
)

# CPU optimization settings
cpu_count = multiprocessing.cpu_count()
physical_cores = max(1, cpu_count // 2)  # Estimate physical cores
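# NOTE: cpu_count() reports logical cores; halving it is only a rough estimate of the
# physical core count and may undercount on machines without SMT/hyperthreading.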
optimal_threads = max(4, physical_cores - 1)  # Leave one core free for system
batch_size = int(os.environ.get("BATCH_SIZE", "512"))  # Configurable batch size

print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")

# Initialize model with optimized parameters
start_time = time.time()
llm = Llama(
    model_path=base_model_path,  # Base GGUF weights
    lora_path=adapter_path,       # LoRA adapter applied on top of the base model at load time
    n_ctx=512,                # Context length
    n_threads=optimal_threads, # Optimized thread count
    n_batch=batch_size,       # Process more tokens in parallel
    use_mmap=True,            # More efficient memory usage
    n_gpu_layers=0,           # CPU only
    seed=42,                  # Consistent results
    verbose=False             # Reduce logging overhead
)
print(f"Model loaded in {time.time() - start_time:.2f} seconds")

# Translation cache
translation_cache = {}
MAX_CACHE_SIZE = 100  # Limit cache size
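# Eviction is FIFO: dicts preserve insertion order (Python 3.7+), so the first key is the oldest entry.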

def translate(direction, text):
    # Skip empty inputs
    if not text or not text.strip():
        return ""
        
    # Check cache first for faster response
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]
    
    # Start timing for performance tracking
    start_time = time.time()
    
    # Map language directions
    lang_map = {
        "English to Spanish": ("ENGLISH", "SPANISH"),
        "Spanish to English": ("SPANISH", "ENGLISH"),
        "Korean to English": ("KOREAN", "ENGLISH"),
        "English to Korean": ("ENGLISH", "KOREAN")
    }
    
    if direction not in lang_map:
        return "Invalid direction"
    
    source_lang, target_lang = lang_map[direction]
    
    # Efficient prompt format
    prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
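    # The [SOURCE]text[TARGET] tag layout is the convention this adapter is prompted with;
    # the model's completion after the [TARGET] tag is taken as the translation.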
    
    # Estimate appropriate token length based on input
    input_tokens = len(text.split())
    max_tokens = min(200, max(50, int(input_tokens * 1.5)))
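    # Rough heuristic: allow ~1.5 output tokens per input word, clamped to the range
    # [50, 200] so short inputs still get room and long inputs keep latency bounded.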
    
    # Generate translation with optimized settings
    response = llm.create_completion(
        prompt,
        max_tokens=max_tokens,
        temperature=0.0,      # Deterministic for faster inference
        top_k=1,              # Only consider most likely token
        top_p=1.0,            # No sampling
        repeat_penalty=1.0,   # No repeat penalty processing
        stream=False          # Get complete response at once (faster)
    )
    
    translation = response['choices'][0]['text'].strip()
    
    # Cache result
    if len(translation_cache) >= MAX_CACHE_SIZE:
        # Remove oldest entry (first key)
        translation_cache.pop(next(iter(translation_cache)))
    translation_cache[cache_key] = translation
    
    # Log performance
    inference_time = max(time.time() - start_time, 1e-6)  # Avoid divide-by-zero in the rate below
    tokens_per_second = (input_tokens + len(translation.split())) / inference_time
    print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
    
    return translation

# Create Gradio interface with minimal overhead
with gr.Blocks(title="Fast Translation App") as iface:
    gr.Markdown("## Translation App")
    
    with gr.Row():
        direction = gr.Dropdown(
            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
            label="Translation Direction",
            value="English to Spanish"
        )
    
    with gr.Row():
        input_text = gr.Textbox(lines=5, label="Input Text")
        output_text = gr.Textbox(lines=5, label="Translation")
    
    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
    
    # Add examples for convenience
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello, how are you today?"],
            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
            ["English to Korean", "The weather is nice today."],
            ["Korean to English", "오늘 날씨가 좋습니다."]
        ],
        fn=translate,            # Required so cached examples can be pre-computed
        inputs=[direction, input_text],
        outputs=output_text,
        cache_examples=True      # Pre-compute example translations at startup
    )

# Launch with optimized settings
iface.launch(debug=False, show_error=True)