johnpaulbin committed (verified)
Commit b27a850 · 1 parent: da4aea2

Update app.py

Files changed (1)
  1. app.py +120 -55
app.py CHANGED
@@ -1,77 +1,142 @@
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 import gradio as gr
+import multiprocessing
+import time
+import os
 
-# Download the base model
-base_model_repo = "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF"
-base_model_file = "articulate-11-expspanish-base-merged-q8_0.gguf"
-base_model_path = hf_hub_download(repo_id=base_model_repo, filename=base_model_file)
+# Model paths - download models if not already cached
+def get_model_path(repo_id, filename):
+    print(f"Obtaining {filename}...")
+    return hf_hub_download(repo_id=repo_id, filename=filename)
 
-# Download the LoRA adapter
-adapter_repo = "johnpaulbin/articulate-V1-Q8_0-GGUF"
-adapter_file = "articulate-V1-q8_0.gguf"
-adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)
-import multiprocessing
-# Optimize thread count based on available CPU cores
-# Use half the available cores for better performance with LLMs
+# Get models
+base_model_path = get_model_path(
+    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
+    "articulate-11-expspanish-base-merged-q8_0.gguf"
+)
+adapter_path = get_model_path(
+    "johnpaulbin/articulate-V1-Q8_0-GGUF",
+    "articulate-V1-q8_0.gguf"
+)
+
+# CPU optimization settings
 cpu_count = multiprocessing.cpu_count()
-optimal_threads = max(2, cpu_count // 2)
-print(f"Initializing model with {optimal_threads} threads...")
+physical_cores = max(1, cpu_count // 2)  # Estimate physical cores
+optimal_threads = max(4, physical_cores - 1)  # Leave one core free for system
+batch_size = int(os.environ.get("BATCH_SIZE", "512"))  # Configurable batch size
+
+print(f"Initializing model with {optimal_threads} threads and batch size {batch_size}...")
 
-# Initialize the Llama model with base model and adapter
+# Initialize model with optimized parameters
+start_time = time.time()
 llm = Llama(
     model_path=base_model_path,
     lora_path=adapter_path,
-    n_ctx=512,  # Context length, set manually since adapter lacks it
-    n_threads=optimal_threads,  # Adjust based on your system
-    use_mmap=True,
-    n_gpu_layers=0  # Set to >0 if GPU acceleration is desired and supported
+    n_ctx=512,  # Context length
+    n_threads=optimal_threads,  # Optimized thread count
+    n_batch=batch_size,  # Process more tokens in parallel
+    use_mmap=True,  # More efficient memory usage
+    n_gpu_layers=0,  # CPU only
+    seed=42,  # Consistent results
+    verbose=False  # Reduce logging overhead
 )
+print(f"Model loaded in {time.time() - start_time:.2f} seconds")
+
+# Translation cache
+translation_cache = {}
+MAX_CACHE_SIZE = 100  # Limit cache size
 
-# Define the translation function
 def translate(direction, text):
-    # Determine source and target languages based on direction
-    if direction == "English to Spanish":
-        source_lang = "ENGLISH"
-        target_lang = "SPANISH"
-    elif direction == "Spanish to English":
-        source_lang = "SPANISH"
-        target_lang = "ENGLISH"
-    elif direction == "Korean to English":
-        source_lang = "KOREAN"
-        target_lang = "ENGLISH"
-    elif direction == "English to Korean":
-        source_lang = "ENGLISH"
-        target_lang = "KOREAN"
-    else:
+    # Skip empty inputs
+    if not text or not text.strip():
+        return ""
+
+    # Check cache first for faster response
+    cache_key = f"{direction}:{text}"
+    if cache_key in translation_cache:
+        return translation_cache[cache_key]
+
+    # Start timing for performance tracking
+    start_time = time.time()
+
+    # Map language directions
+    lang_map = {
+        "English to Spanish": ("ENGLISH", "SPANISH"),
+        "Spanish to English": ("SPANISH", "ENGLISH"),
+        "Korean to English": ("KOREAN", "ENGLISH"),
+        "English to Korean": ("ENGLISH", "KOREAN")
+    }
+
+    if direction not in lang_map:
         return "Invalid direction"
 
-    # Construct the prompt for raw completion
-    prompt = f"[{source_lang}]{text}[{target_lang}]"
+    source_lang, target_lang = lang_map[direction]
+
+    # Efficient prompt format
+    prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
 
-    # Generate completion with deterministic settings (greedy decoding)
+    # Estimate appropriate token length based on input
+    input_tokens = len(text.split())
+    max_tokens = min(200, max(50, int(input_tokens * 1.5)))
+
+    # Generate translation with optimized settings
     response = llm.create_completion(
         prompt,
-        max_tokens=200,  # Limit output length
-        temperature=0,  # Greedy decoding
-        top_k=1  # Select the most probable token
+        max_tokens=max_tokens,
+        temperature=0.0,  # Deterministic for faster inference
+        top_k=1,  # Only consider most likely token
+        top_p=1.0,  # No sampling
+        repeat_penalty=1.0,  # No repeat penalty processing
+        stream=False  # Get complete response at once (faster)
    )
 
-    # Extract and return the generated text
-    return response['choices'][0]['text'].strip()
+    translation = response['choices'][0]['text'].strip()
+
+    # Cache result
+    if len(translation_cache) >= MAX_CACHE_SIZE:
+        # Remove oldest entry (first key)
+        translation_cache.pop(next(iter(translation_cache)))
+    translation_cache[cache_key] = translation
+
+    # Log performance
+    inference_time = time.time() - start_time
+    tokens_per_second = (input_tokens + len(translation.split())) / inference_time
+    print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
+
+    return translation
 
-# Define the Gradio interface
-direction_options = ["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"]
-iface = gr.Interface(
-    fn=translate,
-    inputs=[
-        gr.Dropdown(choices=direction_options, label="Translation Direction"),
-        gr.Textbox(lines=5, label="Input Text")
-    ],
-    outputs=gr.Textbox(lines=5, label="Translation"),
-    title="Translation App",
-    description="Translate text between English and Spanish using the Articulate V1 model."
-)
+# Create Gradio interface with minimal overhead
+with gr.Blocks(title="Fast Translation App") as iface:
+    gr.Markdown("## Translation App")
+
+    with gr.Row():
+        direction = gr.Dropdown(
+            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
+            label="Translation Direction",
+            value="English to Spanish"
+        )
+
+    with gr.Row():
+        input_text = gr.Textbox(lines=5, label="Input Text")
+        output_text = gr.Textbox(lines=5, label="Translation")
+
+    # Add translate button
+    translate_btn = gr.Button("Translate")
+    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
+
+    # Add examples for convenience
+    gr.Examples(
+        examples=[
+            ["English to Spanish", "Hello, how are you today?"],
+            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
+            ["English to Korean", "The weather is nice today."],
+            ["Korean to English", "오늘 날씨가 좋습니다."]
+        ],
+        inputs=[direction, input_text],
+        outputs=output_text,
+        cache_examples=True  # Pre-compute examples
+    )
 
-# Launch the app
-iface.launch(debug=True)
+# Launch with optimized settings
+iface.launch(debug=False, show_error=True)
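
For orientation, a minimal usage sketch of the reworked translate() with app.py's definitions in scope (assumptions: the GGUF downloads succeeded and iface.launch(...) has not yet taken over the process; the asserted string literals come from the diff above, while the actual translation text depends on the model):

    # Illustrative sketch - not part of the commit.
    text = "Hello, how are you today?"

    first = translate("English to Spanish", text)   # full llama.cpp inference
    second = translate("English to Spanish", text)  # served from translation_cache

    assert first == second                          # cache hit returns the same string
    assert translation_cache[f"English to Spanish:{text}"] == first  # key is "direction:text"
    assert translate("English to French", text) == "Invalid direction"  # not in lang_map
    assert translate("English to Spanish", "   ") == ""  # blank input short-circuits

Two behaviors worth noting: eviction pops the oldest inserted key, so the cache is FIFO rather than LRU (this relies on dicts preserving insertion order, guaranteed since Python 3.7); and gr.Examples(..., cache_examples=True) is given no fn=, which Gradio typically requires before it can pre-compute example outputs, so startup may need fn=translate added there.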