Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -11,13 +11,20 @@ base_model_path = hf_hub_download(repo_id=base_model_repo, filename=base_model_f
|
|
11 |
adapter_repo = "johnpaulbin/articulate-V1-Q8_0-GGUF"
|
12 |
adapter_file = "articulate-V1-q8_0.gguf"
|
13 |
adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# Initialize the Llama model with base model and adapter
|
16 |
llm = Llama(
|
17 |
model_path=base_model_path,
|
18 |
lora_path=adapter_path,
|
19 |
n_ctx=512, # Context length, set manually since adapter lacks it
|
20 |
-
n_threads=
|
|
|
21 |
n_gpu_layers=0 # Set to >0 if GPU acceleration is desired and supported
|
22 |
)
|
23 |
|
|
|
11 |
adapter_repo = "johnpaulbin/articulate-V1-Q8_0-GGUF"
|
12 |
adapter_file = "articulate-V1-q8_0.gguf"
|
13 |
adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)
|
14 |
+
import multiprocessing
|
15 |
+
# Optimize thread count based on available CPU cores
|
16 |
+
# Use half the reported cores (minimum 2): cpu_count() counts logical cores, and LLM inference usually runs best with roughly one thread per physical core
|
17 |
+
cpu_count = multiprocessing.cpu_count()
|
18 |
+
optimal_threads = max(2, cpu_count // 2)
|
19 |
+
print(f"Initializing model with {optimal_threads} threads...")
|
20 |
|
21 |
# Initialize the Llama model with base model and adapter
|
22 |
llm = Llama(
|
23 |
model_path=base_model_path,
|
24 |
lora_path=adapter_path,
|
25 |
n_ctx=512, # Context length, set manually since adapter lacks it
|
26 |
+
n_threads=optimal_threads, # Derived from the CPU count above (half the cores, minimum 2)
|
27 |
+
use_mmap=True,
|
28 |
n_gpu_layers=0 # Set to >0 if GPU acceleration is desired and supported
|
29 |
)
|
30 |
|