johnpaulbin committed on
Commit
9c9d112
·
verified ·
1 Parent(s): 253566e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -1
app.py CHANGED
# Fetch the LoRA adapter (GGUF quantized weights) from the Hugging Face Hub.
adapter_repo = "johnpaulbin/articulate-V1-Q8_0-GGUF"
adapter_file = "articulate-V1-q8_0.gguf"
adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)

# Build the Llama instance from the base model combined with the adapter.
llm = Llama(
    model_path=base_model_path,
    lora_path=adapter_path,
    n_ctx=512,       # context length; set manually since the adapter lacks it
    n_threads=2,     # fixed thread count; adjust based on your system
    n_gpu_layers=0,  # set to >0 if GPU acceleration is desired and supported
)
 
# Fetch the LoRA adapter (GGUF quantized weights) from the Hugging Face Hub.
adapter_repo = "johnpaulbin/articulate-V1-Q8_0-GGUF"
adapter_file = "articulate-V1-q8_0.gguf"
adapter_path = hf_hub_download(repo_id=adapter_repo, filename=adapter_file)

import multiprocessing

# Size the thread pool from the available CPU cores: half the logical cores
# (llama.cpp typically scales with physical cores, and cpu_count() reports
# logical ones), never fewer than 2.
try:
    cpu_count = multiprocessing.cpu_count()
except NotImplementedError:
    # cpu_count() can raise on platforms where the count is undeterminable;
    # fall back to the previous fixed default rather than crash at startup.
    cpu_count = 2
optimal_threads = max(2, cpu_count // 2)
print(f"Initializing model with {optimal_threads} threads...")

# Initialize the Llama model with base model and adapter
llm = Llama(
    model_path=base_model_path,
    lora_path=adapter_path,
    n_ctx=512,                   # Context length, set manually since adapter lacks it
    n_threads=optimal_threads,   # computed above from the CPU core count
    use_mmap=True,               # memory-map the model file to cut load time/RAM
    n_gpu_layers=0               # Set to >0 if GPU acceleration is desired and supported
)