Commit 6f6768d · Thomas (Tom) Gardos committed · Parents: 30045eb, 902a706

Merge pull request #40 from DL4DS/remove_tinyllama

Remove TinyLlama from LFS and add caching mechanism

code/modules/chat/chat_model_loader.py CHANGED
@@ -5,6 +5,8 @@ from langchain_community.llms import LlamaCpp
 import torch
 import transformers
 import os
+from pathlib import Path
+from huggingface_hub import hf_hub_download
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 
@@ -14,6 +16,14 @@ class ChatModelLoader:
         self.config = config
         self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
+    def _verify_model_cache(self, model_cache_path):
+        hf_hub_download(
+            repo_id=self.config["llm_params"]["local_llm_params"]["repo_id"],
+            filename=self.config["llm_params"]["local_llm_params"]["filename"],
+            cache_dir=model_cache_path
+        )
+        return str(list(Path(model_cache_path).glob("*/snapshots/*/*.gguf"))[0])
+
     def load_chat_model(self):
         if self.config["llm_params"]["llm_loader"] == "openai":
             llm = ChatOpenAI(
@@ -21,7 +31,7 @@ class ChatModelLoader:
             )
         elif self.config["llm_params"]["llm_loader"] == "local_llm":
             n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
-            model_path = self.config["llm_params"]["local_llm_params"]["model"]
+            model_path = self._verify_model_cache(self.config["llm_params"]["local_llm_params"]["model"])
             llm = LlamaCpp(
                 model_path=model_path,
                 n_batch=n_batch,
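
The new _verify_model_cache method delegates downloading to hf_hub_download, which only fetches the GGUF file when it is not already present under cache_dir, so repeated startups reuse the cached copy. Below is a minimal sketch of the same flow, assuming the repo_id and filename added to config.yml and using the LLAMA_PATH directory from constants.py as the cache directory; how the 'tiny-llama' value in config.yml gets resolved to that directory is not shown in this diff.

from pathlib import Path

from huggingface_hub import hf_hub_download

# Keys mirror local_llm_params in config.yml; using the LLAMA_PATH value from
# constants.py as the cache directory is an assumption for this sketch.
local_llm_params = {
    "model": "../storage/models/tinyllama",  # cache directory, not a file
    "repo_id": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    "filename": "tinyllama-1.1b-chat-v1.0.Q5_0.gguf",
}

# First run downloads into the cache directory; later runs find the file
# already cached and return immediately.
cached_path = hf_hub_download(
    repo_id=local_llm_params["repo_id"],
    filename=local_llm_params["filename"],
    cache_dir=local_llm_params["model"],
)

# _verify_model_cache re-locates the file by globbing the hub cache layout.
globbed = list(Path(local_llm_params["model"]).glob("*/snapshots/*/*.gguf"))
print(cached_path, globbed[0] if globbed else None)

Since hf_hub_download already returns the resolved local file path, that return value could be passed straight to LlamaCpp; the glob is simply the approach this commit takes to recover the path from the cache directory.
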
code/modules/config/config.yml CHANGED
@@ -34,6 +34,8 @@ llm_params:
   local_llm_params:
     model: 'tiny-llama'
     temperature: 0.7
+    repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
+    filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
   pdf_reader: 'llama' # str [llama, pymupdf, gpt]
 
 chat_logging:
@@ -50,4 +52,4 @@ splitter_options:
   chunk_separators : ["\n\n", "\n", " ", ""] # list of strings
   front_chunks_to_remove : null # int or None
   last_chunks_to_remove : null # int or None
-  delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
+  delimiters_to_remove : ['\t', '\n', ' ', ' '] # list of strings
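
The repo_id and filename keys pin the local model to a specific GGUF artifact on the Hugging Face Hub. When editing these values, a quick sanity check such as the sketch below can confirm the filename actually exists in the repository before it is handed to hf_hub_download; the check is illustrative and not part of this commit.

from huggingface_hub import list_repo_files

# Values added to config.yml by this commit.
repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
filename = "tinyllama-1.1b-chat-v1.0.Q5_0.gguf"

# Fetch the repo's file listing and fail fast on a typo in the filename.
repo_files = list_repo_files(repo_id)
if filename not in repo_files:
    available = ", ".join(f for f in repo_files if f.endswith(".gguf"))
    raise ValueError(f"{filename!r} not found in {repo_id!r}; available GGUF files: {available}")
print(f"{filename} is available in {repo_id}")
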
code/modules/config/constants.py CHANGED
@@ -86,5 +86,5 @@ Question: {question}
 
 # Model Paths
 
-LLAMA_PATH = "../storage/models/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"
+LLAMA_PATH = "../storage/models/tinyllama"
 MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf"
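
LLAMA_PATH now points at a cache directory rather than an LFS-tracked .gguf file, which is what lets the model weights drop out of Git LFS. Assuming hf_hub_download has populated it with the standard huggingface_hub cache layout (models--<org>--<repo>/snapshots/<revision>/<filename>), the small sketch below shows why the */snapshots/*/*.gguf glob in _verify_model_cache finds the file; the layout is an assumption about the library's cache structure, not something visible in this diff.

from pathlib import Path

# Directory-style LLAMA_PATH from constants.py.
LLAMA_PATH = "../storage/models/tinyllama"

# Expected contents after a successful hf_hub_download into this cache_dir:
#   models--TheBloke--TinyLlama-1.1B-Chat-v1.0-GGUF/snapshots/<hash>/tinyllama-1.1b-chat-v1.0.Q5_0.gguf
for gguf in Path(LLAMA_PATH).glob("*/snapshots/*/*.gguf"):
    print(gguf.relative_to(LLAMA_PATH))
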