hmrizal committed on
Commit 9e0e548 · verified · 1 Parent(s): b01ddc0

update initialize_model_once and create_llm_pipeline for GGUF model, add llama_cpp, add fallback hierarchy system
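In rough terms, the GGUF path added here is: locate a .gguf file in the Hub repo, download it, and hand the local path to llama_cpp.Llama. A minimal sketch of that pattern, not the exact code from app.py (the repo id is one of the configured models; the prompt and generation call are purely illustrative):

# Sketch only: resolve and load a GGUF model the way this commit does.
from huggingface_hub import hf_hub_download, list_repo_files
from llama_cpp import Llama

repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"

# Pick the first .gguf file published in the repo.
gguf_files = [f for f in list_repo_files(repo_id) if f.endswith(".gguf")]
model_path = hf_hub_download(repo_id=repo_id, filename=gguf_files[0])

# CPU-friendly settings: small context window, few threads.
llm = Llama(model_path=model_path, n_ctx=2048, n_batch=512, n_threads=2)
out = llm("Q: What does this CSV column contain?\nA:", max_tokens=64)
print(out["choices"][0]["text"])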

Files changed (1)
  1. app.py +178 -29
app.py CHANGED
@@ -1,5 +1,7 @@
  import gradio as gr
  import os
  os.environ["CUDA_VISIBLE_DEVICES"] = "" # Force CPU only
  import uuid
  import threading
@@ -10,9 +12,13 @@ from langchain.embeddings import HuggingFaceEmbeddings
  from langchain.vectorstores import FAISS
  from langchain.llms import HuggingFacePipeline
  from langchain.chains import LLMChain
- from transformers import AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration, pipeline
  from langchain.prompts import PromptTemplate
- import time
  
  # Global model cache
  MODEL_CACHE = {
@@ -34,7 +40,7 @@ MODEL_CONFIG = {
      },
      "TinyLlama Chat": {
          "name": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-         "description": "Compact 1.1B parameter model, fast but less powerful",
          "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
      },
      "Mistral Instruct": {
@@ -44,12 +50,12 @@ MODEL_CONFIG = {
      },
      "Phi-4 Mini Instruct": {
          "name": "microsoft/Phi-4-mini-instruct",
-         "description": "Compact Microsoft model with strong instruction following",
          "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
      },
      "DeepSeek Coder Instruct": {
          "name": "deepseek-ai/deepseek-coder-1.3b-instruct",
-         "description": "1.3B model specialized for code understanding",
          "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
      },
      "DeepSeek Lite Chat": {
@@ -75,15 +81,22 @@ MODEL_CONFIG = {
      }
  }
  
  def initialize_model_once(model_key):
-     """Initialize the model once and cache it"""
      with MODEL_CACHE["init_lock"]:
          current_model = MODEL_CACHE["model_name"]
          if MODEL_CACHE["model"] is None or current_model != model_key:
-             # Clear previous model from memory if any
              if MODEL_CACHE["model"] is not None:
                  del MODEL_CACHE["model"]
-                 del MODEL_CACHE["tokenizer"]
              torch.cuda.empty_cache() if torch.cuda.is_available() else None
  
              model_info = MODEL_CONFIG[model_key]
@@ -92,8 +105,45 @@ def initialize_model_once(model_key):
  
              try:
                  print(f"Loading model: {model_name}")
-                 # Handle T5 models separately
-                 if model_info.get("is_t5", False):
                      MODEL_CACHE["tokenizer"] = T5Tokenizer.from_pretrained(model_name)
                      MODEL_CACHE["model"] = T5ForConditionalGeneration.from_pretrained(
                          model_name,
@@ -101,16 +151,27 @@ def initialize_model_once(model_key):
                          device_map="auto" if torch.cuda.is_available() else None,
                          low_cpu_mem_usage=True
                      )
                  else:
-                     # Load tokenizer and model with appropriate configuration
-                     MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
                      MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
                          model_name,
                          torch_dtype=model_info["dtype"],
                          device_map="auto" if torch.cuda.is_available() else None,
                          low_cpu_mem_usage=True,
                          trust_remote_code=True
                      )
                  print(f"Model {model_name} loaded successfully")
              except Exception as e:
                  import traceback
@@ -118,28 +179,39 @@ def initialize_model_once(model_key):
                  print(traceback.format_exc())
                  raise RuntimeError(f"Failed to load model {model_name}: {str(e)}")
  
-         if MODEL_CACHE["model"] is None or MODEL_CACHE["tokenizer"] is None:
-             raise ValueError(f"Model or tokenizer not initialized properly for {model_key}")
-
-         return MODEL_CACHE["tokenizer"], MODEL_CACHE["model"], model_info.get("is_t5", False)
  
  def create_llm_pipeline(model_key):
      """Create a new pipeline using the specified model"""
      try:
          print(f"Creating pipeline for model: {model_key}")
-         tokenizer, model, is_t5 = initialize_model_once(model_key)
  
-         if model is None or tokenizer is None:
-             raise ValueError(f"Model or tokenizer is None for {model_key}")
  
-         # Create appropriate pipeline based on model type
-         if is_t5:
              print("Creating T5 pipeline")
              pipe = pipeline(
                  "text2text-generation",
                  model=model,
                  tokenizer=tokenizer,
-                 max_new_tokens=128, # Reduced for better performance
                  temperature=0.3,
                  top_p=0.9,
                  return_full_text=False,
@@ -150,7 +222,7 @@ def create_llm_pipeline(model_key):
                  "text-generation",
                  model=model,
                  tokenizer=tokenizer,
-                 max_new_tokens=128, # Reduced for better performance
                  temperature=0.3,
                  top_p=0.9,
                  top_k=30,
@@ -159,13 +231,73 @@ def create_llm_pipeline(model_key):
              )
  
          print("Pipeline created successfully")
-         # Wrap pipeline in HuggingFacePipeline for LangChain compatibility
          return HuggingFacePipeline(pipeline=pipe)
      except Exception as e:
          import traceback
          print(f"Error creating pipeline: {str(e)}")
          print(traceback.format_exc())
-         raise RuntimeError(f"Failed to create pipeline: {str(e)}")
  
  def create_conversational_chain(db, file_path, model_key):
      llm = create_llm_pipeline(model_key)
@@ -523,10 +655,27 @@ def create_gradio_interface():
      def handle_process_file(file, model_key, sess_id):
          if file is None:
              return None, None, False, "Mohon upload file CSV terlebih dahulu."
-
-         chatbot = ChatBot(sess_id, model_key)
-         result = chatbot.process_file(file)
-         return chatbot, True, [(None, result)]
  
      process_button.click(
          fn=handle_process_file,
 
  import gradio as gr
+ import gc
  import os
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
  os.environ["CUDA_VISIBLE_DEVICES"] = "" # Force CPU only
  import uuid
  import threading
 
  from langchain.vectorstores import FAISS
  from langchain.llms import HuggingFacePipeline
  from langchain.chains import LLMChain
+ from transformers import AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration, BitsAndBytesConfig, pipeline
  from langchain.prompts import PromptTemplate
+ from llama_cpp import Llama
+ import re
+ import datetime
+ import warnings
+ warnings.filterwarnings('ignore')
  
  # Global model cache
  MODEL_CACHE = {
 
      },
      "TinyLlama Chat": {
          "name": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+         "description": "Model ringan dengan 1.1B parameter, cepat dan ringan",
          "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
      },
      "Mistral Instruct": {
 
      },
      "Phi-4 Mini Instruct": {
          "name": "microsoft/Phi-4-mini-instruct",
+         "description": "Model yang ringan dari Microsoft cocok untuk tugas instruksional",
          "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
      },
      "DeepSeek Coder Instruct": {
          "name": "deepseek-ai/deepseek-coder-1.3b-instruct",
+         "description": "1.3B model untuk kode dan analisis data",
          "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
      },
      "DeepSeek Lite Chat": {
 
      }
  }
  
+ # Add a fallback model to MODEL_CONFIG
+ MODEL_CONFIG["Fallback Model"] = {
+     "name": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+     "description": "Model sangat ringan untuk fallback",
+     "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
+ }
+ 
  def initialize_model_once(model_key):
      with MODEL_CACHE["init_lock"]:
          current_model = MODEL_CACHE["model_name"]
          if MODEL_CACHE["model"] is None or current_model != model_key:
+             # Clear previous model
              if MODEL_CACHE["model"] is not None:
                  del MODEL_CACHE["model"]
+             if MODEL_CACHE["tokenizer"] is not None:
+                 del MODEL_CACHE["tokenizer"]
              torch.cuda.empty_cache() if torch.cuda.is_available() else None
  
              model_info = MODEL_CONFIG[model_key]
 
  
              try:
                  print(f"Loading model: {model_name}")
+ 
+                 # Check whether this is a GGUF model
+                 if "GGUF" in model_name:
+                     # Download the model file first if it is not already present
+                     from huggingface_hub import hf_hub_download
+                     try:
+                         # Try to find the GGUF file in the repo
+                         repo_id = model_name
+                         model_path = hf_hub_download(
+                             repo_id=repo_id,
+                             filename="model.gguf"  # the filename may differ
+                         )
+                     except Exception as e:
+                         print(f"Couldn't find model.gguf, trying other filenames: {str(e)}")
+                         # Look for GGUF files under other names
+                         import requests
+                         from huggingface_hub import list_repo_files
+ 
+                         files = list_repo_files(repo_id)
+                         gguf_files = [f for f in files if f.endswith('.gguf')]
+ 
+                         if not gguf_files:
+                             raise ValueError(f"No GGUF files found in {repo_id}")
+ 
+                         # Use the first GGUF file that was found
+                         model_path = hf_hub_download(repo_id=repo_id, filename=gguf_files[0])
+ 
+                     # Load the GGUF model with llama-cpp-python
+                     MODEL_CACHE["model"] = Llama(
+                         model_path=model_path,
+                         n_ctx=2048,  # smaller context to save memory
+                         n_batch=512,
+                         n_threads=2  # match the 2 available vCPUs
+                     )
+                     MODEL_CACHE["tokenizer"] = None  # GGUF does not need a separate tokenizer
+                     MODEL_CACHE["is_gguf"] = True
+ 
+                 # Handle T5 models
+                 elif model_info.get("is_t5", False):
                      MODEL_CACHE["tokenizer"] = T5Tokenizer.from_pretrained(model_name)
                      MODEL_CACHE["model"] = T5ForConditionalGeneration.from_pretrained(
                          model_name,
 
                          device_map="auto" if torch.cuda.is_available() else None,
                          low_cpu_mem_usage=True
                      )
+                     MODEL_CACHE["is_gguf"] = False
+ 
+                 # Handle standard HF models
                  else:
+                     quantization_config = BitsAndBytesConfig(
+                         load_in_4bit=True,
+                         bnb_4bit_compute_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+                         bnb_4bit_quant_type="nf4",
+                         bnb_4bit_use_double_quant=True
+                     )
+                     MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                      MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
                          model_name,
+                         quantization_config=quantization_config,
                          torch_dtype=model_info["dtype"],
                          device_map="auto" if torch.cuda.is_available() else None,
                          low_cpu_mem_usage=True,
                          trust_remote_code=True
                      )
+                     MODEL_CACHE["is_gguf"] = False
+ 
                  print(f"Model {model_name} loaded successfully")
              except Exception as e:
                  import traceback
 
                  print(traceback.format_exc())
                  raise RuntimeError(f"Failed to load model {model_name}: {str(e)}")
  
+         return MODEL_CACHE["tokenizer"], MODEL_CACHE["model"], MODEL_CACHE.get("is_gguf", False)
  
  def create_llm_pipeline(model_key):
      """Create a new pipeline using the specified model"""
      try:
          print(f"Creating pipeline for model: {model_key}")
+         tokenizer, model, is_gguf = initialize_model_once(model_key)
  
+         if model is None:
+             raise ValueError(f"Model is None for {model_key}")
  
+         # For GGUF models from llama-cpp-python
+         if is_gguf:
+             # Build an adapter so the GGUF model can be used like an HF pipeline
+             from langchain.llms import LlamaCpp
+             llm = LlamaCpp(
+                 model_path=model.model_path,
+                 temperature=0.3,
+                 max_tokens=128,
+                 top_p=0.9,
+                 n_ctx=2048,
+                 streaming=False
+             )
+             return llm
+ 
+         # Create appropriate pipeline for HF models
+         elif MODEL_CONFIG[model_key].get("is_t5", False):
              print("Creating T5 pipeline")
              pipe = pipeline(
                  "text2text-generation",
                  model=model,
                  tokenizer=tokenizer,
+                 max_new_tokens=128,
                  temperature=0.3,
                  top_p=0.9,
                  return_full_text=False,
 
                  "text-generation",
                  model=model,
                  tokenizer=tokenizer,
+                 max_new_tokens=128,
                  temperature=0.3,
                  top_p=0.9,
                  top_k=30,
 
              )
  
          print("Pipeline created successfully")
          return HuggingFacePipeline(pipeline=pipe)
      except Exception as e:
          import traceback
          print(f"Error creating pipeline: {str(e)}")
          print(traceback.format_exc())
+ 
+         # Fall back to a simpler model if the main one fails
+         if model_key != "Fallback Model":
+             print("Trying fallback model")
+             try:
+                 return create_fallback_pipeline()
+             except Exception:
+                 raise RuntimeError(f"Failed to create pipeline: {str(e)}")
+         else:
+             raise RuntimeError(f"Failed to create pipeline: {str(e)}")
+ 
+ def create_fallback_pipeline():
+     """Create a fallback pipeline with a very small model"""
+     model_key = "Fallback Model"
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG[model_key]["name"])
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_CONFIG[model_key]["name"],
+         torch_dtype=MODEL_CONFIG[model_key]["dtype"],
+         device_map="auto" if torch.cuda.is_available() else None,
+         low_cpu_mem_usage=True
+     )
+ 
+     pipe = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         max_new_tokens=128,
+         temperature=0.3,
+         return_full_text=False,
+     )
+ 
+     return HuggingFacePipeline(pipeline=pipe)
+ 
+ def handle_model_loading_error(model_key, session_id):
+     """Handle model loading errors with fallback options"""
+     fallback_hierarchy = [
+         "DeepSeek Coder Instruct",  # 1.3B model
+         "Phi-4",  # 1.5B model
+         "TinyLlama-Chat",  # 1.1B model
+         "Flan-T5-Small"  # lightest option
+     ]
+ 
+     # If the model that failed is already the last fallback, return an error message
+     if model_key == fallback_hierarchy[-1]:
+         return None, f"Tidak dapat memuat model {model_key}. Harap coba lagi nanti."
+ 
+     # Find the position of the failed model in the hierarchy
+     try:
+         current_index = fallback_hierarchy.index(model_key)
+     except ValueError:
+         current_index = -1
+ 
+     # Try the next models in the hierarchy
+     for fallback_model in fallback_hierarchy[current_index+1:]:
+         try:
+             print(f"Trying fallback model: {fallback_model}")
+             chatbot = ChatBot(session_id, fallback_model)
+             return chatbot, f"Model {model_key} tidak tersedia. Menggunakan {fallback_model} sebagai alternatif."
+         except Exception as e:
+             print(f"Fallback model {fallback_model} also failed: {str(e)}")
+ 
+     return None, "Semua model gagal dimuat. Harap coba lagi nanti."
  
  def create_conversational_chain(db, file_path, model_key):
      llm = create_llm_pipeline(model_key)
 
      def handle_process_file(file, model_key, sess_id):
          if file is None:
              return None, None, False, "Mohon upload file CSV terlebih dahulu."
+ 
+         try:
+             chatbot = ChatBot(sess_id, model_key)
+             result = chatbot.process_file(file)
+             return chatbot, True, [(None, result)]
+         except Exception as e:
+             import traceback
+             print(f"Error processing file with {model_key}: {str(e)}")
+             print(traceback.format_exc())
+ 
+             # Try a fallback model instead
+             try:
+                 chatbot, message = handle_model_loading_error(model_key, sess_id)
+                 if chatbot is not None:
+                     result = chatbot.process_file(file)
+                     return chatbot, True, [(None, message), (None, result)]
+                 else:
+                     return None, False, [(None, message)]
+             except Exception as fb_err:
+                 error_msg = f"Error dengan model {model_key}: {str(e)}\n\nFallback juga gagal: {str(fb_err)}"
+                 return None, False, [(None, error_msg)]
  
      process_button.click(
          fn=handle_process_file,
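
The fallback hierarchy added above amounts to: try the requested model, then progressively smaller ones, and only give up when every candidate fails. A rough sketch of that control flow, under the assumption that it reuses create_llm_pipeline and the MODEL_CONFIG keys from app.py (the helper name below is hypothetical, not part of the commit):

# Sketch of the fallback idea; not the exact code from app.py.
def build_pipeline_with_fallback(preferred_key):
    # Requested model first, then smaller and smaller alternatives.
    hierarchy = [preferred_key, "DeepSeek Coder Instruct", "TinyLlama Chat", "Fallback Model"]
    last_err = None
    for key in hierarchy:
        try:
            return create_llm_pipeline(key)
        except Exception as e:
            print(f"{key} failed, trying the next model: {e}")
            last_err = e
    raise RuntimeError(f"All models in the fallback hierarchy failed: {last_err}")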