# Spaces: Running
import json

import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)
class HuggingFaceHelper:
    """Load a causal language model and offer inspect / test / fine-tune / generate helpers.

    The tokenizer and model are loaded from ``model_path`` at construction
    time. ``dataset_path`` is only needed for :meth:`load_dataset` and
    :meth:`fine_tune`.
    """

    def __init__(self, model_path="./merged_model", dataset_path=None):
        """Load the tokenizer and model found at *model_path*.

        Args:
            model_path: Directory or hub id passed to ``from_pretrained``.
            dataset_path: Optional path to a JSON/JSONL training file.
        """
        self.model_path = model_path
        self.dataset_path = dataset_path
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # Padding to a fixed length (see tokenize_function) needs a pad token;
        # many causal-LM tokenizers ship without one, so fall back to EOS.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, trust_remote_code=True, device_map="auto"
        )

    def check_model_integrity(self):
        """Print the name and size of every tensor in the model's state dict."""
        print("🔍 Checking model integrity...")
        # Build the state dict once; rebuilding it per parameter is wasteful.
        for name, tensor in self.model.state_dict().items():
            print(f"{name}: {tensor.size()}")
        print("✅ Model integrity check completed.")

    def test_pipeline(self):
        """Run one text-generation call as a smoke test and report the outcome.

        Any failure is reported on stdout rather than raised, so this check
        never aborts the caller.
        """
        try:
            pipe = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
            output = pipe("What is the future of AI?", max_length=100)
        except Exception as exc:  # deliberate best-effort diagnostic
            print(f"❌ Pipeline Error: {exc}")
        else:
            print("✅ Model successfully generates text:", output)

    def load_dataset(self):
        """Load the JSON dataset at ``self.dataset_path`` and tokenize it.

        Returns:
            The ``train`` split with the output of :meth:`tokenize_function`
            added by a batched ``map``.

        Raises:
            ValueError: If no dataset path was provided.
        """
        if not self.dataset_path:
            raise ValueError("Dataset path not provided.")
        # Inside a method, ``load_dataset`` resolves to the module-level
        # function from ``datasets``, not to this method.
        dataset = load_dataset("json", data_files=self.dataset_path, split="train")
        return dataset.map(self.tokenize_function, batched=True)

    def _render_messages(self, messages):
        """Return one ``messages`` entry as plain text the tokenizer accepts.

        Strings pass through untouched. Anything else is assumed to be a
        chat-style list of turns (role/content dicts) -- TODO confirm against
        the actual dataset schema. Such lists go through the tokenizer's chat
        template when one is defined, otherwise they are joined as
        ``"role: content"`` lines.
        """
        if isinstance(messages, str):
            return messages
        if getattr(self.tokenizer, "chat_template", None):
            return self.tokenizer.apply_chat_template(messages, tokenize=False)
        return "\n".join(
            turn
            if isinstance(turn, str)
            else f"{turn.get('role', '')}: {turn.get('content', '')}"
            for turn in messages
        )

    def tokenize_function(self, examples):
        """Tokenize the ``"messages"`` column, truncated and padded to 512 tokens.

        Args:
            examples: Mapping with a ``"messages"`` entry: a string, or a
                batch (list) whose items are strings or chat-style turn lists.

        Returns:
            Whatever the tokenizer returns for the rendered text.
        """
        raw = examples["messages"]
        # A single string is tokenized as-is; a batch is rendered item by item
        # so structured chat records become text first.
        texts = raw if isinstance(raw, str) else [self._render_messages(m) for m in raw]
        return self.tokenizer(texts, truncation=True, padding="max_length", max_length=512)

    def fine_tune(self, output_dir="./fine_tuned_model", epochs=3, batch_size=4):
        """Fine-tune the model on the configured dataset and save the result.

        Args:
            output_dir: Where checkpoints, logs and the final model are written.
            epochs: Number of training epochs.
            batch_size: Per-device batch size.

        Raises:
            ValueError: If no dataset path was provided.
        """
        dataset = self.load_dataset()

        # No evaluation strategy is set: there is no eval dataset, and asking
        # the Trainer to evaluate without one raises an error.
        training_args = TrainingArguments(
            output_dir=output_dir,
            save_strategy="epoch",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
            logging_dir=f"{output_dir}/logs",
            push_to_hub=False,
        )

        # A causal LM only returns a loss when it receives ``labels``. With
        # mlm=False this collator copies input_ids into labels and masks
        # padding positions.
        collator = DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=False)

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            tokenizer=self.tokenizer,
            data_collator=collator,
        )
        trainer.train()
        self.save_model(output_dir)

    def save_model(self, output_dir):
        """Write the model and tokenizer to *output_dir*."""
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print(f"✅ Model saved to {output_dir}")

    def generate_response(self, prompt, max_length=200):
        """Generate a continuation of *prompt* and return it as decoded text.

        Args:
            prompt: Input text.
            max_length: Forwarded to ``generate`` as ``max_length``.

        Returns:
            The first generated sequence, decoded with special tokens skipped.
        """
        # Follow the model's actual placement (it was loaded with
        # device_map="auto"); fall back to the device chosen at construction.
        target_device = getattr(self.model, "device", self.device)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(target_device)
        # Training leaves the model in train mode; switch back so dropout
        # does not perturb generation.
        self.model.eval()
        output = self.model.generate(**inputs, max_length=max_length)
        return self.tokenizer.decode(output[0], skip_special_tokens=True)
# Example usage: check the merged model, fine-tune it, then sample one reply.
if __name__ == "__main__":
    helper = HuggingFaceHelper(
        model_path="./merged_model",
        dataset_path="codette_training_data_finetune_fixed.jsonl",
    )

    # Sanity checks before training.
    helper.check_model_integrity()
    helper.test_pipeline()

    helper.fine_tune(output_dir="./codette_finetuned", epochs=3, batch_size=4)

    reply = helper.generate_response("How will AI impact cybersecurity?")
    print(reply)