import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)
from datasets import load_dataset


class HuggingFaceHelper:
    def __init__(self, model_path="./merged_model", dataset_path=None):
        self.model_path = model_path
        self.dataset_path = dataset_path
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, trust_remote_code=True, device_map="auto"
        )
        # Many causal LMs ship without a pad token; fall back to EOS so
        # padding in tokenize_function does not fail.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def check_model_integrity(self):
        """Print every parameter tensor and its shape as a quick sanity check."""
        print("🔍 Checking model integrity...")
        for name, tensor in self.model.state_dict().items():
            print(f"{name}: {tensor.size()}")
        print("✅ Model integrity check completed.")

    def test_pipeline(self):
        """Smoke-test generation through the high-level pipeline API."""
        try:
            pipe = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
            output = pipe("What is the future of AI?", max_length=100)
            print("✅ Model successfully generates text:", output)
        except Exception as e:
            print(f"❌ Pipeline Error: {e}")

    def load_dataset(self):
        if not self.dataset_path:
            raise ValueError("Dataset path not provided.")
        dataset = load_dataset("json", data_files=self.dataset_path, split="train")
        return dataset.map(self.tokenize_function, batched=True)

    def tokenize_function(self, examples):
        # Assumes each record's "messages" field holds plain text; see the
        # chat-template sketch at the bottom of this file for list-of-dict data.
        return self.tokenizer(
            examples["messages"], truncation=True, padding="max_length", max_length=512
        )

    def fine_tune(self, output_dir="./fine_tuned_model", epochs=3, batch_size=4):
        dataset = self.load_dataset()

        training_args = TrainingArguments(
            output_dir=output_dir,
            # No eval dataset is passed to the Trainer, so evaluation stays off;
            # "epoch" here would raise an error at Trainer construction.
            evaluation_strategy="no",
            save_strategy="epoch",
            per_device_train_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
            logging_dir=f"{output_dir}/logs",
            push_to_hub=False,
        )

        # The collator copies input_ids into labels (mlm=False) so the Trainer
        # can compute a causal-LM loss; without it, training has no labels.
        data_collator = DataCollatorForLanguageModeling(self.tokenizer, mlm=False)

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
        )

        trainer.train()
        self.save_model(output_dir)

    def save_model(self, output_dir):
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print(f"✅ Model saved to {output_dir}")

    def generate_response(self, prompt, max_length=200):
        # Place inputs on the model's own device, which is what device_map="auto"
        # actually selected (it may differ from self.device).
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        output = self.model.generate(**inputs, max_length=max_length)
        return self.tokenizer.decode(output[0], skip_special_tokens=True)


# Example usage
if __name__ == "__main__":
    helper = HuggingFaceHelper(
        model_path="./merged_model",
        dataset_path="codette_training_data_finetune_fixed.jsonl",
    )
    helper.check_model_integrity()
    helper.test_pipeline()
    helper.fine_tune(output_dir="./codette_finetuned", epochs=3, batch_size=4)
    print(helper.generate_response("How will AI impact cybersecurity?"))
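
# --- Hedged sketch: tokenizing chat-format records ---
# tokenize_function above assumes examples["messages"] holds plain strings.
# If the training JSONL instead stores OpenAI-style chat lists, e.g.
#   {"messages": [{"role": "user", "content": "Hi"},
#                 {"role": "assistant", "content": "Hello!"}]}
# the tokenizer call will fail on the nested dicts. One possible adaptation,
# assuming the tokenizer defines a chat template, is to flatten each
# conversation to a single string first:
#
#     def tokenize_function(self, examples):
#         texts = [
#             self.tokenizer.apply_chat_template(msgs, tokenize=False)
#             for msgs in examples["messages"]
#         ]
#         return self.tokenizer(
#             texts, truncation=True, padding="max_length", max_length=512
#         )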