from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch

# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 has no pad token by default; reuse the EOS token so padding works below.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Load your custom dataset (replace the file paths with your own train/test files).
# Dataset format should be a plain text file with one example per line.
dataset = load_dataset(
    "text",
    data_files={"train": "path_to_train.txt", "test": "path_to_test.txt"},
)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator for causal language modeling (mlm=False builds labels from input_ids)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir="./logs",
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    evaluation_strategy="steps",
    eval_steps=500,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

print("Model fine-tuned and saved successfully!")
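
# --- Optional sanity check (illustrative sketch, not part of the original script) ---
# Reloads the weights saved to ./fine_tuned_gpt2 above and generates a short sample.
# The prompt string and sampling parameters here are placeholder assumptions; adjust
# them for your own data.
ft_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")
ft_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")
ft_model.eval()

prompt = "Once upon a time"  # hypothetical example prompt
inputs = ft_tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = ft_model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        top_p=0.95,
        pad_token_id=ft_tokenizer.eos_token_id,
    )
print(ft_tokenizer.decode(output_ids[0], skip_special_tokens=True))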