izakpaul2002 committed
Commit 06a7d5d · verified · 1 Parent(s): 9aaacf1

Create app.py

Files changed (1)
  1. app.py +59 -0
app.py ADDED
@@ -0,0 +1,59 @@
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
+ from datasets import load_dataset
+ import torch
+
+ # Load the pre-trained GPT-2 model and tokenizer
+ model_name = "gpt2"
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+ tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token; reuse EOS so padding below works
+ model = GPT2LMHeadModel.from_pretrained(model_name)
+
+ # Load your custom dataset (replace 'path_to_train.txt' / 'path_to_test.txt' with your dataset paths)
+ # Dataset format should be a text file with one example per line (see the illustration below).
+ dataset = load_dataset("text", data_files={"train": "path_to_train.txt", "test": "path_to_test.txt"})
+
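+ # Illustrative only -- hypothetical contents of path_to_train.txt, plain text, one example per line:
+ #   Once upon a time, a tiny language model learned to finish sentences.
+ #   Every line in the file is treated as an independent training example.
+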
+ # Tokenize the dataset
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+
+ # Set up the data collator (pads batches and derives the causal-LM labels from input_ids)
+ from transformers import DataCollatorForLanguageModeling
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir="./results",
+     overwrite_output_dir=True,
+     num_train_epochs=3,
+     per_device_train_batch_size=8,
+     save_steps=500,
+     save_total_limit=2,
+     prediction_loss_only=True,
+     logging_dir="./logs",
+     learning_rate=5e-5,
+     warmup_steps=500,
+     weight_decay=0.01,
+     fp16=torch.cuda.is_available(),
+     evaluation_strategy="steps",
+     eval_steps=500
+ )
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_datasets["train"],
+     eval_dataset=tokenized_datasets["test"],
+     tokenizer=tokenizer,
+     data_collator=data_collator,
+ )
+
+ # Fine-tune the model
+ trainer.train()
+
+ # Save the fine-tuned model and tokenizer
+ trainer.save_model("./fine_tuned_gpt2")
+ tokenizer.save_pretrained("./fine_tuned_gpt2")
+
+ print("Model fine-tuned and saved successfully!")
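
Once the script has run, the checkpoint saved under ./fine_tuned_gpt2 can be reloaded for generation. A minimal sketch, not part of the commit (the prompt string is purely illustrative):

# Load the fine-tuned checkpoint and generate a short continuation.
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")
model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")

prompt = "Once upon a time"  # hypothetical prompt, purely illustrative
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_p=0.95)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))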