import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Authenticate with Hugging Face (expects HUGGING_FACE_HUB_TOKEN to be set in the environment)
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
login(token=hf_token)

# Load Pretrained TinyLlama Model & Tokenizer
MODEL_NAME = "vv876803/tinyllama-victor"  # Replace with your TinyLlama model name if different
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)

# Llama-family tokenizers often ship without a pad token; reuse EOS so dynamic padding works
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare LoRA for Efficient Training
peft_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)
model = prepare_model_for_kbit_training(model)  # Freezes base weights and enables gradient checkpointing (intended for quantized models, harmless here)
model = get_peft_model(model, peft_config)
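model.print_trainable_parameters()  # Sanity check: only the LoRA adapter weights should be trainable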

# Load Dataset (Example: OpenAssistant OASST1)
dataset = load_dataset("OpenAssistant/oasst1", split="train[:10%]")

# Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)

# Hold out a small evaluation split so the per-epoch evaluation below has data to run on
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # Causal Language Modeling (not MLM)
)

# Training Arguments (CPU-only settings; no mixed precision since there is no GPU)
training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned",  # Directory to save the fine-tuned model
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save after each epoch
    per_device_train_batch_size=2,  # Small batch size due to limited resources
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs",  # Log directory
    logging_steps=10,
    save_total_limit=2,  # Limit the number of saved checkpoints
    fp16=False,  # Use float32 as we're using CPU
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./tinyllama-finetuned")
tokenizer.save_pretrained("./tinyllama-finetuned")
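
# ---------------------------------------------------------------------------
# Optional follow-up (a minimal sketch, not part of the training run above):
# reload the saved LoRA adapter on top of the base model and generate a reply.
# The prompt below is only an illustration; the model name and output
# directory are assumed to match those used earlier in this script.
# ---------------------------------------------------------------------------
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)
finetuned_model = PeftModel.from_pretrained(base_model, "./tinyllama-finetuned")
finetuned_model.eval()

prompt = "What is LoRA fine-tuning?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = finetuned_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))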