import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
# Authenticate with Hugging Face (expects HUGGING_FACE_HUB_TOKEN to be set in the environment)
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
login(token=hf_token)
# Load Pretrained TinyLlama Model & Tokenizer
MODEL_NAME = "vv876803/tinyllama-victor" # Replace with your TinyLlama model name if different
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token  # Llama-style tokenizers ship without a pad token; reuse EOS for padding
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)
# Prepare LoRA for Efficient Training
peft_config = LoraConfig(
r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
)
model = prepare_model_for_kbit_training(model)  # Freeze base weights and enable gradient checkpointing before attaching LoRA
model = get_peft_model(model, peft_config)
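# Optional: report how few parameters LoRA actually trains compared with the full model
# (print_trainable_parameters is a PEFT utility on the wrapped model)
model.print_trainable_parameters()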
# Load Dataset (Example: OpenAssistant OASST1)
dataset = load_dataset("OpenAssistant/oasst1", split="train[:10%]")
# Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)  # Drop the raw string columns so only token fields reach the data collator
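# Optional: inspect one tokenized example to confirm the fields the collator expects
print(tokenized_datasets[0].keys())  # should include "input_ids" and "attention_mask"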
# Data Collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=False # Causal Language Modeling (not MLM)
)
# Training Arguments (sized for CPU-only training, e.g. a free CPU tier)
training_args = TrainingArguments(
output_dir="./tinyllama-finetuned", # Directory to save the fine-tuned model
evaluation_strategy="epoch", # Evaluate after each epoch
save_strategy="epoch", # Save after each epoch
per_device_train_batch_size=2, # Small batch size due to limited resources
per_device_eval_batch_size=2,
num_train_epochs=3,
logging_dir="./logs", # Log directory
logging_steps=10,
save_total_limit=2, # Limit the number of saved checkpoints
fp16=False, # Use float32 as we're using CPU
)
# Hold out a small validation split so the per-epoch evaluation has data to run on
split_datasets = tokenized_datasets.train_test_split(test_size=0.1)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    data_collator=data_collator,
)
# Train the model
trainer.train()
# Save the fine-tuned LoRA adapter and tokenizer (PEFT saves only the adapter weights)
model.save_pretrained("./tinyllama-finetuned")
tokenizer.save_pretrained("./tinyllama-finetuned")
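# Optional sanity check: generate a short completion with the fine-tuned adapter
# to confirm the model loads and runs on CPU (the prompt below is just an example)
prompt = "Hello, what can you help me with?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))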