File size: 28,558 Bytes

c923988

import argparse
import json
import os
from pathlib import Path

import numpy as np
import torch
import sacrebleu

from datasets import load_dataset
from torch.utils.data import Dataset, ConcatDataset
from tqdm import tqdm
from transformers import (
    AutoProcessor,
    AutoModel,
    BatchFeature,
    Trainer,
    TrainingArguments,
    StoppingCriteria,
    StoppingCriteriaList,
)
from collections import defaultdict

import soundfile as sf
from datasets import Audio
import random

class BaseAudioDataset(Dataset):
    def __init__(self, processor, split, sampling_rate=16000, debug=False):
        self.processor = processor
        self.training = "train" in split
        self.debug = debug
        self.sampling_rate = sampling_rate
        self.name = ""
        
    def set_dataset_name(self, name):
        self.name = name

    @staticmethod
    def filter_corrupted_files(data, audio_field, text_fields, dataset_name, sampling_rate=16000, debug=True):
        original_size = len(data)
        
        data = data.cast_column(audio_field, Audio(decode=False))
        
        def identify_corrupted_files(example):
            try:
                sf.read(example[audio_field]["path"])
                
                for field in text_fields:
                    if field in example and example[field].replace('"', '') == "":
                        return False
                return True
            except Exception:
                return False
        
        data = data.filter(identify_corrupted_files, num_proc=16)
        validated_size = len(data)
        
        # Audio Decoding
        data = data.cast_column(audio_field, Audio(sampling_rate=sampling_rate, decode=True))
        
        if debug:
            print(f"Dataset: {dataset_name}")
            print(f"Original data nums: {original_size}")
            print(f"After filtering data nums: {validated_size}")
            print(f"Filtering ratio: {validated_size/original_size:.2%}")
            
        return data

    @staticmethod
    def filter_by_audio_length(data, audio_field, min_sec=2, max_sec=20, debug=True):
        original_size = len(data)
        
        def filter_audio_by_length(example):
            try:
                audio = example[audio_field]['array']
                channel = 1
                if hasattr(audio, 'ndim') and audio.ndim > 1:
                    channel = audio.ndim
                    audio = audio.squeeze()
                audio_length = len(audio) / example[audio_field]['sampling_rate'] / channel
                return min_sec <= audio_length <= max_sec
            except Exception as e:
                if debug:
                    print(f"Error : {str(e)[:100]}... - sample excluded")
                return False
        
        data = data.filter(filter_audio_by_length, num_proc=16)
        filtered_size = len(data)
        
        if debug:
            print(f"Before Length Filtering data nums: {original_size}")
            print(f"After Length Filtering data nums: {filtered_size}")
            print(f"Filtering ratio: {filtered_size/original_size:.2%}")
            
        return data

    def prepare_model_inputs(self, audio_array, instruction, answer_text):
        user_message = {
            'role': 'user',
            'content': '<start_of_audio>' + instruction,
        }
        prompt = self.processor.tokenizer.apply_chat_template(
            [user_message], tokenize=False, add_generation_prompt=True, add_bos=True
        )
        
        inputs = self.processor(
            text=prompt, 
            audio=[audio_array], 
            add_special_tokens=False, 
            return_tensors='pt'
        )
        
        answer = f"{answer_text}{ANSWER_SUFFIX}"
        answer_ids = self.processor.tokenizer(answer, add_special_tokens=False, return_tensors='pt').input_ids
        
        if self.debug:
            self.debug = False
            task_type = 'AST' if hasattr(self, 'ast') and self.ast else 'ASR'
            lang_info = f" - {self.lang}" if hasattr(self, 'lang') else ""
            print(f"{task_type}{lang_info}\nPROMPT: {prompt}\nINPUT: {self.processor.decode(inputs.input_ids[0], skip_special_tokens=False)}\nANSWER: {self.processor.decode(answer_ids[0], skip_special_tokens=False)}\n")
            print(f"INPUT_MODE: {inputs.input_modes[0].item()}")
        
        if self.training:
            input_ids = torch.cat([inputs.input_ids, answer_ids], dim=1)
            labels = torch.full_like(input_ids, _IGNORE_INDEX)
            labels[:, -answer_ids.shape[1]:] = answer_ids
            padding = torch.zeros((inputs.token_type_ids.shape[0], answer_ids.shape[1]))
            token_type_ids = torch.cat([inputs.token_type_ids, padding], dim=1)
        else:
            input_ids = inputs.input_ids
            labels = answer_ids
            token_type_ids = inputs.token_type_ids
        
        return {
            'input_ids': input_ids,
            'labels': labels,
            'token_type_ids': token_type_ids,
            'input_audio_embeds': inputs.input_audio_embeds,
            'audio_embed_sizes': inputs.audio_embed_sizes,
            'input_modes': inputs.input_modes,
        }

# CoVoST2 Dataset Class
class CoVoSTDataset(BaseAudioDataset):
    def __init__(self, processor, data_dir, split, ast=False,
                 lang=("en_ko", "Korean"), sampling_rate=16000, debug=False):
        super().__init__(processor, split, sampling_rate, debug)
        
        self.set_dataset_name("CoVoST")
        self.ast = ast
        self.lang = lang[0]
        
        self.data = load_dataset("junnei/covost2", 
                           lang[0],
                           data_dir=data_dir, 
                           split=split,
                           trust_remote_code=True
                           )

        text_fields = ["sentence", "translation"] if ast else ["sentence"]
        self.data = self.filter_corrupted_files(self.data, "audio", text_fields, "CoVoST")
        
        # (Optional) Audio length Filtering
        self.data = self.filter_by_audio_length(self.data, "audio")

        # Instruction Setting
        self.instruction = random.choice(INSTRUCTION["ast"]).format(lang[1]) if ast else random.choice(INSTRUCTION["asr"])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data = self.data[idx]
        
        if self.ast:
            answer_text = data["translation"]
        else:
            answer_text = data["sentence"].replace('"', '')
        
        return self.prepare_model_inputs(
            data["audio"]["array"],
            self.instruction,
            answer_text
        )

# Zeroth Korean Dataset Class
class ZerothKoreanDataset(BaseAudioDataset):
    def __init__(self, processor, split, sampling_rate=16000, debug=False):
        super().__init__(processor, split, sampling_rate, debug)
        
        self.set_dataset_name("Zeroth")
        # only ASR
        self.ast = False
        self.lang = "ko"
        
        # load dataset
        self.data = load_dataset("Bingsu/zeroth-korean",
                            split=split,
                            trust_remote_code=True
                            )
        
        # (Optional) Audio length Filtering
        self.data = self.filter_by_audio_length(self.data, "audio")

        # Instruction Setting
        self.instruction = random.choice(INSTRUCTION["asr"])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data = self.data[idx]
        
        # Zeroth Korean is only for ASR
        answer_text = data["text"].replace('"', '')
        
        return self.prepare_model_inputs(
            data["audio"]["array"],
            self.instruction,
            answer_text
        )

# Libri Speech Dataset Class
class LibriSpeechDataset(BaseAudioDataset):
    def __init__(self, processor, subset, split, sampling_rate=16000, debug=False):
        super().__init__(processor, split, sampling_rate, debug)
        
        self.set_dataset_name(f"LibriSpeech_{subset}")
        # only ASR
        self.ast = False
        self.lang = "en"
        
        # load dataset
        self.data = load_dataset("fixie-ai/librispeech_asr",
                            subset,
                            split=split,
                            trust_remote_code=True
                            )
        
        # (Optional) Audio length Filtering
        self.data = self.filter_by_audio_length(self.data, "audio")
            
        # Instruction Setting
        self.instruction = random.choice(INSTRUCTION["asr"])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data = self.data[idx]
        
        # Libri Speech is only for ASR
        answer_text = data["text"].replace('"', '')
        
        return self.prepare_model_inputs(
            data["audio"]["array"],
            self.instruction,
            answer_text
        )

# Fleurs Dataset Class
class FleursDataset(BaseAudioDataset):
    def __init__(self, processor, split, source_lang, target_lang=None, 
                 mode="asr", sampling_rate=16000, debug=False):
        super().__init__(processor, split, sampling_rate, debug)
        
        self.set_dataset_name("Fleurs")
        # Mode Setting (ASR or AST)
        if mode not in ["asr", "ast"]:
            raise ValueError("mode must be 'asr' or 'ast'.")
        
        self.mode = mode
        self.ast = (mode == "ast")
        self.source_lang = source_lang
        
        # Language name mapping (expand if needed)
        self.lang_names = {
            'en_us': 'English', 'ko_kr': 'Korean'
        }
        
        # load dataset - source language dataset
        self.data = load_dataset("google/fleurs",
                            source_lang,
                            split=split,
                            trust_remote_code=True
                            )

        # (Optional) Audio length Filtering
        self.data = self.filter_by_audio_length(self.data, "audio")
        
        # When AST mode, load target language dataset.
        if self.ast:
            if target_lang is None:
                raise ValueError("AST mode requires target_lang.")
                
            self.target_lang = target_lang
            self.lang = f"{source_lang}_{target_lang}"
            
            # load dataset - target language dataset (for translation)
            target_data = load_dataset("google/fleurs",
                                target_lang,
                                split=split,
                                trust_remote_code=True
                                )
            
            source_dict = {item['id']: item for item in self.data}
            target_dict = {item['id']: item for item in target_data}
            
            # only Common ID, add translation fields
            common_ids = set(source_dict.keys()) & set(target_dict.keys())
            print(f"FLEURS AST Common data filtering: {len(self.data)} -> {len(common_ids)}")
            self.data = [
                {**source_dict[id], 'translation': target_dict[id]['transcription']}
                for id in common_ids
            ]

            # Instruction Setting - use target language name
            target_lang_name = self.lang_names.get(target_lang, target_lang.capitalize())
            self.instruction = random.choice(INSTRUCTION["ast"]).format(target_lang_name)
        else:
            # ASR mode
            self.lang = source_lang
            self.instruction = random.choice(INSTRUCTION["asr"])

        if self.debug:
            print(f"FLEURS dataset loaded: {self.mode.upper()} mode")
            print(f"source lang: {source_lang} ({self.lang_names.get(source_lang, source_lang)})")
            if self.ast:
                print(f"target lang: {target_lang} ({self.lang_names.get(target_lang, target_lang)})")
            print(f"dataset size: {len(self.data)}")
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data = self.data[idx]
        audio_array = data["audio"]["array"]

        if self.ast:
            answer_text = data["translation"]
        else:
            answer_text = data["transcription"]
        
        return self.prepare_model_inputs(
            audio_array,
            self.instruction,
            answer_text
        )

def covost_collate_fn(batch):
    input_ids_list = []
    labels_list = []
    token_type_ids_list = []
    input_audio_embeds_list = []
    audio_embed_sizes_list = []
    audio_attention_mask_list = []
    input_modes_list = []
    for inputs in batch:
        input_ids_list.append(inputs['input_ids'][0])
        labels_list.append(inputs['labels'][0])
        token_type_ids_list.append(inputs['token_type_ids'][0])
        input_audio_embeds_list.append(inputs['input_audio_embeds'])
        audio_embed_sizes_list.append(inputs['audio_embed_sizes'])
        audio_attention_mask_list.append(
            inputs['input_audio_embeds'].new_full((inputs['input_audio_embeds'].size(1),), True, dtype=torch.bool)
        )
        input_modes_list.append(inputs['input_modes'])

    try:
        token_type_ids = pad_sequence(token_type_ids_list, padding_side='left', padding_value=0)
        input_ids = pad_sequence(input_ids_list, padding_side='left', padding_value=0)
        labels = pad_sequence(labels_list, padding_side='left', padding_value=0)
        audio_attention_mask = (
            pad_sequence(audio_attention_mask_list, padding_side='left', padding_value=False)
            if len(audio_attention_mask_list) > 1
            else None
        )
    except Exception as e:
        print(e)
        print(input_ids_list)
        print(labels_list)
        raise
    attention_mask = (input_ids != 0).long()
    input_audio_embeds = cat_with_pad(input_audio_embeds_list, dim=0)
    audio_embed_sizes = torch.cat(audio_embed_sizes_list)
    input_modes = torch.cat(input_modes_list)
    
    return BatchFeature(
        {
            'input_ids': input_ids,
            'labels': labels,
            'token_type_ids': token_type_ids,
            'attention_mask': attention_mask,
            'input_audio_embeds': input_audio_embeds,
            'audio_embed_sizes': audio_embed_sizes,
            'audio_attention_mask': audio_attention_mask,
            'input_modes': input_modes,
        }
    )

def pad_sequence(sequences, padding_side='left', padding_value=0):
    """
    Pad a list of sequences to the same length.
    sequences: list of tensors in [seq_len, *] shape
    """
    assert padding_side in ['right', 'left']
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    max_len = max(len(seq) for seq in sequences)
    batch_size = len(sequences)
    output = sequences[0].new_full((batch_size, max_len) + trailing_dims, padding_value)
    for i, seq in enumerate(sequences):
        length = seq.size(0)
        if padding_side == 'right':
            output.data[i, :length] = seq
        else:
            output.data[i, -length:] = seq
    return output

def cat_with_pad(tensors, dim, padding_value=0):
    """
    cat along dim, while pad to max for all other dims
    """
    ndim = tensors[0].dim()
    assert all(
        t.dim() == ndim for t in tensors[1:]
    ), 'All tensors must have the same number of dimensions'

    out_size = [max(t.shape[i] for t in tensors) for i in range(ndim)]
    out_size[dim] = sum(t.shape[dim] for t in tensors)
    output = tensors[0].new_full(out_size, padding_value)

    index = 0
    for t in tensors:
        # Create a slice list where every dimension except dim is full slice
        slices = [slice(0, t.shape[d]) for d in range(ndim)]
        # Update only the concat dimension slice
        slices[dim] = slice(index, index + t.shape[dim])

        output[slices] = t
        index += t.shape[dim]

    return output

def count_parameters_by_module(model):
    # dictionary for parameters number by modules
    module_params = defaultdict(lambda: {"total": 0, "trainable": 0})
    
    # all params
    total_params = 0
    total_trainable_params = 0
    
    # Check Embedding Token masks
    embedding_masks = {}
    for name, param in model.named_parameters():
        if 'embed_tokens.weight' in name and hasattr(param, '_backward_hooks') and param._backward_hooks:
            # check if params has embedding_grad_mask_hook
            for hook_id, hook_fn in param._backward_hooks.items():
                if hook_fn.__code__.co_name == 'embedding_grad_mask_hook':
                    # Accessing mask variables in the closure of hook functions
                    for cell in hook_fn.__closure__ or []:
                        if isinstance(cell.cell_contents, torch.Tensor) and cell.cell_contents.dtype == torch.bool:
                            # check mask tensor
                            embedding_masks[name] = ~cell.cell_contents  # True : Trainable
                 
    # Count params by modules
    for name, param in model.named_parameters():
        # extracts top module_name
        module_name = name.split('.')[0]
        param_count = param.numel()
        
        module_params[module_name]["total"] += param_count
        total_params += param_count
        
        if param.requires_grad:
            # Only count for real trainable params. (with masks)
            if name in embedding_masks:
                trainable_count = embedding_masks[name].sum().item()
                module_params[module_name]["trainable"] += trainable_count
                total_trainable_params += trainable_count
            else:
                module_params[module_name]["trainable"] += param_count
                total_trainable_params += param_count
    
    print(f"All Params: {total_params:,}")
    print(f"Trainable Params: {total_trainable_params:,} ({total_trainable_params/total_params*100:.2f}%)")
    print("\nParams by Module:")
    
    for module_name, counts in sorted(module_params.items()):
        trainable_percentage = counts["trainable"] / counts["total"] * 100 if counts["total"] > 0 else 0
        total_percentage = counts["total"] / total_params * 100
        
        print(f"- {module_name}:")
        print(f"  Total: {counts['total']:,} ({total_percentage:.2f}% of model)")
        print(f"  Trainable: {counts['trainable']:,} ({trainable_percentage:.2f}% of module)")
    
    return module_params

def create_model(model_name_or_path, revision="main", use_flash_attention = False):
    model = AutoModel.from_pretrained(
        model_name_or_path,
        revision=revision,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation="flash_attention_2" if use_flash_attention else "eager",
        trust_remote_code=True,
    )
    
    # Set use_cache to False after model loaded
    model.config.use_cache = False

    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False
    
    model.set_lora_adapter('speech')
    model.to(torch.bfloat16)
    
    # (Optional) unfreeze audio_tower parameters
    #for param in model.audio_tower.parameters():
    #    param.requires_grad = True

    # Only unfreeze audio_projector parameters
    for param in model.audio_projector.parameters():
        param.requires_grad = True

    # (Optional) unfreeze audio embed_tokens
    train_embed = True
    if train_embed:
        embed_tokens = model.language_model.model.model.embed_tokens
        
        embed_tokens.weight.requires_grad = False

        # Added Speech token IDs (only this tokens be trainable)
        trainable_token_ids = [256001, 256002]

        embed_tokens.weight.requires_grad = True
        mask = torch.ones_like(embed_tokens.weight, dtype=torch.bool)
        mask[trainable_token_ids] = False  # Trainable Tokens are False (unfreeze), else True (freeze)

        # backward hook, with gradient masking
        def embedding_grad_mask_hook(grad):
            return grad.masked_fill(mask, 0)

        embed_tokens.weight.register_hook(embedding_grad_mask_hook)

        model.language_model.model.model.embed_tokens = embed_tokens
        
    count_parameters_by_module(model)

    return model

os.environ["TOKENIZERS_PARALLELISM"] = "false" 

INSTRUCTION = {
    "ast": [
        "Translate the audio to {0}.",
        "Translate the audio clip into {0}.",
        "Based on the attached audio, generate a comprehensive {0} translation of the spoken content.",
        "Translate the provided audio file into {0}.",
        "Convert the audio speech to {0} text.",
        "Write an {0} translation of the audio file.",
        "Translate spoken words from the audio into {0}.",
        "Create an {0} version of the audio content.",
        "Produce an accurate {0} translation of the audio.",
        "Extract speech from the audio and translate it to {0}.",
        "Turn the audio into readable {0} text.",
        "Write all spoken content from the audio in {0}.",
        "Generate an {0} translation of the speech in the file.",
        "Convert the recording into {0} text.",
        "Accurately translate the audio recording to {0}.",
        "Write down dialogue from the given audio in {0}.",
        "Translate all speech in this audio file to {0}.",
        "Create an accurate {0} version of the speech.",
        "Perform a complete {0} translation of the audio."
    ],
    "asr": [
        "Transcribe the audio clip into text.",
        "Based on the attached audio, generate a comprehensive text transcription of the spoken content.",
        "Transcribe the provided audio file into text.",
        "Convert the audio speech to text.",
        "Write a transcript of the audio file.",
        "Transcribe spoken words from the audio.",
        "Create a text version of the audio content.",
        "Produce a verbatim transcript of the audio.",
        "Extract and transcribe speech from the audio.",
        "Turn the audio into readable text.",
        "Write all spoken words from the audio.",
        "Generate a transcript of the speech in the file.",
        "Convert the recording into a text transcript.",
        "Accurately transcribe the audio recording.",
        "Write down dialogue from the given audio.",
        "Transcribe all speech in this audio file.",
        "Create an accurate text version of the speech.",
        "Perform a complete transcription of the audio."
    ],
}

ANSWER_SUFFIX = "<end_of_turn>"
_IGNORE_INDEX = -100

model_name_or_path = 'junnei/gemma-3-4b-it-speech'
use_flash_attention = True

output_dir = '/workspace/output'
batch_size = 128
batch_size_per_gpu = 32
learning_rate = 4.0e-5 # 1.0e-4 for fine-tuning
wd = 0.01
num_train_epochs = 5

revision = "main" #"v1.0"

processor = AutoProcessor.from_pretrained(
    model_name_or_path,
    revision=revision,
    trust_remote_code=True,
)

model = create_model(
    model_name_or_path,
    revision=revision,
    use_flash_attention=use_flash_attention,
)

train_datasets = []

# Covost ASR mode (English -> English text)
covost_asr_dataset = CoVoSTDataset(
    processor=processor,
    data_dir="/workspace/CommonVoice/EN",
    split="train",
    ast=False,
    lang=("en_ko", "Korean")
)
train_datasets.append(covost_asr_dataset)

# Covost AST mode (English -> Korean text)
covost_dataset = CoVoSTDataset(
    processor=processor,
    data_dir="/workspace/CommonVoice/EN",
    split="train",
    ast=True,
    lang=("en_ko", "Korean")
)
train_datasets.append(covost_dataset)

# Libri Speech Clean ASR mode (English -> English text)
libri_speech_clean = LibriSpeechDataset(
    processor=processor,
    subset="clean",
    split="train.360"
)
train_datasets.append(libri_speech_clean)

# Libri Speech Other ASR mode (English -> English text)
libri_speech_other = LibriSpeechDataset(
    processor=processor,
    subset="other",
    split="train.500"
)
train_datasets.append(libri_speech_other)

# Fleurs ASR mode (English -> English text)
en_asr_fleurs = FleursDataset(
    processor=processor,
    split="train",
    source_lang="en_us",  # English
    mode="asr"
)
train_datasets.append(en_asr_fleurs)

# Fleurs AST mode (English -> Korean text)
en_ko_ast_fleurs = FleursDataset(
    processor=processor,
    split="train",
    source_lang="en_us",  # English
    target_lang="ko_kr",  # Korean
    mode="ast"
)
train_datasets.append(en_ko_ast_fleurs)

# Covost ASR mode (Korean -> Korean text)
covost_ko_asr_dataset = CoVoSTDataset(
    processor=processor,
    data_dir="/workspace/CommonVoice/ko",
    split="train",
    ast=False,
    lang=("ko_en", "English")
)
train_datasets.append(covost_ko_asr_dataset)

# Covost AST mode (Korean -> English text)
covost_ko_dataset = CoVoSTDataset(
    processor=processor,
    data_dir="/workspace/CommonVoice/ko",
    split="train",
    ast=True,
    lang=("ko_en", "English")
)
train_datasets.append(covost_ko_dataset)

# Zeroth ASR mode (Korean -> Korean text)
ko_asr_zeroth = ZerothKoreanDataset(
    processor=processor,
    split="train"
)
train_datasets.append(ko_asr_zeroth)

# Fleurs ASR mode (Korean -> Korean text)
ko_asr_fleurs = FleursDataset(
    processor=processor,
    split="train",
    source_lang="ko_kr",  # Korean
    mode="asr"
)
train_datasets.append(ko_asr_fleurs)

# Fleurs AST mode (Korean -> English text)
ko_en_ast_fleurs = FleursDataset(
    processor=processor,
    split="train",
    source_lang="ko_kr",  # Korean
    target_lang="en_us",  # English
    mode="ast"
)
train_datasets.append(ko_en_ast_fleurs)

print("Count Num of Datasets", len(train_datasets))
print([len(dataset) for dataset in train_datasets])

# ConcatDataset
train_dataset = ConcatDataset(train_datasets) if len(train_datasets) > 1 else train_datasets[0]
print("Count Length of Datas", len(train_dataset))

# Check GPUs
num_gpus = torch.cuda.device_count()
print(f'training on {num_gpus} GPUs')

assert (
    batch_size % (num_gpus * batch_size_per_gpu) == 0
), 'Batch size must be divisible by the number of GPUs'
gradient_accumulation_steps = batch_size // (num_gpus * batch_size_per_gpu)

# hard coded training args
training_args = TrainingArguments(
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size_per_gpu,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim='adamw_torch',
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_epsilon=1e-7,
    learning_rate=learning_rate,
    weight_decay=wd,
    max_grad_norm=1.0,
    lr_scheduler_type='cosine',
    warmup_steps=50,
    logging_steps=50,
    output_dir=output_dir,
    save_strategy='no',
    save_total_limit=10,
    save_only_model=True,
    bf16=True,
    fp16=False,
    remove_unused_columns=False,
    report_to='none',
    deepspeed=None,
    disable_tqdm=False,
    dataloader_num_workers=4,
    ddp_find_unused_parameters=True,
)

out_path = Path(training_args.output_dir)
out_path.mkdir(parents=True, exist_ok=True)

# create optimizer only for trainable params
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=learning_rate,
    weight_decay=wd,
    betas=(0.9, 0.95),
    eps=1e-7,
)

# Trainer Setting
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=covost_collate_fn,
    train_dataset=train_dataset,
    optimizers=(optimizer, None),
)

trainer.train()

import shutil

# 1. Save LoRA Adapter
model.language_model.model.save_pretrained(output_dir)

# 1-1. Delete Markdown file
markdown_file = os.path.join(output_dir, "README.md")
if os.path.exists(markdown_file):
    os.remove(markdown_file)

# 2. Save entire model
model.save_pretrained(output_dir)

# 3. Cleanup Memory
del model
del trainer
__import__('gc').collect()
torch.cuda.empty_cache()

from huggingface_hub import HfApi, login, create_repo, Repository, upload_folder

upload_dir = "/workspace/upload"

# 4. Clone Repo
repo_id = "junnei/gemma-3-4b-it-speech"
branch_name = "main"  # 새 브랜치 이름

repo = Repository(local_dir=upload_dir, clone_from = repo_id)
repo.git_checkout(branch_name, create_branch_ok=True)

# 4-1. Move Trained model to Repo
for item in os.listdir(output_dir):
    s = os.path.join(output_dir, item)
    d = os.path.join(upload_dir, item)
    if os.path.isdir(s):
        shutil.copytree(s, d, dirs_exist_ok=True)
    else:
        shutil.copy2(s, d)