Add apply_chat_template process
spanish_medica_llm.py +11 -1
spanish_medica_llm.py
CHANGED
@@ -403,6 +403,10 @@ def tokenize(element, tokenizer):
         input_batch.append(input_ids)
     return {"input_ids": input_batch}

+def apply_chat_template(example, tokenizer):
+    example['raw_text'] = tokenizer.apply_chat_template(example['raw_text'], tokenize=False)
+    return example
+
 def splitDatasetInTestValid(dataset):
     """
     """
@@ -705,8 +709,14 @@ def run_finnetuning_process():
     tokenizer = loadSpanishTokenizer()
     medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning())
     print ( tokenizer.apply_chat_template(medicalSpanishDataset[5]['raw_text'], tokenize=False))
-
+
     print('----------------------------------------------------------')
+    medicalSpanishDataset = medicalSpanishDataset.map(apply_chat_template,
+                                num_proc = os.cpu_count(),
+                                fn_kwargs = {'tokenizer':tokenizer},
+                                remove_columns = [col for col in medicalSpanishDataset.features if col not in ['raw_text']],
+                                desc = 'Applying chat template'
+    )
     medicalSpanishDataset = tokenizer.apply_chat_template(medicalSpanishDataset, tokenize=False)
     medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
     train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
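For context, the pattern this commit introduces is a per-example chat-template helper applied with datasets.Dataset.map. Below is a minimal, self-contained sketch of that same pattern; the tokenizer checkpoint and the sample record are illustrative assumptions only and are not taken from this repository, which loads its tokenizer through loadSpanishTokenizer().

import os
from datasets import Dataset
from transformers import AutoTokenizer

def apply_chat_template(example, tokenizer):
    # 'raw_text' is assumed to hold a list of chat messages
    # ({'role': ..., 'content': ...}); with tokenize=False the tokenizer
    # renders the conversation into a single prompt string.
    example['raw_text'] = tokenizer.apply_chat_template(example['raw_text'], tokenize=False)
    return example

# Assumption: any tokenizer that ships a chat template works here.
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')

dataset = Dataset.from_list([
    {'raw_text': [
        {'role': 'user', 'content': '¿Qué es la hipertensión?'},
        {'role': 'assistant', 'content': 'Es la presión arterial elevada de forma sostenida.'},
    ]}
])

dataset = dataset.map(
    apply_chat_template,
    fn_kwargs={'tokenizer': tokenizer},
    num_proc=os.cpu_count(),  # parallel map as in the commit; datasets caps it at the dataset size
    remove_columns=[col for col in dataset.features if col not in ['raw_text']],
    desc='Applying chat template',
)
print(dataset[0]['raw_text'])  # one chat-formatted training string per example

remove_columns drops every field except 'raw_text', so the mapped dataset can feed directly into train_test_split and splitDatasetInTestValid as in run_finnetuning_process above.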