inoid committed on
Commit
6b7d42e
·
1 Parent(s): 78455b7

Add appy_chat_template process

Browse files
Files changed (1) hide show
  1. spanish_medica_llm.py +11 -1
spanish_medica_llm.py CHANGED
@@ -403,6 +403,10 @@ def tokenize(element, tokenizer):
403
  input_batch.append(input_ids)
404
  return {"input_ids": input_batch}
405
 
 
 
 
 
406
  def splitDatasetInTestValid(dataset):
407
  """
408
  """
@@ -705,8 +709,14 @@ def run_finnetuning_process():
705
  tokenizer = loadSpanishTokenizer()
706
  medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning())
707
  print ( tokenizer.apply_chat_template(medicalSpanishDataset[5]['raw_text'], tokenize=False))
708
-
709
  print('----------------------------------------------------------')
 
 
 
 
 
 
710
  medicalSpanishDataset = tokenizer.apply_chat_template(medicalSpanishDataset, tokenize=False)
711
  medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
712
  train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
 
403
  input_batch.append(input_ids)
404
  return {"input_ids": input_batch}
405
 
406
def apply_chat_template(example, tokenizer):
    """Render one dataset row's conversation through the tokenizer's chat template.

    Intended for use with ``datasets.Dataset.map`` (see the caller, which passes
    ``fn_kwargs={'tokenizer': tokenizer}``).

    Parameters
    ----------
    example : dict
        A single dataset row. Its ``'raw_text'`` field is assumed to hold a
        conversation structure accepted by ``tokenizer.apply_chat_template``
        (typically a list of role/content message dicts) — TODO confirm the
        upstream format produced by ``applyChatInstructFormat``.
    tokenizer :
        Any object exposing ``apply_chat_template``; in this project a
        ``transformers`` tokenizer.

    Returns
    -------
    dict
        The same row, with ``'raw_text'`` replaced in place by the templated
        string (``tokenize=False`` keeps it as text rather than token ids).
    """
    example['raw_text'] = tokenizer.apply_chat_template(example['raw_text'], tokenize=False)
    return example
410
  def splitDatasetInTestValid(dataset):
411
  """
412
  """
 
709
  tokenizer = loadSpanishTokenizer()
710
  medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning())
711
  print ( tokenizer.apply_chat_template(medicalSpanishDataset[5]['raw_text'], tokenize=False))
712
+
713
  print('----------------------------------------------------------')
714
+ medicalSpanishDataset = medicalSpanishDataset.map(apply_chat_template,
715
+ num_proc = os.cpu_count(),
716
+ fn_kwargs = {'tokenizer':tokenizer},
717
+ remove_columns = [col for col in medicalSpanishDataset.features if col not in ['raw_text']],
718
+ desc = 'Applying chat template'
719
+ )
720
  medicalSpanishDataset = tokenizer.apply_chat_template(medicalSpanishDataset, tokenize=False)
721
  medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
722
  train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )