codeX-1.0 / tokenizer /apply_tokenizer.py
dorkai's picture
Upload model from GitHub.
b410583
raw
history blame contribute delete
371 Bytes
from tokenizers import ByteLevelBPETokenizer
# Vocabulary and merge-rule files, resolved relative to the current working
# directory (note the leading "./").
_VOCAB_FILE = "./salesforce/codet5-vocab.json"
_MERGES_FILE = "./salesforce/codet5-merges.txt"

# Marker strings passed to the tokenizer's add_special_tokens().
_SPECIAL_TOKENS = ["<pad>", "<s>", "</s>", "<unk>", "<mask>"]

# Demo sentence mixing plain words, an emoji, and several of the markers above.
_SAMPLE_TEXT = "<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>"

# Build the byte-level BPE tokenizer from the two files on disk.
tokenizer = ByteLevelBPETokenizer.from_file(_VOCAB_FILE, _MERGES_FILE)
tokenizer.add_special_tokens(_SPECIAL_TOKENS)

# Encode the demo sentence and show the resulting token strings.
_encoding = tokenizer.encode(_SAMPLE_TEXT)
print(_encoding.tokens)