File size: 1,119 Bytes
a3e05e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import sys
sys.path.append('.')
from modules.tokenizer.tokenizer import get_tokenizer_and_extra_tokens



if __name__ == '__main__':
    tokenizer, extra_tokens = get_tokenizer_and_extra_tokens()

    # Round-trip smoke test: each probe string must encode to exactly one
    # known token id, and that id must decode back to the original string.
    round_trips = [
        ("user", 1495),
        ("0", 501),
        ("1", 503),
        ("assistant", 110866),
        ("audio", 26229),
    ]
    for text, token_id in round_trips:
        assert tokenizer.encode(text) == [token_id]
        assert tokenizer.decode([token_id]) == text

    # Special-token ids exposed as attributes on the extra_tokens object.
    expected_extra_ids = {
        "msg_end": 260,
        "user_msg_start": 261,
        "assistant_msg_start": 262,
        "name_end": 272,
        "media_begin": 273,
        "media_content": 274,
        "media_end": 275,
    }
    for attr_name, expected_id in expected_extra_ids.items():
        assert getattr(extra_tokens, attr_name) == expected_id

    # Direct token-string -> id lookups (newline byte, EOS, first extra id).
    probe_tokens = ['<0x0A>', '</s>', '[extra_id_0]']
    assert [tokenizer.convert_tokens_to_ids(tok) for tok in probe_tokens] == [14, 1, 260]

    print("All tests passed!")