Update app.py
app.py
CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 from transformers import AutoTokenizer
+import random
 
 # List of available tokenizers
 tokenizers = [
@@ -10,10 +11,50 @@ tokenizers = [
     "xlnet-base-cased"
 ]
 
+def generate_colored_html(tokens, decoded_tokens):
+    colors = ["#FFDDC1", "#C1FFD4", "#D4C1FF", "#FFC1C1", "#C1FFFD"]
+    text_color = "#000000"
+    last_color = None
+    background_color = "#F0F0F0"
+    html_tokens = []
+
+    special_token_replacements = {
+        '<pad>': '[Padding]',
+        '<s>': '[Start of Sentence]',
+        '</s>': '[End of Sentence]',
+        '<unk>': '[Unknown]',
+        '<mask>': '[Masked]',
+        '[CLS]': '[Class]',
+        '[SEP]': '[Separator]'
+    }
+
+    for i, (token, decoded_token) in enumerate(zip(tokens, decoded_tokens)):
+        for special_token, replacement in special_token_replacements.items():
+            if special_token in decoded_token:
+                decoded_token = decoded_token.replace(special_token, replacement)
+
+        hover_info = f"Token Index: {i}, Token: {decoded_token}, Token ID: {token}"
+
+        if '\n' in decoded_token:
+            color = random.choice([c for c in colors if c != last_color])
+            last_color = color
+            newline_representation = f"<span style='background-color: {color}; color: {text_color};' title='{hover_info}'>[NEWLINE]</span><br>"
+            html_tokens.append(newline_representation)
+        else:
+            color = random.choice([c for c in colors if c != last_color])
+            last_color = color
+            html_tokens.append(f'<span style="background-color: {color}; color: {text_color}; text-decoration: none;" title="{hover_info}">{decoded_token}</span>')
+
+    html_output = " ".join(html_tokens)
+    html_output = f'<div style="background-color: {background_color}; padding: 10px;">{html_output}</div>'
+    return html_output
+
 def tokenize_text(text, tokenizer_name):
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    tokens = tokenizer.
-
+    tokens = tokenizer.encode(text, add_special_tokens=True)
+    decoded_tokens = [tokenizer.decode(token) for token in tokens]
+    html_output = generate_colored_html(tokens, decoded_tokens)
+    return html_output
 
 def compare_tokenizers(text, selected_tokenizers):
     results = {}
@@ -28,10 +69,11 @@ iface = gr.Interface(
         gr.Textbox(label="Enter text to tokenize"),
         gr.CheckboxGroup(choices=tokenizers, label="Select tokenizers")
     ],
-    outputs=gr.
+    outputs=gr.HTML(label="Tokenization Results"),
     title="Tokenizer Comparison",
     description="Compare tokenization results from different tokenizers.",
 )
 
 # Launch the app
-iface.launch()
+iface.launch()
+
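For reference, a minimal standalone sketch (not part of the commit) of the per-token decoding pattern the updated tokenize_text relies on: each token id is decoded individually so it can be coloured separately by generate_colored_html. It assumes the transformers package is installed; the model name is simply one entry from the Space's tokenizers list.

from transformers import AutoTokenizer

# Sketch only: decode each token id on its own, as the new tokenize_text does,
# so every id maps to one display string.
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
token_ids = tokenizer.encode("Hello world", add_special_tokens=True)
decoded = [tokenizer.decode(t) for t in token_ids]
print(list(zip(token_ids, decoded)))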