AlGe committed on
Commit
d0428be
·
verified ·
1 Parent(s): 6a409cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -4
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  from transformers import AutoTokenizer
 
3
 
4
  # List of available tokenizers
5
  tokenizers = [
@@ -10,10 +11,50 @@ tokenizers = [
10
  "xlnet-base-cased"
11
  ]
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def tokenize_text(text, tokenizer_name):
14
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
15
- tokens = tokenizer.tokenize(text)
16
- return " ".join(tokens)
 
 
17
 
18
  def compare_tokenizers(text, selected_tokenizers):
19
  results = {}
@@ -28,10 +69,11 @@ iface = gr.Interface(
28
  gr.Textbox(label="Enter text to tokenize"),
29
  gr.CheckboxGroup(choices=tokenizers, label="Select tokenizers")
30
  ],
31
- outputs=gr.JSON(label="Tokenization Results"),
32
  title="Tokenizer Comparison",
33
  description="Compare tokenization results from different tokenizers.",
34
  )
35
 
36
  # Launch the app
37
- iface.launch()
 
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer
3
+ import random
4
 
5
  # List of available tokenizers
6
  tokenizers = [
 
11
  "xlnet-base-cased"
12
  ]
13
 
14
def generate_colored_html(tokens, decoded_tokens):
    """Render tokens as a row of colour-highlighted HTML spans.

    Args:
        tokens: Token ids, one per decoded token. Shown only in the hover
            tooltip.
        decoded_tokens: The text of each token, in the same order as
            ``tokens``.

    Returns:
        An HTML string: one ``<span>`` per token, joined by single spaces
        and wrapped in a padded ``<div>``. Each span gets a background
        colour picked at random, never the same as the previous span's.
        Tokens containing a newline are displayed as ``[NEWLINE]`` followed
        by ``<br>``. Well-known special tokens are shown under a readable
        label.
    """
    # Function-scope import keeps this block self-contained.
    import html

    colors = ["#FFDDC1", "#C1FFD4", "#D4C1FF", "#FFC1C1", "#C1FFFD"]
    text_color = "#000000"
    background_color = "#F0F0F0"

    special_token_replacements = {
        '<pad>': '[Padding]',
        '<s>': '[Start of Sentence]',
        '</s>': '[End of Sentence]',
        '<unk>': '[Unknown]',
        '<mask>': '[Masked]',
        '[CLS]': '[Class]',
        '[SEP]': '[Separator]'
    }

    last_color = None
    html_tokens = []

    for i, (token, decoded_token) in enumerate(zip(tokens, decoded_tokens)):
        # str.replace is a no-op when the marker is absent, so no
        # membership test is needed first.
        for special_token, replacement in special_token_replacements.items():
            decoded_token = decoded_token.replace(special_token, replacement)

        # Token text comes from user input: escape it so characters such as
        # <, &, " and ' cannot break out of the title attribute or be
        # interpreted as markup. Escaping happens after the special-token
        # substitution so that markers like "<s>" are still recognised.
        hover_info = html.escape(
            f"Token Index: {i}, Token: {decoded_token}, Token ID: {token}"
        )

        # Exclude the previous colour so neighbouring tokens stay distinct.
        color = random.choice([c for c in colors if c != last_color])
        last_color = color

        if '\n' in decoded_token:
            html_tokens.append(
                f"<span style='background-color: {color}; color: {text_color};' "
                f"title='{hover_info}'>[NEWLINE]</span><br>"
            )
        else:
            html_tokens.append(
                f'<span style="background-color: {color}; color: {text_color}; '
                f'text-decoration: none;" title="{hover_info}">'
                f'{html.escape(decoded_token)}</span>'
            )

    html_output = " ".join(html_tokens)
    return f'<div style="background-color: {background_color}; padding: 10px;">{html_output}</div>'
51
+
52
  def tokenize_text(text, tokenizer_name):
53
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
54
+ tokens = tokenizer.encode(text, add_special_tokens=True)
55
+ decoded_tokens = [tokenizer.decode(token) for token in tokens]
56
+ html_output = generate_colored_html(tokens, decoded_tokens)
57
+ return html_output
58
 
59
  def compare_tokenizers(text, selected_tokenizers):
60
  results = {}
 
69
  gr.Textbox(label="Enter text to tokenize"),
70
  gr.CheckboxGroup(choices=tokenizers, label="Select tokenizers")
71
  ],
72
+ outputs=gr.HTML(label="Tokenization Results"),
73
  title="Tokenizer Comparison",
74
  description="Compare tokenization results from different tokenizers.",
75
  )
76
 
77
  # Launch the app
78
+ iface.launch()
79
+