aiqtech commited on
Commit
006e855
·
verified ·
1 Parent(s): 0eddacc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +1752 -0
app.py ADDED
@@ -0,0 +1,1752 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer
2
+ from flask import Flask, request, render_template_string, jsonify
3
+ import hashlib
4
+ import sys
5
+ import math
6
+ import os
7
+ import time
8
+
9
# Flask application object for this module.
app = Flask(__name__)
# Set maximum content length to 25MB to handle larger files
app.config['MAX_CONTENT_LENGTH'] = 25 * 1024 * 1024

# Folder registered with the app as UPLOAD_FOLDER; created at import time.
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
# exist_ok=True replaces the former "check, then create" sequence, which
# could raise FileExistsError if the directory appeared between the two steps.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
18
+
19
# Predefined tokenizer models with aliases.
# Each key is the model ID accepted by load_tokenizer(); 'name' is the string
# passed to AutoTokenizer.from_pretrained() and 'alias' is a human-readable label.
TOKENIZER_MODELS = {
    'qwen3': {
        'name': 'Qwen/Qwen3-0.6B',
        'alias': 'Qwen3',
    },
    'llama4': {
        'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
        'alias': 'Llama 4',
    },
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
        'alias': 'Mistral Small 3.1',
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B',
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1',
    },
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        # The alias previously read 'QWQ 32B', which contradicted both the
        # key and the model name of this entry.
        'alias': 'Qwen2.5 72B',
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B',
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B',
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased',
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2',
    },
}
62
+
63
# Loaded tokenizers for the predefined TOKENIZER_MODELS entries, keyed by model ID
tokenizers = dict()
# Most recent load-error message per model ID or name
custom_model_errors = dict()
# Custom tokenizers, stored as (tokenizer, load timestamp) pairs
custom_tokenizers = dict()
# Tokenizer info dictionaries, keyed by model ID or name
tokenizer_info_cache = dict()
# Lifetime of a cached custom tokenizer: 1 hour, expressed in seconds
CACHE_EXPIRATION = 60 * 60
73
+
74
def get_tokenizer_info(tokenizer):
    """
    Extract useful information from a tokenizer.

    Args:
        tokenizer: Object that may expose ``vocab_size`` or ``get_vocab()``,
            ``model_max_length`` and the usual special-token attributes
            (``pad_token``, ``eos_token``, ...). Missing attributes are skipped.

    Returns:
        A dictionary that may contain ``vocab_size``, ``model_max_length``,
        ``tokenizer_type`` and ``special_tokens``. If extraction fails, an
        ``error`` key describes the failure and any keys gathered before the
        failure are kept.
    """
    info = {}
    try:
        # Get vocabulary size (dictionary size)
        if hasattr(tokenizer, 'vocab_size'):
            info['vocab_size'] = tokenizer.vocab_size
        elif hasattr(tokenizer, 'get_vocab'):
            info['vocab_size'] = len(tokenizer.get_vocab())

        # Get model max length if available. The type check keeps a None or
        # otherwise non-numeric value from raising and aborting the remaining
        # extraction; the upper bound is a sanity check on the value.
        max_length = getattr(tokenizer, 'model_max_length', None)
        if isinstance(max_length, (int, float)) and max_length < 1000000:
            info['model_max_length'] = max_length

        # Check tokenizer type
        info['tokenizer_type'] = tokenizer.__class__.__name__

        # Get special tokens, keeping only those that are set and non-blank
        special_tokens = {}
        for token_name in ['pad_token', 'eos_token', 'bos_token', 'sep_token', 'cls_token', 'unk_token', 'mask_token']:
            token_value = getattr(tokenizer, token_name, None)
            if token_value and str(token_value).strip():
                special_tokens[token_name] = str(token_value)

        info['special_tokens'] = special_tokens

    except Exception as e:
        info['error'] = f"Error extracting tokenizer info: {str(e)}"

    return info
108
+
109
+ # The `c` string and the exec() call below were part of the original code and are left here as-is (commented out).
110
+ # c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
111
+ # exec(c)
112
+
113
def _tokenizer_info_for(cache_key, tokenizer):
    """Return the cached info for cache_key, computing and caching it on first use."""
    if cache_key not in tokenizer_info_cache:
        tokenizer_info_cache[cache_key] = get_tokenizer_info(tokenizer)
    return tokenizer_info_cache[cache_key]


def _evict_expired_custom_tokenizers(now):
    """Drop custom tokenizers (and their cached info) older than CACHE_EXPIRATION."""
    expired = [
        key for key, (_, loaded_at) in custom_tokenizers.items()
        if now - loaded_at >= CACHE_EXPIRATION
    ]
    for key in expired:
        del custom_tokenizers[key]
        tokenizer_info_cache.pop(key, None)


def load_tokenizer(model_id_or_name):
    """
    Load tokenizer if not already loaded. Handles both predefined models and custom HF paths.

    Args:
        model_id_or_name: Either a key of TOKENIZER_MODELS or any other string,
            which is passed unchanged to AutoTokenizer.from_pretrained().

    Returns:
        A tuple of (tokenizer, tokenizer_info, error_message). On success
        error_message is None. On failure tokenizer is None, tokenizer_info is
        whatever was cached for this model before the call (possibly empty),
        and error_message describes the failure.
    """
    # Captured up front so a failed (re)load can still report cached info.
    tokenizer_info = tokenizer_info_cache.get(model_id_or_name, {})

    try:
        # Predefined models are loaded once and kept without expiration.
        if model_id_or_name in TOKENIZER_MODELS:
            if model_id_or_name not in tokenizers:
                model_name = TOKENIZER_MODELS[model_id_or_name]['name']
                tokenizers[model_id_or_name] = AutoTokenizer.from_pretrained(model_name)
            tokenizer = tokenizers[model_id_or_name]
            return tokenizer, _tokenizer_info_for(model_id_or_name, tokenizer), None

        # It's a custom model path. Purge every expired entry first so that
        # tokenizers which are never requested again do not stay in memory.
        current_time = time.time()
        _evict_expired_custom_tokenizers(current_time)

        if model_id_or_name in custom_tokenizers:
            cached_tokenizer, _ = custom_tokenizers[model_id_or_name]
            return cached_tokenizer, _tokenizer_info_for(model_id_or_name, cached_tokenizer), None

        # Not in cache or expired, load it
        tokenizer = AutoTokenizer.from_pretrained(model_id_or_name)
        # Store in cache with timestamp
        custom_tokenizers[model_id_or_name] = (tokenizer, current_time)
        # Clear any previous errors for this model
        custom_model_errors.pop(model_id_or_name, None)

        # A fresh load always refreshes the cached info
        tokenizer_info = get_tokenizer_info(tokenizer)
        tokenizer_info_cache[model_id_or_name] = tokenizer_info

        return tokenizer, tokenizer_info, None

    except Exception as e:
        error_message = f"Failed to load tokenizer: {str(e)}"
        # Store error for future reference
        custom_model_errors[model_id_or_name] = error_message
        return None, tokenizer_info, error_message
171
+
172
def get_varied_color(token: str) -> dict:
    """Derive a deterministic HSL background/text colour pair from a token.

    The colour components are taken from fixed slices of the MD5 hex digest
    of the token, so equal tokens always receive equal colours.

    Returns:
        A dict with 'background' and 'text' keys holding CSS ``hsl(...)`` strings.
    """
    digest = hashlib.md5(token.encode()).hexdigest()
    hue_hex, saturation_hex, lightness_hex = digest[:3], digest[3:5], digest[5:7]

    hue = int(hue_hex, 16) % 360
    saturation = int(saturation_hex, 16) % 20 + 70
    lightness = int(lightness_hex, 16) % 10 + 80

    # Dark text on light backgrounds, light text otherwise
    if lightness > 50:
        text_lightness = 20
    else:
        text_lightness = 90

    background_css = f'hsl({hue}, {saturation}%, {lightness}%)'
    text_css = f'hsl({hue}, {saturation}%, {text_lightness}%)'
    return {'background': background_css, 'text': text_css}
184
+
185
def fix_token(token: str, tokenizer) -> str:
    """
    Turn a raw token into human-readable text before it is shown in the UI.

    The token is converted to its ID and decoded again through
    tokenizer.decode(). Empty or whitespace-only tokens are returned unchanged.
    """
    if token.strip():
        # Look up the ID of this (sub-word) token, then decode that single ID
        token_ids = [tokenizer.convert_tokens_to_ids(token)]
        return tokenizer.decode(token_ids, clean_up_tokenization_spaces=False)
    return token
197
+
198
def get_token_stats(tokens: list, original_text: str) -> dict:
    """Calculate enhanced statistics about the tokens.

    Args:
        tokens: Token strings to analyse.
        original_text: Text the tokens came from; only its length is used,
            for the characters-per-token compression ratio.

    Returns:
        An empty dict when ``tokens`` is empty. Otherwise a dict with two
        sub-dicts: ``basic_stats`` (counts and ratios) and ``length_stats``
        (average, standard deviation, min, max and median token length).
        ``median_length`` is the mean of the two middle lengths when the
        number of tokens is even, so it may be a float.
    """
    if not tokens:
        return {}

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    compression_ratio = len(original_text) / total_tokens

    # Token type analysis (character-based heuristics on the raw token text)
    space_tokens = sum(1 for t in tokens if t.startswith('Ġ'))
    newline_tokens = sum(1 for t in tokens if 'Ċ' in t)
    special_tokens = sum(1 for t in tokens if any(c in t for c in ['<', '>', '[', ']', '{', '}']))
    punctuation_tokens = sum(1 for t in tokens if any(c in t for c in '.,!?;:()'))

    # Length distribution (population variance / standard deviation)
    lengths = [len(t) for t in tokens]
    avg_length = sum(lengths) / total_tokens
    variance = sum((x - avg_length) ** 2 for x in lengths) / total_tokens
    std_dev = math.sqrt(variance)

    # Median: middle element for odd counts, mean of the two middle elements
    # for even counts (previously the upper-middle element was returned).
    ordered_lengths = sorted(lengths)
    middle = total_tokens // 2
    if total_tokens % 2:
        median_length = ordered_lengths[middle]
    else:
        median_length = (ordered_lengths[middle - 1] + ordered_lengths[middle]) / 2

    return {
        'basic_stats': {
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'compression_ratio': round(compression_ratio, 2),
            'space_tokens': space_tokens,
            'newline_tokens': newline_tokens,
            'special_tokens': special_tokens,
            'punctuation_tokens': punctuation_tokens,
            'unique_percentage': round(unique_tokens/total_tokens * 100, 1)
        },
        'length_stats': {
            'avg_length': round(avg_length, 2),
            'std_dev': round(std_dev, 2),
            'min_length': ordered_lengths[0],
            'max_length': ordered_lengths[-1],
            'median_length': median_length
        }
    }
239
+
240
def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """Process text and return tokenization data.

    Args:
        text: Text to tokenize. Ignored when both ``file_path`` and
            ``is_full_file`` are set.
        model_id_or_name: Model identifier handed to load_tokenizer().
        is_full_file: When true, only the first 8096 characters are tokenized
            for display, while statistics still cover the whole input.
        file_path: Path of a UTF-8 text file to read instead of ``text``
            (used only together with ``is_full_file``).

    Returns:
        A dict with the display tokens (at most 50000), the statistics from
        get_token_stats(), the total token count, preview flags and the
        tokenizer info.

    Raises:
        Exception: If load_tokenizer() reports an error for the model.
    """
    tokenizer, tokenizer_info, error = load_tokenizer(model_id_or_name)

    if error:
        raise Exception(error)

    # For file uploads, read only preview from file but process full file for stats
    if file_path and is_full_file:
        # Read the preview for display with UTF-8
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            preview_text = f.read(8096)
        preview_tokens = tokenizer.tokenize(preview_text)

        # Tokenize the full file in 1MB chunks so the whole text is never held
        # in memory at once. NOTE: chunks are tokenized independently, so
        # tokens spanning a chunk boundary may differ from a single-pass run.
        total_tokens = []
        total_length = 0
        chunk_size = 1024 * 1024  # 1MB chunks

        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                total_tokens.extend(tokenizer.tokenize(chunk))

        # get_token_stats() only needs the text length, so a placeholder
        # string of the same length stands in for the original text.
        stats = get_token_stats(total_tokens, ' ' * total_length)
    else:
        # Standard processing for normal text input
        total_tokens = tokenizer.tokenize(text)

        # For display: a preview covers only the first 8096 chars; otherwise
        # the full tokenization is reused instead of tokenizing the text twice.
        preview_tokens = tokenizer.tokenize(text[:8096]) if is_full_file else total_tokens

        # Always use full text for stats
        stats = get_token_stats(total_tokens, text)

    display_tokens = preview_tokens[:50000]

    # Format tokens for display
    token_data = []
    for idx, token in enumerate(display_tokens):
        decoded_token = fix_token(token, tokenizer)
        # A trailing newline is reported through the flag and stripped from the display text
        newline_flag = decoded_token.endswith('\n')
        display_str = decoded_token[:-1] if newline_flag else decoded_token

        token_data.append({
            'original': token,  # raw token
            'display': display_str,  # decoded, human-readable token
            'colors': get_varied_color(token),
            'newline': newline_flag,
            'token_id': tokenizer.convert_tokens_to_ids(token),
            'token_index': idx
        })

    total_token_count = len(total_tokens)

    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > 50000 and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file,
        'tokenizer_info': tokenizer_info
    }
318
+
319
+ # ===== CSS changed to a bright, cool-looking UI (the rest of the Python/HTML code structure is kept as-is) =====
320
+ HTML_TEMPLATE = """
321
+ <!DOCTYPE html>
322
+ <html>
323
+ <head>
324
+ <title>Token Visualizer</title>
325
+ <meta charset="UTF-8">
326
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
327
+ <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
328
+ <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
329
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
330
+ <style>
331
+ :root {
332
+ /* 메인 컬러 계열: 블루+화이트 톤 */
333
+ --primary-color: #388bfd; /* 메인 포인트 파랑 */
334
+ --primary-hover: #2c72d4; /* hover시 좀 더 진한 파랑 */
335
+ --bg-color: #f3f7fc; /* 부드러운 흰 배경 */
336
+ --card-bg: #ffffff; /* 카드 배경색 */
337
+ --card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1),
338
+ 0 2px 4px -1px rgba(0, 0, 0, 0.06);
339
+ --transition: all 0.3s ease;
340
+ --text-color: #2e2e2e; /* 일반 텍스트 컬러 */
341
+ --secondary-text: #6c757d; /* 서브 텍스트 */
342
+ --input-bg: #ffffff; /* 입력창 배경 */
343
+ --input-border: #ced4da; /* 입력창 테두리 */
344
+ --input-focus: #388bfd; /* 포커스 시 테두리 컬러 */
345
+ }
346
+
347
+ * {
348
+ margin: 0;
349
+ padding: 0;
350
+ box-sizing: border-box;
351
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
352
+ scrollbar-width: thin;
353
+ scrollbar-color: var(--primary-color) var(--bg-color);
354
+ }
355
+
356
+ /* 스크롤바 */
357
+ ::-webkit-scrollbar {
358
+ width: 12px;
359
+ height: 12px;
360
+ }
361
+ ::-webkit-scrollbar-track {
362
+ background: var(--bg-color);
363
+ border-radius: 10px;
364
+ }
365
+ ::-webkit-scrollbar-thumb {
366
+ background: var(--primary-color);
367
+ border-radius: 10px;
368
+ border: 2px solid var(--bg-color);
369
+ }
370
+ ::-webkit-scrollbar-thumb:hover {
371
+ background: var(--primary-hover);
372
+ }
373
+
374
+ @keyframes spin {
375
+ from { transform: rotate(0deg); }
376
+ to { transform: rotate(360deg); }
377
+ }
378
+
379
+ body {
380
+ background-color: var(--bg-color);
381
+ padding: 2rem;
382
+ min-height: 100vh;
383
+ /* 부드러운 그라디언트 */
384
+ background-image:
385
+ radial-gradient(circle at 20% 20%, rgba(56,139,253, 0.06) 0%, transparent 50%),
386
+ radial-gradient(circle at 80% 80%, rgba(56,139,253, 0.06) 0%, transparent 50%);
387
+ color: var(--text-color);
388
+ }
389
+
390
+ .container {
391
+ max-width: 1200px;
392
+ margin: 0 auto;
393
+ }
394
+
395
+ .header {
396
+ display: flex;
397
+ justify-content: space-between;
398
+ align-items: center;
399
+ margin-bottom: 2rem;
400
+ position: relative;
401
+ flex-wrap: wrap;
402
+ }
403
+
404
+ .title-section {
405
+ flex-grow: 1;
406
+ }
407
+
408
+ .title {
409
+ font-size: 2.5rem;
410
+ font-weight: 800;
411
+ color: var(--primary-color);
412
+ margin-bottom: 0.5rem;
413
+ }
414
+
415
+ .subtitle {
416
+ color: var(--secondary-text);
417
+ font-size: 1.1rem;
418
+ }
419
+
420
+ .model-selector {
421
+ position: relative;
422
+ min-width: 220px;
423
+ margin-top: 1rem;
424
+ }
425
+
426
+ .model-selector-header {
427
+ display: flex;
428
+ gap: 0.5rem;
429
+ margin-bottom: 0.5rem;
430
+ justify-content: flex-end;
431
+ }
432
+
433
+ .model-type-toggle {
434
+ display: flex;
435
+ background-color: #e9ecef;
436
+ border-radius: 0.5rem;
437
+ padding: 0.25rem;
438
+ overflow: hidden;
439
+ }
440
+
441
+ .toggle-option {
442
+ padding: 0.5rem 0.75rem;
443
+ font-size: 0.8rem;
444
+ font-weight: 500;
445
+ cursor: pointer;
446
+ transition: var(--transition);
447
+ border-radius: 0.375rem;
448
+ color: var(--secondary-text);
449
+ }
450
+
451
+ .toggle-option.active {
452
+ background-color: var(--primary-color);
453
+ color: #fff;
454
+ }
455
+
456
+ select {
457
+ width: 100%;
458
+ padding: 0.75rem 1rem;
459
+ border: 2px solid var(--input-border);
460
+ border-radius: 0.5rem;
461
+ font-size: 1rem;
462
+ color: var(--text-color);
463
+ background-color: var(--input-bg);
464
+ cursor: pointer;
465
+ transition: var(--transition);
466
+ appearance: none;
467
+ background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%23388bfd'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E");
468
+ background-repeat: no-repeat;
469
+ background-position: right 1rem center;
470
+ background-size: 1.5rem;
471
+ }
472
+
473
+ select:hover, .custom-model-input:hover {
474
+ border-color: var(--primary-hover);
475
+ }
476
+
477
+ select:focus, .custom-model-input:focus {
478
+ outline: none;
479
+ border-color: var(--primary-color);
480
+ box-shadow: 0 0 0 3px rgba(56,139,253, 0.15);
481
+ }
482
+
483
+ .custom-model-input {
484
+ width: 100%;
485
+ padding: 0.75rem 1rem;
486
+ border: 2px solid var(--input-border);
487
+ border-radius: 0.5rem;
488
+ font-size: 1rem;
489
+ color: var(--text-color);
490
+ background-color: var(--input-bg);
491
+ transition: var(--transition);
492
+ }
493
+
494
+ .input-section {
495
+ margin-bottom: 2rem;
496
+ }
497
+
498
+ textarea {
499
+ width: 100%;
500
+ height: 150px;
501
+ padding: 1.25rem;
502
+ border: 2px solid var(--input-border);
503
+ border-radius: 0.75rem;
504
+ resize: vertical;
505
+ font-size: 1rem;
506
+ margin-bottom: 1rem;
507
+ transition: var(--transition);
508
+ background-color: var(--input-bg);
509
+ color: var(--text-color);
510
+ }
511
+
512
+ textarea:focus {
513
+ outline: none;
514
+ border-color: var(--input-focus);
515
+ box-shadow: 0 0 0 3px rgba(56,139,253, 0.15);
516
+ }
517
+
518
+ .button-container {
519
+ display: flex;
520
+ justify-content: center;
521
+ width: 100%;
522
+ gap: 1rem;
523
+ }
524
+
525
+ button {
526
+ padding: 0.875rem 2.5rem;
527
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
528
+ color: #fff;
529
+ border: none;
530
+ border-radius: 0.75rem;
531
+ font-size: 1.1rem;
532
+ font-weight: 600;
533
+ cursor: pointer;
534
+ transition: var(--transition);
535
+ box-shadow: 0 4px 6px -1px rgba(56,139,253, 0.2);
536
+ }
537
+
538
+ button:hover {
539
+ transform: translateY(-2px);
540
+ box-shadow: 0 6px 8px -1px rgba(56,139,253, 0.3);
541
+ }
542
+
543
+ button:active {
544
+ transform: translateY(0);
545
+ }
546
+
547
+ button:disabled {
548
+ opacity: 0.7;
549
+ cursor: not-allowed;
550
+ }
551
+
552
+ .card {
553
+ background-color: var(--card-bg);
554
+ border-radius: 1rem;
555
+ box-shadow: var(--card-shadow);
556
+ padding: 1.5rem;
557
+ margin-bottom: 2rem;
558
+ transition: var(--transition);
559
+ }
560
+
561
+ .card:hover {
562
+ transform: translateY(-1px);
563
+ box-shadow: 0 5px 10px -2px rgba(0,0,0,0.1);
564
+ }
565
+
566
+ .card-title {
567
+ font-size: 1.25rem;
568
+ font-weight: 700;
569
+ color: var(--text-color);
570
+ margin-bottom: 1.25rem;
571
+ display: flex;
572
+ align-items: center;
573
+ gap: 0.5rem;
574
+ cursor: default;
575
+ }
576
+
577
+ .card-title::before {
578
+ content: '';
579
+ display: block;
580
+ width: 4px;
581
+ height: 1.25rem;
582
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
583
+ border-radius: 2px;
584
+ }
585
+
586
+ .token-container {
587
+ display: flex;
588
+ flex-wrap: wrap;
589
+ gap: 0.375rem;
590
+ margin-bottom: 1rem;
591
+ padding: 1rem;
592
+ background-color: #f8fafc;
593
+ border-radius: 0.5rem;
594
+ max-height: 200px;
595
+ overflow-y: auto;
596
+ transition: max-height 0.3s ease;
597
+ }
598
+
599
+ .token-container.expanded {
600
+ max-height: none;
601
+ }
602
+
603
+ .token {
604
+ padding: 0.375rem 0.75rem;
605
+ border-radius: 0.375rem;
606
+ background-color: var(--input-bg);
607
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
608
+ font-size: 0.875rem;
609
+ color: var(--text-color);
610
+ cursor: default;
611
+ transition: var(--transition);
612
+ box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
613
+ }
614
+
615
+ .token:hover {
616
+ transform: translateY(-1px);
617
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.08);
618
+ }
619
+
620
+ .stats-grid {
621
+ display: grid;
622
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
623
+ gap: 1.5rem;
624
+ margin-bottom: 2rem;
625
+ }
626
+
627
+ .stat-card {
628
+ background-color: var(--card-bg);
629
+ padding: 1.5rem;
630
+ border-radius: 1rem;
631
+ box-shadow: var(--card-shadow);
632
+ transition: var(--transition);
633
+ }
634
+
635
+ .stat-card:hover {
636
+ transform: translateY(-2px);
637
+ box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
638
+ }
639
+
640
+ .stat-title {
641
+ color: var(--secondary-text);
642
+ font-size: 0.875rem;
643
+ font-weight: 500;
644
+ margin-bottom: 0.5rem;
645
+ text-transform: uppercase;
646
+ letter-spacing: 0.05em;
647
+ }
648
+
649
+ .stat-value {
650
+ color: var(--text-color);
651
+ font-size: 2rem;
652
+ font-weight: 700;
653
+ line-height: 1.2;
654
+ margin-bottom: 0.25rem;
655
+ }
656
+
657
+ .stat-description {
658
+ color: var(--secondary-text);
659
+ font-size: 0.875rem;
660
+ }
661
+
662
+ .expand-button {
663
+ background: none;
664
+ border: none;
665
+ color: var(--primary-color);
666
+ font-size: 0.875rem;
667
+ padding: 0.5rem;
668
+ cursor: pointer;
669
+ display: block;
670
+ margin: 0 auto;
671
+ box-shadow: none;
672
+ }
673
+
674
+ .expand-button:hover {
675
+ text-decoration: underline;
676
+ transform: none;
677
+ box-shadow: none;
678
+ }
679
+
680
+ .error-message {
681
+ color: #EF4444;
682
+ background-color: #fee2e2;
683
+ border: 1px solid #fecaca;
684
+ padding: 1rem;
685
+ border-radius: 0.5rem;
686
+ margin-bottom: 1rem;
687
+ display: none;
688
+ }
689
+
690
+ .display-limit-notice {
691
+ background-color: #fff9db;
692
+ border: 1px solid #fef3c7;
693
+ color: #b45309;
694
+ padding: 0.75rem;
695
+ border-radius: 0.5rem;
696
+ margin-top: 1rem;
697
+ font-size: 0.875rem;
698
+ display: none;
699
+ }
700
+
701
+ /* File drop zone styles */
702
+ .file-drop-zone {
703
+ position: fixed;
704
+ top: 0;
705
+ left: 0;
706
+ width: 100%;
707
+ height: 100%;
708
+ background-color: rgba(56,139,253, 0.15);
709
+ z-index: 1000;
710
+ display: flex;
711
+ justify-content: center;
712
+ align-items: center;
713
+ opacity: 0;
714
+ pointer-events: none;
715
+ transition: opacity 0.3s ease;
716
+ }
717
+
718
+ .file-drop-zone.active {
719
+ opacity: 1;
720
+ pointer-events: all;
721
+ }
722
+
723
+ .drop-indicator {
724
+ background-color: var(--card-bg);
725
+ border: 2px dashed var(--primary-color);
726
+ border-radius: 1rem;
727
+ padding: 2rem;
728
+ text-align: center;
729
+ width: 60%;
730
+ max-width: 400px;
731
+ box-shadow: 0 8px 32px rgba(0, 0, 0, 0.15);
732
+ animation: pulse 2s infinite;
733
+ }
734
+
735
+ @keyframes pulse {
736
+ 0% { transform: scale(1); }
737
+ 50% { transform: scale(1.05); }
738
+ 100% { transform: scale(1); }
739
+ }
740
+
741
+ .drop-indicator p {
742
+ margin-bottom: 0.5rem;
743
+ color: var(--text-color);
744
+ font-size: 1.2rem;
745
+ }
746
+
747
+ .file-icon {
748
+ font-size: 3rem;
749
+ margin-bottom: 1rem;
750
+ color: var(--primary-color);
751
+ }
752
+
753
+ .file-upload-icon {
754
+ position: fixed;
755
+ bottom: 20px;
756
+ left: 20px;
757
+ width: 45px;
758
+ height: 45px;
759
+ background-color: var(--card-bg);
760
+ border-radius: 50%;
761
+ display: flex;
762
+ justify-content: center;
763
+ align-items: center;
764
+ cursor: pointer;
765
+ z-index: 100;
766
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.15);
767
+ transition: transform 0.2s ease, box-shadow 0.2s ease;
768
+ }
769
+
770
+ .file-upload-icon:hover {
771
+ transform: translateY(-2px);
772
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
773
+ }
774
+
775
+ .file-upload-icon span {
776
+ font-size: 1.5rem;
777
+ color: var(--primary-color);
778
+ }
779
+
780
+ .file-info {
781
+ position: fixed;
782
+ bottom: 20px;
783
+ left: 75px;
784
+ background-color: var(--card-bg);
785
+ color: var(--primary-color);
786
+ font-weight: 500;
787
+ padding: 0.5rem 1rem;
788
+ border-radius: 1rem;
789
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.15);
790
+ max-width: 270px;
791
+ white-space: nowrap;
792
+ overflow: hidden;
793
+ text-overflow: ellipsis;
794
+ z-index: 100;
795
+ display: none;
796
+ }
797
+
798
+ .file-detach {
799
+ margin-left: 8px;
800
+ display: inline-block;
801
+ width: 18px;
802
+ height: 18px;
803
+ background-color: rgba(0, 0, 0, 0.05);
804
+ color: #ef4444;
805
+ border-radius: 50%;
806
+ text-align: center;
807
+ line-height: 16px;
808
+ font-size: 12px;
809
+ cursor: pointer;
810
+ transition: all 0.2s ease;
811
+ }
812
+
813
+ .file-detach:hover {
814
+ background-color: rgba(239, 68, 68, 0.15);
815
+ transform: scale(1.1);
816
+ }
817
+
818
+ .preview-notice {
819
+ background-color: #e1f0ff;
820
+ border: 1px solid #bfdbfe;
821
+ color: #2563eb;
822
+ padding: 0.75rem;
823
+ border-radius: 0.5rem;
824
+ margin-top: 1rem;
825
+ font-size: 0.875rem;
826
+ display: none;
827
+ }
828
+
829
+ .custom-model-wrapper {
830
+ position: relative;
831
+ }
832
+
833
+ .model-badge {
834
+ position: absolute;
835
+ top: -10px;
836
+ right: -5px;
837
+ background: linear-gradient(135deg, #22c55e 0%, #15803d 100%);
838
+ color: white;
839
+ font-size: 0.7rem;
840
+ font-weight: 700;
841
+ padding: 0.25rem 0.5rem;
842
+ border-radius: 999px;
843
+ transform: scale(0);
844
+ transition: transform 0.3s cubic-bezier(0.175, 0.885, 0.32, 1.275);
845
+ box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
846
+ z-index: 10;
847
+ }
848
+
849
+ .model-badge.show {
850
+ transform: scale(1);
851
+ }
852
+
853
+ .custom-model-help {
854
+ display: inline-block;
855
+ width: 16px;
856
+ height: 16px;
857
+ line-height: 16px;
858
+ font-size: 11px;
859
+ font-weight: bold;
860
+ text-align: center;
861
+ background-color: var(--secondary-text);
862
+ color: var(--card-bg);
863
+ border-radius: 50%;
864
+ margin-left: 5px;
865
+ cursor: help;
866
+ vertical-align: middle;
867
+ }
868
+
869
+ .tooltip {
870
+ position: absolute;
871
+ top: 100%;
872
+ left: 0;
873
+ width: 280px;
874
+ background-color: #333;
875
+ color: #fff;
876
+ padding: 0.75rem;
877
+ border-radius: 0.5rem;
878
+ font-size: 0.8rem;
879
+ margin-top: 0.5rem;
880
+ z-index: 100;
881
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
882
+ opacity: 0;
883
+ visibility: hidden;
884
+ transition: opacity 0.2s, visibility 0.2s;
885
+ }
886
+
887
+ .custom-model-help:hover + .tooltip {
888
+ opacity: 1;
889
+ visibility: visible;
890
+ }
891
+
892
+ /* Tokenizer info icon and tooltip styles */
893
+ .tokenizer-info-icon {
894
+ display: inline-flex;
895
+ align-items: center;
896
+ justify-content: center;
897
+ width: 24px;
898
+ height: 24px;
899
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
900
+ color: white;
901
+ border-radius: 50%;
902
+ position: absolute;
903
+ left: -32px;
904
+ top: 50%;
905
+ transform: translateY(-50%);
906
+ cursor: pointer;
907
+ font-size: 12px;
908
+ font-weight: bold;
909
+ transition: all 0.2s ease;
910
+ z-index: 10;
911
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
912
+ }
913
+
914
+ .tokenizer-info-icon:hover {
915
+ transform: translateY(-50%) scale(1.1);
916
+ box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
917
+ }
918
+
919
+ .tokenizer-info-tooltip {
920
+ position: absolute;
921
+ top: calc(100% + 8px);
922
+ left: -30px;
923
+ width: 300px;
924
+ background-color: var(--card-bg);
925
+ color: var(--text-color);
926
+ border: 1px solid var(--primary-color);
927
+ border-radius: 0.75rem;
928
+ box-shadow: 0 5px 15px rgba(0, 0, 0, 0.15);
929
+ padding: 1rem;
930
+ z-index: 1000;
931
+ opacity: 0;
932
+ visibility: hidden;
933
+ transition: opacity 0.3s, visibility 0.3s;
934
+ pointer-events: none;
935
+ }
936
+
937
+ .tokenizer-info-icon:not(.tooltip-disabled):hover + .tokenizer-info-tooltip {
938
+ opacity: 1;
939
+ visibility: visible;
940
+ pointer-events: auto;
941
+ }
942
+
943
+ .tokenizer-info-tooltip:hover {
944
+ opacity: 1;
945
+ visibility: visible;
946
+ pointer-events: auto;
947
+ }
948
+
949
+ .tokenizer-info-header {
950
+ font-size: 1.1rem;
951
+ font-weight: 600;
952
+ margin-bottom: 0.5rem;
953
+ padding-bottom: 0.5rem;
954
+ border-bottom: 1px solid rgba(0, 0, 0, 0.1);
955
+ color: var(--primary-color);
956
+ }
957
+
958
+ .tokenizer-info-grid {
959
+ display: grid;
960
+ grid-template-columns: repeat(2, 1fr);
961
+ gap: 0.75rem;
962
+ margin: 0.75rem 0;
963
+ }
964
+
965
+ .tokenizer-info-item {
966
+ display: flex;
967
+ flex-direction: column;
968
+ }
969
+
970
+ .tokenizer-info-label {
971
+ font-size: 0.75rem;
972
+ color: var(--secondary-text);
973
+ margin-bottom: 0.25rem;
974
+ }
975
+
976
+ .tokenizer-info-value {
977
+ font-size: 0.95rem;
978
+ font-weight: 500;
979
+ }
980
+
981
+ .special-tokens-container {
982
+ margin-top: 0.75rem;
983
+ background-color: rgba(56,139,253, 0.06);
984
+ border-radius: 0.5rem;
985
+ padding: 0.5rem;
986
+ max-height: 100px;
987
+ overflow-y: auto;
988
+ }
989
+
990
+ .special-token-item {
991
+ display: flex;
992
+ justify-content: space-between;
993
+ margin-bottom: 0.25rem;
994
+ font-size: 0.8rem;
995
+ }
996
+
997
+ .token-name {
998
+ color: var(--secondary-text);
999
+ }
1000
+
1001
+ .token-value {
1002
+ background-color: rgba(255, 255, 255, 0.4);
1003
+ padding: 1px 4px;
1004
+ border-radius: 2px;
1005
+ font-family: monospace;
1006
+ }
1007
+
1008
+ .tokenizer-info-loading {
1009
+ display: flex;
1010
+ justify-content: center;
1011
+ align-items: center;
1012
+ height: 100px;
1013
+ }
1014
+
1015
+ .tokenizer-info-spinner {
1016
+ width: 30px;
1017
+ height: 30px;
1018
+ border: 3px solid var(--primary-color);
1019
+ border-radius: 50%;
1020
+ border-top-color: transparent;
1021
+ animation: spin 1s linear infinite;
1022
+ }
1023
+
1024
+ .tokenizer-info-error {
1025
+ color: #f87171;
1026
+ font-size: 0.9rem;
1027
+ text-align: center;
1028
+ padding: 1rem;
1029
+ }
1030
+
1031
+ @media (max-width: 768px) {
1032
+ .header {
1033
+ flex-direction: column;
1034
+ align-items: flex-start;
1035
+ gap: 1rem;
1036
+ }
1037
+
1038
+ .model-selector {
1039
+ width: 100%;
1040
+ }
1041
+
1042
+ .stats-grid {
1043
+ grid-template-columns: 1fr;
1044
+ }
1045
+
1046
+ .tokenizer-info-tooltip {
1047
+ width: 250px;
1048
+ }
1049
+ }
1050
+ </style>
1051
+ </head>
1052
+ <body>
1053
+ <!-- Hidden File Drop Zone that appears when dragging files -->
1054
+ <div id="fileDropZone" class="file-drop-zone">
1055
+ <div class="drop-indicator">
1056
+ <div class="file-icon">📄</div>
1057
+ <p>Drop your file here</p>
1058
+ </div>
1059
+ </div>
1060
+
1061
+ <!-- File upload icon in bottom left corner -->
1062
+ <div id="fileUploadIcon" class="file-upload-icon">
1063
+ <span>📎</span>
1064
+ </div>
1065
+ <p class="file-info" id="fileInfo"></p>
1066
+
1067
+ <div class="container">
1068
+ <div class="header">
1069
+ <div class="title-section">
1070
+ <h1 class="title">Token Visualizer</h1>
1071
+ <p class="subtitle">Advanced tokenization analysis and visualization</p>
1072
+ </div>
1073
+ <div class="model-selector">
1074
+ <div class="model-selector-header">
1075
+ <div class="model-type-toggle">
1076
+ <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
1077
+ <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
1078
+ </div>
1079
+ </div>
1080
+ <div id="predefinedModelSelector">
1081
+ <div style="position: relative;">
1082
+ <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
1083
+ <!-- TOOLTIP MOVED HERE -->
1084
+ <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
1085
+ <div id="tokenizerInfoContent">
1086
+ <div class="tokenizer-info-loading">
1087
+ <div class="tokenizer-info-spinner"></div>
1088
+ </div>
1089
+ </div>
1090
+ </div>
1091
+ <!-- SELECT NOW COMES AFTER ICON AND TOOLTIP -->
1092
+ <select id="modelSelect" name="model">
1093
+ {% for model_id, info in models.items() %}
1094
+ <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
1095
+ {{ info.alias }}
1096
+ </option>
1097
+ {% endfor %}
1098
+ </select>
1099
+ </div>
1100
+ </div>
1101
+ <div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
1102
+ <div style="position: relative;">
1103
+ <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
1104
+ <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
1105
+ <div id="customTokenizerInfoContent">
1106
+ <div class="tokenizer-info-loading">
1107
+ <div class="tokenizer-info-spinner"></div>
1108
+ </div>
1109
+ </div>
1110
+ </div>
1111
+ <input type="text" id="customModelInput" class="custom-model-input"
1112
+ placeholder="Enter HuggingFace model path"
1113
+ value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
1114
+ </div>
1115
+ <span class="custom-model-help">?</span>
1116
+ <div class="tooltip">
1117
+ Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3").
1118
+ For Korean, you might use "beomi/KoAlpaca-Polyglot-12.8B" or "skt/kogpt2-base-v2", etc.
1119
+ The model must have a tokenizer available and be accessible.
1120
+ </div>
1121
+ <div class="model-badge" id="modelSuccessBadge">Loaded</div>
1122
+ </div>
1123
+ </div>
1124
+ </div>
1125
+
1126
+ <div class="error-message" id="errorMessage">{{ error }}</div>
1127
+
1128
+ <div class="input-section">
1129
+ <form id="analyzeForm" method="POST" enctype="multipart/form-data">
1130
+ <textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
1131
+ <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
1132
+ <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
1133
+ <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
1134
+ <input type="file" name="file" id="fileInput" style="display: none;">
1135
+ <div class="button-container">
1136
+ <button type="submit" id="analyzeButton">Analyze Text</button>
1137
+ </div>
1138
+ </form>
1139
+ </div>
1140
+
1141
+ <div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
1142
+ <div class="card">
1143
+ <h2 class="card-title">Token Visualization</h2>
1144
+ <div class="preview-notice" id="previewNotice">
1145
+ Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
1146
+ </div>
1147
+ <div class="token-container" id="tokenContainer">
1148
+ {% if token_data %}
1149
+ {% for token in token_data.tokens %}
1150
+ <span class="token"
1151
+ style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
1152
+ title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
1153
+ {{ token.display }}
1154
+ </span>
1155
+ {% if token.newline %}<br>{% endif %}
1156
+ {% endfor %}
1157
+ {% endif %}
1158
+ </div>
1159
+ <button class="expand-button" id="expandButton">Show More</button>
1160
+ <div class="display-limit-notice" id="displayLimitNotice">
1161
+ Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
1162
+ </div>
1163
+ </div>
1164
+
1165
+ <div class="stats-grid">
1166
+ <div class="stat-card">
1167
+ <div class="stat-title">Total Tokens</div>
1168
+ <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
1169
+ <div class="stat-description">
1170
+ <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
1171
+ (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
1172
+ </div>
1173
+ </div>
1174
+ <div class="stat-card">
1175
+ <div class="stat-title">Token Types</div>
1176
+ <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
1177
+ <div class="stat-description">special tokens</div>
1178
+ </div>
1179
+ <div class="stat-card">
1180
+ <div class="stat-title">Whitespace</div>
1181
+ <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
1182
+ <div class="stat-description">
1183
+ spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
1184
+ newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
1185
+ </div>
1186
+ </div>
1187
+ <div class="stat-card">
1188
+ <div class="stat-title">Token Length</div>
1189
+ <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
1190
+ <div class="stat-description">
1191
+ median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
1192
+ ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
1193
+ </div>
1194
+ </div>
1195
+ <div class="stat-card">
1196
+ <div class="stat-title">Compression</div>
1197
+ <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
1198
+ <div class="stat-description">characters per token</div>
1199
+ </div>
1200
+ </div>
1201
+ </div>
1202
+ </div>
1203
+ <a href="https://huggingface.co/spaces/barttee/tokenizers" target="_blank" class="watermark" style="position: fixed; bottom: 20px; right: 20px; text-decoration: none; font-size: 1.4rem; font-weight: 700; color: var(--primary-color); opacity: 0.4;">
1204
+ @barttee/tokenizers
1205
+ </a>
1206
+
1207
+ <script>
1208
+ $(document).ready(function() {
1209
+ // File handling variables
1210
+ let currentFile = null;
1211
+ let originalTextContent = null;
1212
+ let lastUploadedFileName = null;
1213
+ let fileJustUploaded = false; // Flag to prevent immediate detachment
1214
+ let currentModelType = "{{ model_type if model_type else 'predefined' }}";
1215
+ let currentTokenizerInfo = null;
1216
+
1217
+ // Try to parse tokenizer info if available from server
1218
+ try {
1219
+ currentTokenizerInfo = {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }};
1220
+ if (currentTokenizerInfo) {
1221
+ updateTokenizerInfoDisplay(currentTokenizerInfo, currentModelType === 'custom');
1222
+ }
1223
+ } catch(e) {
1224
+ console.error("Error parsing tokenizer info:", e);
1225
+ }
1226
+
1227
+ // Show error if exists
1228
+ if ("{{ error }}".length > 0) {
1229
+ showError("{{ error }}");
1230
+ }
1231
+
1232
+ // Setup model type based on initial state
1233
+ if (currentModelType === "custom") {
1234
+ $('.toggle-option').removeClass('active');
1235
+ $('.custom-toggle').addClass('active');
1236
+ $('#predefinedModelSelector').hide();
1237
+ $('#customModelSelector').show();
1238
+ }
1239
+
1240
+ // Show success badge if custom model loaded successfully
1241
+ if (currentModelType === "custom" && !("{{ error }}".length > 0)) {
1242
+ $('#modelSuccessBadge').addClass('show');
1243
+ setTimeout(() => {
1244
+ $('#modelSuccessBadge').removeClass('show');
1245
+ }, 3000);
1246
+ }
1247
+
1248
+ // Toggle between predefined and custom model inputs
1249
+ $('.toggle-option').click(function() {
1250
+ const modelType = $(this).data('type');
1251
+ $('.toggle-option').removeClass('active');
1252
+ $(this).addClass('active');
1253
+ currentModelType = modelType;
1254
+
1255
+ if (modelType === 'predefined') {
1256
+ $('#predefinedModelSelector').show();
1257
+ $('#customModelSelector').hide();
1258
+ $('#modelTypeInput').val('predefined');
1259
+ // Set the model input value to the selected predefined model
1260
+ $('#modelInput').val($('#modelSelect').val());
1261
+ } else {
1262
+ $('#predefinedModelSelector').hide();
1263
+ $('#customModelSelector').show();
1264
+ $('#modelTypeInput').val('custom');
1265
+ }
1266
+
1267
+ // Clear tokenizer info if switching models
1268
+ if (modelType === 'predefined') {
1269
+ $('#tokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1270
+ fetchTokenizerInfo($('#modelSelect').val(), false);
1271
+ } else {
1272
+ $('#customTokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1273
+ const customModel = $('#customModelInput').val();
1274
+ if (customModel) {
1275
+ fetchTokenizerInfo(customModel, true);
1276
+ }
1277
+ }
1278
+ });
1279
+
1280
+ // Update hidden input when custom model input changes
1281
+ $('#customModelInput').on('input', function() {
1282
+ $('#customModelInputHidden').val($(this).val());
1283
+ });
1284
+
1285
/**
 * Show a message in the error banner for five seconds.
 *
 * The analyze form's AJAX error path passes the raw response body, which for
 * this app's JSON error replies is an object with an `error` field. When the
 * message parses as such an object, only that field is displayed instead of
 * the raw JSON text.
 *
 * @param {string} message - Text (or JSON error payload) to display.
 */
function showError(message) {
    let text = message;
    if (typeof message === 'string' && message.trim().charAt(0) === '{') {
        try {
            const payload = JSON.parse(message);
            if (payload && typeof payload.error === 'string') {
                text = payload.error;
            }
        } catch (parseError) {
            // Not JSON after all: show the message verbatim.
        }
    }

    const errorDiv = $('#errorMessage');
    // Cancel the pending hide (and any running fade) left over from an earlier
    // error so it cannot cut this message short.
    clearTimeout(errorDiv.data('hideTimer'));
    errorDiv.stop(true, true).text(text).show();
    errorDiv.data('hideTimer', setTimeout(() => errorDiv.fadeOut(), 5000));
}
1291
+
1292
/**
 * Render tokenizer metadata into the info tooltip.
 *
 * The markup is written with `.html()`, so every value taken from `info` is
 * HTML-escaped first; values are also coerced to strings so that a
 * non-string special token cannot abort the rendering.
 *
 * @param {Object} info - Tokenizer description. Fields read: `error`,
 *     `vocab_size`, `tokenizer_type`, `model_max_length`, `special_tokens`.
 * @param {boolean} [isCustom=false] - Write into the custom-model tooltip
 *     instead of the predefined-model one.
 */
function updateTokenizerInfoDisplay(info, isCustom = false) {
    const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';

    if (!info) {
        return;
    }

    const HTML_ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#039;' };
    const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => HTML_ENTITIES[ch]);

    if (info.error) {
        $(targetSelector).html(`<div class="tokenizer-info-error">${escapeHtml(info.error)}</div>`);
        return;
    }

    // One labelled cell of the two-column details grid; labels are fixed
    // strings from this function, values are escaped.
    const gridItem = (label, value) => `
        <div class="tokenizer-info-item">
            <span class="tokenizer-info-label">${label}</span>
            <span class="tokenizer-info-value">${escapeHtml(value)}</span>
        </div>`;

    let htmlContent = `<div class="tokenizer-info-header">Tokenizer Details</div>
        <div class="tokenizer-info-grid">`;

    if (info.vocab_size) {
        htmlContent += gridItem('Dictionary Size', info.vocab_size.toLocaleString());
    }
    if (info.tokenizer_type) {
        htmlContent += gridItem('Tokenizer Type', info.tokenizer_type);
    }
    if (info.model_max_length) {
        htmlContent += gridItem('Max Length', info.model_max_length.toLocaleString());
    }

    htmlContent += `</div>`;

    // Special tokens section
    if (info.special_tokens && Object.keys(info.special_tokens).length > 0) {
        htmlContent += `
            <div class="tokenizer-info-item" style="margin-top: 0.75rem;">
                <span class="tokenizer-info-label">Special Tokens</span>
                <div class="special-tokens-container">`;

        for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
            if (tokenValue === null || tokenValue === undefined) {
                continue; // nothing to display for an unset token
            }
            htmlContent += `
                <div class="special-token-item">
                    <span class="token-name">${escapeHtml(tokenName)}:</span>
                    <span class="token-value">${escapeHtml(tokenValue)}</span>
                </div>`;
        }

        htmlContent += `
                </div>
            </div>`;
    }

    $(targetSelector).html(htmlContent);
}
1361
+
1362
/**
 * Load tokenizer metadata from `/tokenizer-info` and show it in the tooltip.
 *
 * While the request is in flight the tooltip shows a spinner. Error text is
 * inserted as plain text, never as HTML. The endpoint reports failures as a
 * JSON body with an `error` field on a non-2xx status, so the `error`
 * callback surfaces that field when present and falls back to a generic
 * message otherwise.
 *
 * @param {string} modelId - Model identifier; nothing happens when empty.
 * @param {boolean} [isCustom=false] - Whether the id came from the custom
 *     model input (selects the tooltip to update).
 */
function fetchTokenizerInfo(modelId, isCustom = false) {
    if (!modelId) return;

    const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
    const showFailure = text => {
        $(targetSelector)
            .empty()
            .append($('<div>').addClass('tokenizer-info-error').text(text));
    };

    $(targetSelector).html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');

    $.ajax({
        url: '/tokenizer-info',
        method: 'GET',
        data: {
            model_id: modelId,
            is_custom: isCustom
        },
        success: function(response) {
            if (response.error) {
                showFailure(response.error);
            } else {
                currentTokenizerInfo = response;
                updateTokenizerInfoDisplay(response, isCustom);
            }
        },
        error: function(xhr) {
            const detail = xhr && xhr.responseJSON && xhr.responseJSON.error;
            showFailure(detail || 'Failed to load tokenizer information');
        }
    });
}
1389
+
1390
/**
 * Paint an analysis response onto the page.
 *
 * Reveals the results section, rebuilds the token chips, toggles the
 * display-limit and preview notices, fills the statistic cards, and refreshes
 * the tokenizer tooltip when the response carries tokenizer info.
 *
 * @param {Object} data - Analysis response. Fields read: `tokens`,
 *     `display_limit_reached`, `total_tokens`, `preview_only`, `stats`
 *     (`basic_stats`, `length_stats`) and optional `tokenizer_info`.
 */
function updateResults(data) {
    $('#results').show();

    // Rebuild the token chips from scratch.
    const chips = $('#tokenContainer');
    chips.empty();
    for (const tok of data.tokens) {
        $('<span>')
            .addClass('token')
            .css({ 'background-color': tok.colors.background, 'color': tok.colors.text })
            .attr('title', `Original token: ${tok.original} | Token ID: ${tok.token_id}`)
            .text(tok.display)
            .appendTo(chips);
        if (tok.newline) {
            chips.append('<br>');
        }
    }

    const limitNotice = $('#displayLimitNotice');
    if (!data.display_limit_reached) {
        limitNotice.hide();
    } else {
        limitNotice.show();
        $('#totalTokenCount').text(data.total_tokens);
    }

    $('#previewNotice').toggle(Boolean(data.preview_only));

    const setText = (id, value) => $('#' + id).text(value);

    const basic = data.stats.basic_stats;
    setText('totalTokens', basic.total_tokens);
    setText('uniqueTokens', `${basic.unique_tokens} unique`);
    setText('uniquePercentage', basic.unique_percentage);
    setText('specialTokens', basic.special_tokens);
    setText('spaceTokens', basic.space_tokens);
    setText('spaceCount', basic.space_tokens);
    setText('newlineCount', basic.newline_tokens);
    setText('compressionRatio', basic.compression_ratio);

    const lengths = data.stats.length_stats;
    setText('avgLength', lengths.avg_length);
    setText('medianLength', lengths.median_length);
    setText('stdDev', lengths.std_dev);

    const tokenizerInfo = data.tokenizer_info;
    if (tokenizerInfo) {
        currentTokenizerInfo = tokenizerInfo;
        updateTokenizerInfoDisplay(tokenizerInfo, currentModelType === 'custom');
    }
}
1442
+
1443
+ $('#textInput').on('input', function() {
1444
+ if (fileJustUploaded) {
1445
+ fileJustUploaded = false;
1446
+ return;
1447
+ }
1448
+
1449
+ const currentText = $(this).val();
1450
+ const fileInput = document.getElementById('fileInput');
1451
+
1452
+ if (fileInput.files.length > 0 && originalTextContent !== null) {
1453
+ const isMajorChange =
1454
+ currentText.length < originalTextContent.length * 0.8 ||
1455
+ (currentText.length > 0 &&
1456
+ currentText !== originalTextContent.substring(0, currentText.length) &&
1457
+ currentText.substring(0, Math.min(20, currentText.length)) !==
1458
+ originalTextContent.substring(0, Math.min(20, originalTextContent.length)));
1459
+
1460
+ if (isMajorChange) {
1461
+ detachFile();
1462
+ }
1463
+ }
1464
+ });
1465
+
1466
/**
 * Forget the uploaded file: clear the hidden file input, fade out the file
 * label, and take the current textarea content as the new comparison
 * baseline.
 */
function detachFile() {
    const hiddenFileInput = $('#fileInput');
    const fileLabel = $('#fileInfo');

    lastUploadedFileName = null;
    originalTextContent = $('#textInput').val();

    hiddenFileInput.val('');
    fileLabel.fadeOut(300);
}
1472
+
1473
+ $('#modelSelect').change(function() {
1474
+ const selectedModel = $(this).val();
1475
+ $('#modelInput').val(selectedModel);
1476
+ fetchTokenizerInfo(selectedModel, false);
1477
+
1478
+ if ($('#textInput').val().trim()) {
1479
+ $('#analyzeForm').submit();
1480
+ }
1481
+ });
1482
+
1483
// ---- Drag-and-drop wiring for the full-page file drop zone ----
const fileDropZone = $('#fileDropZone');
const fileUploadIcon = $('#fileUploadIcon');

// Suppress the browser's own handling of drag events on the drop zone and on
// the page body.
['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
    fileDropZone[0].addEventListener(eventName, preventDefaults, false);
    document.body.addEventListener(eventName, preventDefaults, false);
});

/**
 * Cancel the event's default action and stop it from propagating further.
 *
 * @param {Event} e - The drag event being handled.
 */
function preventDefaults(e) {
    e.preventDefault();
    e.stopPropagation();
}

// These listeners must run in the capture phase. preventDefaults on
// document.body calls stopPropagation(), so a bubbling drag event that
// started inside the body never reaches a bubble-phase listener on
// `document`, and the drop zone would not be revealed.
document.addEventListener('dragenter', showDropZone, true);
document.addEventListener('dragover', showDropZone, true);

fileDropZone[0].addEventListener('dragleave', hideDropZone, false);
fileDropZone[0].addEventListener('drop', hideDropZone, false);

/** Reveal the drop zone while something is being dragged over the page. */
function showDropZone(e) {
    fileDropZone.addClass('active');
}

/** Hide the drop zone again once the drag leaves it or the item is dropped. */
function hideDropZone() {
    fileDropZone.removeClass('active');
}

fileDropZone[0].addEventListener('drop', handleDrop, false);
1511
+
1512
+ fileUploadIcon.on('click', function() {
1513
+ const input = document.createElement('input');
1514
+ input.type = 'file';
1515
+ input.onchange = e => {
1516
+ handleFiles(e.target.files);
1517
+ };
1518
+ input.click();
1519
+ });
1520
+
1521
/**
 * Drop handler for the drop zone: hand the dropped files to handleFiles.
 *
 * @param {DragEvent} e - The drop event; its `dataTransfer.files` is used.
 */
function handleDrop(e) {
    handleFiles(e.dataTransfer.files);
}
1526
+
1527
/**
 * Attach the first file of a file list to the analyze form and submit it.
 *
 * Shows the file name and size with a detach button, copies the file into
 * the hidden `#fileInput`, loads the first 8096 characters into the textarea
 * as a preview, and then submits the form.
 *
 * The file name is user-controlled, so it is inserted as a text node rather
 * than being interpolated into HTML.
 *
 * @param {FileList|File[]} files - Files from a drop or a file picker; only
 *     the first entry is used. Nothing happens when the list is empty.
 */
function handleFiles(files) {
    if (!files || !files.length) {
        return;
    }

    const file = files[0];
    currentFile = file;
    lastUploadedFileName = file.name;
    fileJustUploaded = true;

    const detachButton = $('<span>')
        .addClass('file-detach')
        .attr('id', 'fileDetach')
        .append('<i class="fas fa-times"></i>')
        .on('click', function(e) {
            e.stopPropagation();
            detachFile();
            return false;
        });

    $('#fileInfo')
        .empty()
        .append(document.createTextNode(`${file.name} (${formatFileSize(file.size)}) `), detachButton)
        .fadeIn(300);

    // Mirror the file into the form's hidden input so it is part of the POST.
    const dataTransfer = new DataTransfer();
    dataTransfer.items.add(file);
    document.getElementById('fileInput').files = dataTransfer.files;

    const reader = new FileReader();
    reader.onload = function(e) {
        const previewText = e.target.result.slice(0, 8096);
        $('#textInput').val(previewText);

        // Record the baseline and submit shortly after the textarea has been
        // filled with the preview.
        setTimeout(() => {
            originalTextContent = previewText;
            $('#analyzeForm').submit();
        }, 50);
    };
    reader.onerror = function() {
        showError('Could not read the selected file');
    };
    reader.readAsText(file, 'utf-8');
}
1559
+
1560
/**
 * Format a byte count for display.
 *
 * Values below 1024 are shown as whole bytes; larger values are shown in KB
 * (below 1048576) or MB with one decimal place.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} For example "512 bytes", "1.5 KB" or "5.0 MB".
 */
function formatFileSize(bytes) {
    const KB = 1024;
    const MB = 1048576;

    if (bytes < KB) {
        return `${bytes} bytes`;
    }
    const [divisor, unit] = bytes < MB ? [KB, 'KB'] : [MB, 'MB'];
    return `${(bytes / divisor).toFixed(1)} ${unit}`;
}
1565
+
1566
+ $('#analyzeForm').on('submit', function(e) {
1567
+ e.preventDefault();
1568
+
1569
+ if (!fileJustUploaded) {
1570
+ const textInput = $('#textInput').val();
1571
+ const fileInput = document.getElementById('fileInput');
1572
+
1573
+ if (fileInput.files.length > 0 &&
1574
+ originalTextContent !== null &&
1575
+ textInput !== originalTextContent &&
1576
+ textInput.length < originalTextContent.length * 0.8) {
1577
+ detachFile();
1578
+ }
1579
+ } else {
1580
+ fileJustUploaded = false;
1581
+ }
1582
+
1583
+ if (currentModelType === 'custom') {
1584
+ $('#customModelInputHidden').val($('#customModelInput').val());
1585
+ } else {
1586
+ $('#modelInput').val($('#modelSelect').val());
1587
+ }
1588
+
1589
+ const formData = new FormData(this);
1590
+ $('#analyzeButton').prop('disabled', true);
1591
+
1592
+ $.ajax({
1593
+ url: '/',
1594
+ method: 'POST',
1595
+ data: formData,
1596
+ processData: false,
1597
+ contentType: false,
1598
+ success: function(response) {
1599
+ if (response.error) {
1600
+ showError(response.error);
1601
+ } else {
1602
+ updateResults(response);
1603
+
1604
+ if (currentModelType === 'custom') {
1605
+ $('#modelSuccessBadge').addClass('show');
1606
+ setTimeout(() => {
1607
+ $('#modelSuccessBadge').removeClass('show');
1608
+ }, 3000);
1609
+ }
1610
+ }
1611
+ },
1612
+ error: function(xhr) {
1613
+ showError(xhr.responseText || 'An error occurred while processing the text');
1614
+ },
1615
+ complete: function() {
1616
+ $('#analyzeButton').prop('disabled', false);
1617
+ }
1618
+ });
1619
+ });
1620
+
1621
+ $('#expandButton').click(function() {
1622
+ const container = $('#tokenContainer');
1623
+ const isExpanded = container.hasClass('expanded');
1624
+
1625
+ container.toggleClass('expanded');
1626
+ $(this).text(isExpanded ? 'Show More' : 'Show Less');
1627
+ });
1628
+
1629
+ if (currentModelType === 'predefined') {
1630
+ fetchTokenizerInfo($('#modelSelect').val(), false);
1631
+ } else if ($('#customModelInput').val()) {
1632
+ fetchTokenizerInfo($('#customModelInput').val(), true);
1633
+ }
1634
+
1635
+ $('#customModelInput').on('change', function() {
1636
+ const modelValue = $(this).val();
1637
+ if (modelValue) {
1638
+ fetchTokenizerInfo(modelValue, true);
1639
+ }
1640
+ });
1641
+ });
1642
+ </script>
1643
+ </body>
1644
+ </html>
1645
+ """
1646
+
1647
@app.route('/tokenizer-info', methods=['GET'])
def tokenizer_info():
    """Return tokenizer metadata for a model without tokenizing any text.

    Query parameters:
        model_id: Identifier handed to ``load_tokenizer`` unchanged.
        is_custom: Sent by the page but not needed here; predefined and
            custom identifiers are looked up the same way.

    Returns:
        The tokenizer info as JSON on success; ``{"error": ...}`` with status
        400 when ``model_id`` is missing or ``load_tokenizer`` reports an
        error; ``{"error": ...}`` with status 500 when loading raises.
    """
    model_id = request.args.get('model_id', '')

    if not model_id:
        return jsonify({"error": "No model ID provided"}), 400

    try:
        # Only the info and the error slot of the returned triple are used.
        _tokenizer, info, error = load_tokenizer(model_id)

        if error:
            return jsonify({"error": error}), 400

        return jsonify(info)
    except Exception as e:
        return jsonify({"error": f"Failed to get tokenizer info: {str(e)}"}), 500
1672
+
1673
def _is_ajax_request():
    """Return True when the request carries the ``X-Requested-With: XMLHttpRequest`` header.

    The page's form handler posts with ``$.ajax`` and expects JSON back;
    requests without the header get the rendered HTML page.
    """
    return request.headers.get('X-Requested-With') == 'XMLHttpRequest'


def _upload_suffix(filename):
    """Return a filesystem-safe extension (such as ``".txt"``) from a client-supplied name."""
    ext = os.path.splitext(os.path.basename(filename or ""))[1]
    return "".join(ch for ch in ext if ch.isalnum() or ch == ".")[:16]


def _save_upload_to_temp(uploaded_file):
    """Store an uploaded file under a server-chosen unique name and return its path.

    The client-supplied filename is never used as a path component: it is
    untrusted (it may contain ``../``), and two simultaneous uploads with the
    same name would overwrite each other. Only a sanitized extension is kept,
    in case downstream processing inspects it.
    """
    import tempfile  # function-scope import: only the upload path needs it

    fd, file_path = tempfile.mkstemp(
        suffix=_upload_suffix(uploaded_file.filename),
        dir=app.config['UPLOAD_FOLDER'],
    )
    try:
        with os.fdopen(fd, 'wb') as tmp:
            uploaded_file.save(tmp)
    except Exception:
        # Do not leave a partial file behind when saving fails.
        if os.path.exists(file_path):
            os.remove(file_path)
        raise
    return file_path


@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the visualizer page and handle analysis requests.

    GET renders the page. POST tokenizes either an uploaded file (form field
    ``file``) or the ``text`` form field with the selected model: ``model``
    when ``model_type`` is ``'predefined'``, otherwise ``custom_model``.

    Returns:
        For AJAX POSTs, the token data as JSON, or ``{"error": ...}`` with
        status 400 when processing fails or nothing was submitted. For all
        other requests, the rendered HTML page (carrying the error message,
        if any).
    """
    text = ""
    token_data = None
    error_message = ""
    selected_model = request.args.get('model', request.form.get('model', 'llama4'))
    custom_model = request.args.get('custom_model', request.form.get('custom_model', ''))
    model_type = request.args.get('model_type', request.form.get('model_type', 'predefined'))

    model_to_use = selected_model if model_type == 'predefined' else custom_model

    if request.method == 'POST':
        uploaded_file = request.files.get('file')

        if uploaded_file is not None and uploaded_file.filename:
            file_path = _save_upload_to_temp(uploaded_file)
            try:
                # Only a preview goes back into the textarea; the statistics
                # are computed from the whole file via file_path.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    text = f.read(8096)

                try:
                    token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
                except Exception as e:
                    error_message = str(e)
            finally:
                # Remove the temporary copy on every path, including failures.
                if os.path.exists(file_path):
                    os.remove(file_path)
        else:
            text = request.form.get('text', '')
            if text:
                try:
                    token_data = process_text(text, model_to_use)
                except Exception as e:
                    error_message = str(e)

        if _is_ajax_request():
            if error_message:
                return jsonify({"error": error_message}), 400
            if token_data is None:
                # Nothing was submitted; the AJAX caller expects JSON, not the
                # HTML page.
                return jsonify({"error": "Please enter some text or upload a file to analyze"}), 400
            return jsonify(token_data)

    # token_data is still None whenever processing failed, so one render call
    # covers both the success and the error page.
    return render_template_string(
        HTML_TEMPLATE,
        text=text,
        token_data=token_data,
        models=TOKENIZER_MODELS,
        selected_model=selected_model,
        custom_model=custom_model,
        model_type=model_type,
        error=error_message
    )
1750
+
1751
if __name__ == "__main__":
    # Script entry point: listen on all interfaces, port 7860, debug mode off.
    app.run(debug=False, port=7860, host='0.0.0.0')