aiqtech commited on
Commit
89ddffd
·
verified ·
1 Parent(s): 006e855

Delete app-backup.py

Browse files
Files changed (1) hide show
  1. app-backup.py +0 -1817
app-backup.py DELETED
@@ -1,1817 +0,0 @@
1
- from transformers import AutoTokenizer
2
- from flask import Flask, request, render_template_string, jsonify
3
- import hashlib
4
- import sys
5
- import math
6
- import os
7
- import time
8
-
9
# Flask application object for this module.
app = Flask(__name__)
# Cap request bodies at 25 MB so larger file uploads are accepted but bounded.
app.config['MAX_CONTENT_LENGTH'] = 25 * 1024 * 1024

# Upload folder. exist_ok=True makes the creation idempotent and removes the
# check-then-create race that the exists()/makedirs() pair has when several
# processes start at the same time.
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
18
-
19
# Predefined tokenizer models.
# Each key is the model ID used by the application; 'name' is the Hugging Face
# repository passed to AutoTokenizer.from_pretrained and 'alias' is the
# human-readable label.
TOKENIZER_MODELS = {
    'qwen3': {
        'name': 'Qwen/Qwen3-0.6B',
        'alias': 'Qwen3',
    },
    'llama4': {
        'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
        'alias': 'Llama 4',
    },
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
        'alias': 'Mistral Small 3.1',
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B',
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1',
    },
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        # Previously labelled 'QWQ 32B', which did not match this key or repo.
        'alias': 'Qwen 2.5 72B',
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B',
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B',
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased',
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2',
    },
}
62
-
63
# Loaded tokenizers for the predefined models, keyed by model ID.
tokenizers: dict = {}
# Most recent load-failure message per model identifier.
custom_model_errors: dict = {}
# Custom tokenizers, stored as (tokenizer, load timestamp) pairs.
custom_tokenizers: dict = {}
# Tokenizer metadata, keyed by model identifier.
tokenizer_info_cache: dict = {}
# Lifetime of a custom tokenizer cache entry, in seconds (1 hour).
CACHE_EXPIRATION = 60 * 60
73
-
74
def get_tokenizer_info(tokenizer):
    """
    Extract useful information from a tokenizer.

    Args:
        tokenizer: Any object exposing some of the attributes read below
            (``vocab_size`` or ``get_vocab()``, ``model_max_length`` and the
            ``*_token`` special-token attributes).

    Returns:
        A dictionary that may contain ``vocab_size``, ``model_max_length``,
        ``tokenizer_type`` and ``special_tokens``. If extraction fails, an
        ``error`` key holds the failure message alongside whatever was
        collected before the failure.
    """
    info = {}
    try:
        # Vocabulary size: prefer the attribute, fall back to counting entries.
        if hasattr(tokenizer, 'vocab_size'):
            info['vocab_size'] = tokenizer.vocab_size
        elif hasattr(tokenizer, 'get_vocab'):
            info['vocab_size'] = len(tokenizer.get_vocab())

        # Only report a numeric, plausible maximum length. The type check keeps
        # a None/non-numeric value from raising in the comparison, which would
        # otherwise discard the tokenizer type and special tokens below.
        max_length = getattr(tokenizer, 'model_max_length', None)
        if isinstance(max_length, (int, float)) and max_length < 1000000:  # Sanity check
            info['model_max_length'] = max_length

        # Tokenizer class name.
        info['tokenizer_type'] = tokenizer.__class__.__name__

        # Special tokens that are set and not blank; each attribute is read once.
        special_tokens = {}
        for token_name in ['pad_token', 'eos_token', 'bos_token', 'sep_token',
                           'cls_token', 'unk_token', 'mask_token']:
            token_value = getattr(tokenizer, token_name, None)
            if token_value is None:
                continue
            if token_value and str(token_value).strip():
                special_tokens[token_name] = str(token_value)

        info['special_tokens'] = special_tokens

    except Exception as e:
        # Best effort: report the problem instead of failing the caller.
        info['error'] = f"Error extracting tokenizer info: {str(e)}"

    return info
108
-
109
- # The `c` string and the `exec` call below were part of the original code; they are kept here unchanged (commented out).
110
- # c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
111
- # exec(c)
112
-
113
def load_tokenizer(model_id_or_name):
    """
    Return the tokenizer for a predefined model ID or a custom Hugging Face path.

    Predefined models are loaded once and kept in ``tokenizers``. Custom models
    are kept in ``custom_tokenizers`` together with their load time and are
    reloaded once the entry is older than ``CACHE_EXPIRATION`` seconds.

    Returns:
        A ``(tokenizer, tokenizer_info, error_message)`` tuple. On success the
        error message is ``None``; on failure the tokenizer is ``None`` and the
        message is also recorded in ``custom_model_errors``.
    """
    # Start from any cached info so a failed load can still return it.
    info = tokenizer_info_cache.get(model_id_or_name, {})

    try:
        # --- Predefined model -------------------------------------------
        if model_id_or_name in TOKENIZER_MODELS:
            hf_name = TOKENIZER_MODELS[model_id_or_name]['name']
            if model_id_or_name not in tokenizers:
                tokenizers[model_id_or_name] = AutoTokenizer.from_pretrained(hf_name)
            loaded = tokenizers[model_id_or_name]

            if model_id_or_name not in tokenizer_info_cache:
                info = get_tokenizer_info(loaded)
                tokenizer_info_cache[model_id_or_name] = info

            return loaded, info, None

        # --- Custom model path ------------------------------------------
        now = time.time()
        if model_id_or_name in custom_tokenizers:
            cached, stored_at = custom_tokenizers[model_id_or_name]
            still_fresh = now - stored_at < CACHE_EXPIRATION
            if still_fresh:
                if model_id_or_name not in tokenizer_info_cache:
                    info = get_tokenizer_info(cached)
                    tokenizer_info_cache[model_id_or_name] = info
                return cached, info, None

        # Missing or expired: load it and refresh every cache for this key.
        loaded = AutoTokenizer.from_pretrained(model_id_or_name)
        custom_tokenizers[model_id_or_name] = (loaded, now)
        # A successful load supersedes any previously recorded failure.
        custom_model_errors.pop(model_id_or_name, None)

        info = get_tokenizer_info(loaded)
        tokenizer_info_cache[model_id_or_name] = info

        return loaded, info, None

    except Exception as exc:
        failure = f"Failed to load tokenizer: {str(exc)}"
        # Keep the message for future reference.
        custom_model_errors[model_id_or_name] = failure
        return None, info, failure
171
-
172
def get_varied_color(token: str) -> dict:
    """
    Derive a deterministic HSL color pair from a token.

    The MD5 hex digest of the token selects the hue (0-359), the saturation
    (70-89%) and the background lightness (80-89%), so equal tokens always get
    the same colors.

    Args:
        token: The token text to colorize.

    Returns:
        A dict with ``background`` and ``text`` CSS ``hsl(...)`` strings.
    """
    digest = hashlib.md5(token.encode()).hexdigest()
    hue = int(digest[:3], 16) % 360
    saturation = 70 + (int(digest[3:5], 16) % 20)
    lightness = 80 + (int(digest[5:7], 16) % 10)
    # The background lightness is always in 80-89%, so the text is always dark;
    # the former "lightness > 50" switch to a light text color was unreachable.
    text_lightness = 20

    return {
        'background': f'hsl({hue}, {saturation}%, {lightness}%)',
        'text': f'hsl({hue}, {saturation}%, {text_lightness}%)'
    }
184
-
185
def fix_token(token: str, tokenizer) -> str:
    """
    Decode a raw token into human-readable text before it is shown in the UI.

    The token is converted to its ID and that single ID is decoded again with
    ``tokenizer.decode``.

    Args:
        token: Raw token string.
        tokenizer: Object providing ``convert_tokens_to_ids`` and ``decode``.

    Returns:
        The decoded text. Whitespace-only tokens, and tokens for which the
        tokenizer reports no ID, are returned unchanged.
    """
    if not token.strip():
        return token

    # Look up the ID of this token (sub-word), then decode it back to text.
    token_id = tokenizer.convert_tokens_to_ids(token)
    if token_id is None:
        # No ID for this token: decoding [None] would fail, so show the raw
        # token instead.
        return token
    return tokenizer.decode([token_id], clean_up_tokenization_spaces=False)
197
-
198
def get_token_stats(tokens: list, original_text: str) -> dict:
    """
    Calculate statistics about a list of tokens.

    Args:
        tokens: Token strings.
        original_text: The text the tokens came from; only its length is used
            (characters per token).

    Returns:
        ``{}`` when ``tokens`` is empty, otherwise a dict with two sections:
        ``basic_stats`` (counts, compression ratio, unique percentage) and
        ``length_stats`` (average, standard deviation, min, max and median of
        the token lengths).
    """
    if not tokens:
        return {}

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    compression_ratio = len(original_text) / total_tokens

    # Token type analysis
    space_tokens = sum(1 for t in tokens if t.startswith('Ġ'))
    newline_tokens = sum(1 for t in tokens if 'Ċ' in t)
    special_tokens = sum(1 for t in tokens if any(c in t for c in ['<', '>', '[', ']', '{', '}']))
    punctuation_tokens = sum(1 for t in tokens if any(c in t for c in '.,!?;:()'))

    # Length distribution; the mean is computed once and reused.
    lengths = [len(t) for t in tokens]
    avg_length = sum(lengths) / total_tokens
    variance = sum((x - avg_length) ** 2 for x in lengths) / total_tokens
    std_dev = math.sqrt(variance)

    # Median: the middle value for an odd count, the mean of the two middle
    # values for an even count.
    ordered = sorted(lengths)
    mid = total_tokens // 2
    if total_tokens % 2:
        median_length = ordered[mid]
    else:
        median_length = (ordered[mid - 1] + ordered[mid]) / 2

    return {
        'basic_stats': {
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'compression_ratio': round(compression_ratio, 2),
            'space_tokens': space_tokens,
            'newline_tokens': newline_tokens,
            'special_tokens': special_tokens,
            'punctuation_tokens': punctuation_tokens,
            'unique_percentage': round(unique_tokens/total_tokens * 100, 1)
        },
        'length_stats': {
            'avg_length': round(avg_length, 2),
            'std_dev': round(std_dev, 2),
            'min_length': ordered[0],
            'max_length': ordered[-1],
            'median_length': median_length
        }
    }
239
-
240
def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """
    Tokenize text (or an uploaded file) and return display data plus statistics.

    Args:
        text: Text to tokenize; not read when both ``file_path`` and
            ``is_full_file`` are given.
        model_id_or_name: Predefined model ID or custom model path, passed to
            ``load_tokenizer``.
        is_full_file: When true, only the first 8096 characters are tokenized
            for display while the statistics cover the whole input.
        file_path: Path of a UTF-8 text file to read instead of ``text``
            (used only together with ``is_full_file``).

    Returns:
        A dict with ``tokens`` (at most 50000 display entries), ``stats``,
        ``display_limit_reached``, ``total_tokens``, ``is_full_file``,
        ``preview_only`` and ``tokenizer_info``.

    Raises:
        Exception: If the tokenizer could not be loaded.
    """
    tokenizer, tokenizer_info, error = load_tokenizer(model_id_or_name)

    if error:
        raise Exception(error)

    if file_path and is_full_file:
        # Display: tokenize only a preview read from the start of the file.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            preview_text = f.read(8096)
        display_tokens = tokenizer.tokenize(preview_text)[:50000]

        # Statistics: read the whole file in 1 MB chunks.
        # NOTE(review): chunks are tokenized independently, so a token that
        # spans a chunk boundary may be split differently than in one pass.
        total_tokens = []
        total_length = 0
        chunk_size = 1024 * 1024
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                total_tokens.extend(tokenizer.tokenize(chunk))

        # get_token_stats only needs the length of the original text, so a
        # placeholder string of the same length stands in for the file content.
        stats = get_token_stats(total_tokens, ' ' * total_length)
    else:
        # Statistics always use the full text.
        total_tokens = tokenizer.tokenize(text)

        # Display: a preview is the first 8096 characters. Tokenize again only
        # when the preview actually differs from the full text.
        if is_full_file and len(text) > 8096:
            preview_tokens = tokenizer.tokenize(text[:8096])
        else:
            preview_tokens = total_tokens
        display_tokens = preview_tokens[:50000]

        stats = get_token_stats(total_tokens, text)

    # Format tokens for display
    token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        # Human-readable (decoded) form of the token
        decoded_token = fix_token(token, tokenizer)

        # Numerical token ID from the tokenizer
        token_id = tokenizer.convert_tokens_to_ids(token)

        # A token counts as a line break when its decoded text ends with a newline
        newline_flag = decoded_token.endswith('\n')

        # Text for the UI, without that trailing newline
        display_str = decoded_token[:-1] if newline_flag else decoded_token

        token_data.append({
            'original': token,       # raw token
            'display': display_str,  # human-readable decoded token
            'colors': colors,
            'newline': newline_flag,
            'token_id': token_id,
            'token_index': idx
        })

    # Both branches leave the complete token list in total_tokens.
    total_token_count = len(total_tokens)

    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > 50000 and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file,
        'tokenizer_info': tokenizer_info  # Include tokenizer info
    }
326
-
327
- # HTML template with enhanced modern styling
328
- HTML_TEMPLATE = """
329
- <!DOCTYPE html>
330
- <html>
331
- <head>
332
- <title>Token Visualizer</title>
333
- <meta charset="UTF-8">
334
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
335
- <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
336
- <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
337
- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
338
- <style>
339
- :root {
340
- --primary-color: #0f4f9b; /* Blue accent */
341
- --primary-hover: #0c3e7a; /* Darker blue accent */
342
- --bg-color: #121212; /* Dark background */
343
- --card-bg: #1e1e1e; /* Dark card background */
344
- --card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.7),
345
- 0 2px 4px -1px rgba(0, 0, 0, 0.6);
346
- --transition: all 0.3s ease;
347
- --text-color: #E0E0E0; /* Main text color */
348
- --secondary-text: #A0A0A0;/* Secondary text color */
349
- --input-bg: #2a2a2a; /* Input/textarea background */
350
- --input-border: #444444; /* Input/textarea border */
351
- --input-focus: #0f4f9b; /* Focus border color */
352
- }
353
-
354
- * {
355
- margin: 0;
356
- padding: 0;
357
- box-sizing: border-box;
358
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
359
- scrollbar-width: thin;
360
- scrollbar-color: #0f4f9b #121212
361
- }
362
-
363
- /* Width and height of the scrollbar */
364
- ::-webkit-scrollbar {
365
- width: 12px;
366
- height: 12px;
367
- }
368
-
369
- @keyframes spin {
370
- from { transform: rotate(0deg); }
371
- to { transform: rotate(360deg); }
372
- }
373
-
374
- /* Track (background) */
375
- ::-webkit-scrollbar-track {
376
- background: #121212;
377
- border-radius: 10px;
378
- }
379
-
380
- /* Handle (draggable part) */
381
- ::-webkit-scrollbar-thumb {
382
- background: #0f4f9b;
383
- border-radius: 10px;
384
- border: 2px solid #121212;
385
- }
386
-
387
- /* Handle on hover */
388
- ::-webkit-scrollbar-thumb:hover {
389
- background: #0c3e7a;
390
- }
391
-
392
-
393
- body {
394
- background-color: var(--bg-color);
395
- padding: 2rem;
396
- min-height: 100vh;
397
- background-image:
398
- radial-gradient(circle at 20% 20%, rgba(15, 79, 155, 0.1) 0%, transparent 50%),
399
- radial-gradient(circle at 80% 80%, rgba(15, 79, 155, 0.1) 0%, transparent 50%);
400
- color: var(--text-color);
401
- }
402
-
403
- .container {
404
- max-width: 1200px;
405
- margin: 0 auto;
406
- }
407
-
408
- .header {
409
- display: flex;
410
- justify-content: space-between;
411
- align-items: center;
412
- margin-bottom: 2rem;
413
- position: relative;
414
- }
415
-
416
- .title-section {
417
- flex-grow: 1;
418
- }
419
-
420
- .title {
421
- font-size: 2.5rem;
422
- font-weight: 800;
423
- color: var(--primary-color);
424
- margin-bottom: 0.5rem;
425
- }
426
-
427
- .subtitle {
428
- color: var(--secondary-text);
429
- font-size: 1.1rem;
430
- }
431
-
432
- .model-selector {
433
- position: relative;
434
- min-width: 200px;
435
- }
436
-
437
- .model-selector-header {
438
- display: flex;
439
- gap: 0.5rem;
440
- margin-bottom: 0.5rem;
441
- }
442
-
443
- .model-type-toggle {
444
- display: flex;
445
- background-color: var(--card-bg);
446
- border-radius: 0.5rem;
447
- padding: 0.25rem;
448
- overflow: hidden;
449
- }
450
-
451
- .toggle-option {
452
- padding: 0.5rem 0.75rem;
453
- font-size: 0.8rem;
454
- font-weight: 500;
455
- cursor: pointer;
456
- transition: var(--transition);
457
- border-radius: 0.375rem;
458
- color: var(--secondary-text);
459
- }
460
-
461
- .toggle-option.active {
462
- background-color: var(--primary-color);
463
- color: white;
464
- }
465
-
466
- select {
467
- width: 100%;
468
- padding: 0.75rem 1rem;
469
- border: 2px solid var(--input-border);
470
- border-radius: 0.5rem;
471
- font-size: 1rem;
472
- color: var(--text-color);
473
- background-color: var(--input-bg);
474
- cursor: pointer;
475
- transition: var(--transition);
476
- appearance: none;
477
- background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%230f4f9b'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E");
478
- background-repeat: no-repeat;
479
- background-position: right 1rem center;
480
- background-size: 1.5rem;
481
- }
482
-
483
- select:hover, .custom-model-input:hover {
484
- border-color: var(--primary-color);
485
- }
486
-
487
- select:focus, .custom-model-input:focus {
488
- outline: none;
489
- border-color: var(--primary-color);
490
- box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
491
- }
492
-
493
- .custom-model-input {
494
- width: 100%;
495
- padding: 0.75rem 1rem;
496
- border: 2px solid var(--input-border);
497
- border-radius: 0.5rem;
498
- font-size: 1rem;
499
- color: var(--text-color);
500
- background-color: var(--input-bg);
501
- transition: var(--transition);
502
- }
503
-
504
- .input-section {
505
- margin-bottom: 2rem;
506
- }
507
-
508
- textarea {
509
- width: 100%;
510
- height: 150px;
511
- padding: 1.25rem;
512
- border: 2px solid var(--input-border);
513
- border-radius: 0.75rem;
514
- resize: vertical;
515
- font-size: 1rem;
516
- margin-bottom: 1rem;
517
- transition: var(--transition);
518
- background-color: var(--input-bg);
519
- color: var(--text-color);
520
- }
521
-
522
- textarea:focus {
523
- outline: none;
524
- border-color: var(--input-focus);
525
- box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
526
- }
527
-
528
- .button-container {
529
- display: flex;
530
- justify-content: center;
531
- width: 100%;
532
- gap: 1rem;
533
- }
534
-
535
- button {
536
- padding: 0.875rem 2.5rem;
537
- background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
538
- color: #fff;
539
- border: none;
540
- border-radius: 0.75rem;
541
- font-size: 1.1rem;
542
- font-weight: 600;
543
- cursor: pointer;
544
- transition: var(--transition);
545
- box-shadow: 0 4px 6px -1px rgba(15, 79, 155, 0.2);
546
- }
547
-
548
- button:hover {
549
- transform: translateY(-2px);
550
- box-shadow: 0 6px 8px -1px rgba(15, 79, 155, 0.3);
551
- }
552
-
553
- button:active {
554
- transform: translateY(0);
555
- }
556
-
557
- button:disabled {
558
- opacity: 0.7;
559
- cursor: not-allowed;
560
- }
561
-
562
- .card {
563
- background-color: var(--card-bg);
564
- border-radius: 1rem;
565
- box-shadow: var(--card-shadow);
566
- padding: 1.5rem;
567
- margin-bottom: 2rem;
568
- transition: var(--transition);
569
- }
570
-
571
- .card:hover {
572
- transform: translateY(-2px);
573
- box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
574
- }
575
-
576
- .card-title {
577
- font-size: 1.25rem;
578
- font-weight: 700;
579
- color: var(--text-color);
580
- margin-bottom: 1.25rem;
581
- display: flex;
582
- align-items: center;
583
- gap: 0.5rem;
584
- cursor: pointer;
585
- }
586
-
587
- .card-title::before {
588
- content: '';
589
- display: block;
590
- width: 4px;
591
- height: 1.25rem;
592
- background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
593
- border-radius: 2px;
594
- }
595
-
596
- .token-container {
597
- display: flex;
598
- flex-wrap: wrap;
599
- gap: 0.375rem;
600
- margin-bottom: 1rem;
601
- padding: 1rem;
602
- background-color: #2a2a2a;
603
- border-radius: 0.5rem;
604
- max-height: 200px;
605
- overflow-y: auto;
606
- transition: max-height 0.3s ease;
607
- }
608
-
609
- .token-container.expanded {
610
- max-height: none;
611
- }
612
-
613
- .token {
614
- padding: 0.375rem 0.75rem;
615
- border-radius: 0.375rem;
616
- background-color: var(--input-bg);
617
- font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
618
- font-size: 0.875rem;
619
- color: var(--text-color);
620
- cursor: default;
621
- transition: var(--transition);
622
- box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
623
- }
624
-
625
- .token:hover {
626
- transform: translateY(-1px);
627
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
628
- }
629
-
630
- .stats-grid {
631
- display: grid;
632
- grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
633
- gap: 1.5rem;
634
- margin-bottom: 2rem;
635
- }
636
-
637
- .stat-card {
638
- background-color: var(--card-bg);
639
- padding: 1.5rem;
640
- border-radius: 1rem;
641
- box-shadow: var(--card-shadow);
642
- transition: var(--transition);
643
- }
644
-
645
- .stat-card:hover {
646
- transform: translateY(-2px);
647
- box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
648
- }
649
-
650
- .stat-title {
651
- color: var(--secondary-text);
652
- font-size: 0.875rem;
653
- font-weight: 500;
654
- margin-bottom: 0.5rem;
655
- text-transform: uppercase;
656
- letter-spacing: 0.05em;
657
- }
658
-
659
- .stat-value {
660
- color: var(--text-color);
661
- font-size: 2rem;
662
- font-weight: 700;
663
- line-height: 1.2;
664
- margin-bottom: 0.25rem;
665
- }
666
-
667
- .stat-description {
668
- color: var(--secondary-text);
669
- font-size: 0.875rem;
670
- }
671
-
672
- .expand-button {
673
- background: none;
674
- border: none;
675
- color: var(--primary-color);
676
- font-size: 0.875rem;
677
- padding: 0.5rem;
678
- cursor: pointer;
679
- display: block;
680
- margin: 0 auto;
681
- box-shadow: none;
682
- }
683
-
684
- .expand-button:hover {
685
- text-decoration: underline;
686
- transform: none;
687
- box-shadow: none;
688
- }
689
-
690
- .error-message {
691
- color: #EF4444;
692
- background-color: #3a1f1f;
693
- border: 1px solid #562626;
694
- padding: 1rem;
695
- border-radius: 0.5rem;
696
- margin-bottom: 1rem;
697
- display: none;
698
- }
699
-
700
- .display-limit-notice {
701
- background-color: #4b2b07;
702
- border: 1px solid #7c4a02;
703
- color: #FFD591;
704
- padding: 0.75rem;
705
- border-radius: 0.5rem;
706
- margin-top: 1rem;
707
- font-size: 0.875rem;
708
- display: none;
709
- }
710
-
711
- /* File drop zone styles */
712
- .file-drop-zone {
713
- position: fixed;
714
- top: 0;
715
- left: 0;
716
- width: 100%;
717
- height: 100%;
718
- background-color: rgba(15, 79, 155, 0.15);
719
- z-index: 1000;
720
- display: flex;
721
- justify-content: center;
722
- align-items: center;
723
- opacity: 0;
724
- pointer-events: none;
725
- transition: opacity 0.3s ease;
726
- }
727
-
728
- .file-drop-zone.active {
729
- opacity: 1;
730
- pointer-events: all;
731
- }
732
-
733
- .drop-indicator {
734
- background-color: var(--card-bg);
735
- border: 2px dashed var(--primary-color);
736
- border-radius: 1rem;
737
- padding: 2rem;
738
- text-align: center;
739
- width: 60%;
740
- max-width: 400px;
741
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25);
742
- animation: pulse 2s infinite;
743
- }
744
-
745
- @keyframes pulse {
746
- 0% { transform: scale(1); }
747
- 50% { transform: scale(1.05); }
748
- 100% { transform: scale(1); }
749
- }
750
-
751
- .drop-indicator p {
752
- margin-bottom: 0.5rem;
753
- color: var(--text-color);
754
- font-size: 1.2rem;
755
- }
756
-
757
- .file-icon {
758
- font-size: 3rem;
759
- margin-bottom: 1rem;
760
- color: var(--primary-color);
761
- }
762
-
763
- .file-upload-icon {
764
- position: fixed;
765
- bottom: 20px;
766
- left: 20px;
767
- width: 45px;
768
- height: 45px;
769
- background-color: var(--card-bg);
770
- border-radius: 50%;
771
- display: flex;
772
- justify-content: center;
773
- align-items: center;
774
- cursor: pointer;
775
- z-index: 100;
776
- box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
777
- transition: transform 0.2s ease, box-shadow 0.2s ease;
778
- }
779
-
780
- .file-upload-icon:hover {
781
- transform: translateY(-2px);
782
- box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
783
- }
784
-
785
- .file-upload-icon span {
786
- font-size: 1.5rem;
787
- color: var(--primary-color);
788
- }
789
-
790
- .file-info {
791
- position: fixed;
792
- bottom: 20px;
793
- left: 75px;
794
- background-color: var(--card-bg);
795
- color: var(--primary-color);
796
- font-weight: 500;
797
- padding: 0.5rem 1rem;
798
- border-radius: 1rem;
799
- box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
800
- max-width: 270px;
801
- white-space: nowrap;
802
- overflow: hidden;
803
- text-overflow: ellipsis;
804
- z-index: 100;
805
- display: none;
806
- }
807
-
808
- .file-detach {
809
- margin-left: 8px;
810
- display: inline-block;
811
- width: 18px;
812
- height: 18px;
813
- background-color: rgba(255, 255, 255, 0.1);
814
- color: var(--text-color);
815
- border-radius: 50%;
816
- text-align: center;
817
- line-height: 16px;
818
- font-size: 12px;
819
- cursor: pointer;
820
- transition: all 0.2s ease;
821
- }
822
-
823
- .file-detach:hover {
824
- background-color: rgba(255, 0, 0, 0.2);
825
- color: #ff6b6b;
826
- transform: scale(1.1);
827
- }
828
-
829
- .preview-notice {
830
- background-color: #273c56;
831
- border: 1px solid #365a82;
832
- color: #89b4e8;
833
- padding: 0.75rem;
834
- border-radius: 0.5rem;
835
- margin-top: 1rem;
836
- font-size: 0.875rem;
837
- display: none;
838
- }
839
-
840
- .custom-model-wrapper {
841
- position: relative;
842
- }
843
-
844
- .model-badge {
845
- position: absolute;
846
- top: -10px;
847
- right: -5px;
848
- background: linear-gradient(135deg, #22c55e 0%, #15803d 100%);
849
- color: white;
850
- font-size: 0.7rem;
851
- font-weight: 700;
852
- padding: 0.25rem 0.5rem;
853
- border-radius: 999px;
854
- transform: scale(0);
855
- transition: transform 0.3s cubic-bezier(0.175, 0.885, 0.32, 1.275);
856
- box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
857
- z-index: 10;
858
- }
859
-
860
- .model-badge.show {
861
- transform: scale(1);
862
- }
863
-
864
- .custom-model-help {
865
- display: inline-block;
866
- width: 16px;
867
- height: 16px;
868
- line-height: 16px;
869
- font-size: 11px;
870
- font-weight: bold;
871
- text-align: center;
872
- background-color: var(--secondary-text);
873
- color: var(--card-bg);
874
- border-radius: 50%;
875
- margin-left: 5px;
876
- cursor: help;
877
- vertical-align: middle;
878
- }
879
-
880
- .tooltip {
881
- position: absolute;
882
- top: 100%;
883
- left: 0;
884
- width: 280px;
885
- background-color: #333;
886
- color: #fff;
887
- padding: 0.75rem;
888
- border-radius: 0.5rem;
889
- font-size: 0.8rem;
890
- margin-top: 0.5rem;
891
- z-index: 100;
892
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
893
- opacity: 0;
894
- visibility: hidden;
895
- transition: opacity 0.2s, visibility 0.2s;
896
- }
897
-
898
- .custom-model-help:hover + .tooltip {
899
- opacity: 1;
900
- visibility: visible;
901
- }
902
-
903
- /* Tokenizer info icon and tooltip styles */
904
- .tokenizer-info-icon {
905
- display: inline-flex;
906
- align-items: center;
907
- justify-content: center;
908
- width: 24px;
909
- height: 24px;
910
- background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
911
- color: white;
912
- border-radius: 50%;
913
- position: absolute;
914
- left: -32px; /* Position to the left of the selector */
915
- top: 50%;
916
- transform: translateY(-50%);
917
- cursor: pointer;
918
- font-size: 12px;
919
- font-weight: bold;
920
- transition: all 0.2s ease;
921
- z-index: 10;
922
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
923
- }
924
-
925
- .tokenizer-info-icon:hover {
926
- transform: translateY(-50%) scale(1.1);
927
- box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
928
- }
929
-
930
- /* Watermark styles */
931
- .watermark {
932
- position: fixed;
933
- bottom: 20px;
934
- right: 20px;
935
- color: var(--primary-color);
936
- font-size: 1.4rem;
937
- font-weight: 700;
938
- opacity: 0.25; /* Semi-transparent */
939
- z-index: 100;
940
- transition: opacity 0.3s ease;
941
- text-decoration: none;
942
- pointer-events: auto; /* Ensure it remains clickable */
943
- }
944
-
945
- .watermark:hover {
946
- opacity: 0.6; /* Increase opacity on hover */
947
- }
948
-
949
- .tokenizer-info-tooltip {
950
- position: absolute;
951
- top: calc(100% + 8px);
952
- left: -30px; /* Adjust position to align with the icon */
953
- width: 300px;
954
- background-color: var(--card-bg);
955
- color: var(--text-color);
956
- border: 1px solid var(--primary-color);
957
- border-radius: 0.75rem;
958
- box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
959
- padding: 1rem;
960
- z-index: 1000; /* Increase z-index to ensure visibility */
961
- opacity: 0;
962
- visibility: hidden;
963
- transition: opacity 0.3s, visibility 0.3s;
964
- pointer-events: none; /* Initially disable pointer events */
965
- }
966
-
967
- .tokenizer-info-icon:not(.tooltip-disabled):hover + .tokenizer-info-tooltip {
968
- opacity: 1;
969
- visibility: visible;
970
- pointer-events: auto;
971
- }
972
-
973
- .tokenizer-info-tooltip:hover {
974
- opacity: 1;
975
- visibility: visible;
976
- pointer-events: auto;
977
- }
978
-
979
- .tokenizer-info-header {
980
- font-size: 1.1rem;
981
- font-weight: 600;
982
- margin-bottom: 0.5rem;
983
- padding-bottom: 0.5rem;
984
- border-bottom: 1px solid rgba(255, 255, 255, 0.1);
985
- color: var(--primary-color);
986
- }
987
-
988
- .tokenizer-info-grid {
989
- display: grid;
990
- grid-template-columns: repeat(2, 1fr);
991
- gap: 0.75rem;
992
- margin: 0.75rem 0;
993
- }
994
-
995
- .tokenizer-info-item {
996
- display: flex;
997
- flex-direction: column;
998
- }
999
-
1000
- .tokenizer-info-label {
1001
- font-size: 0.75rem;
1002
- color: var(--secondary-text);
1003
- margin-bottom: 0.25rem;
1004
- }
1005
-
1006
- .tokenizer-info-value {
1007
- font-size: 0.95rem;
1008
- font-weight: 500;
1009
- }
1010
-
1011
- .special-tokens-container {
1012
- margin-top: 0.75rem;
1013
- background-color: rgba(15, 79, 155, 0.1);
1014
- border-radius: 0.5rem;
1015
- padding: 0.5rem;
1016
- max-height: 100px;
1017
- overflow-y: auto;
1018
- }
1019
-
1020
- .special-token-item {
1021
- display: flex;
1022
- justify-content: space-between;
1023
- margin-bottom: 0.25rem;
1024
- font-size: 0.8rem;
1025
- }
1026
-
1027
- .token-name {
1028
- color: var(--secondary-text);
1029
- }
1030
-
1031
- .token-value {
1032
- background-color: rgba(255, 255, 255, 0.1);
1033
- padding: 1px 4px;
1034
- border-radius: 2px;
1035
- font-family: monospace;
1036
- }
1037
-
1038
- .tokenizer-info-loading {
1039
- display: flex;
1040
- justify-content: center;
1041
- align-items: center;
1042
- height: 100px;
1043
- }
1044
-
1045
- .tokenizer-info-spinner {
1046
- width: 30px;
1047
- height: 30px;
1048
- border: 3px solid var(--primary-color);
1049
- border-radius: 50%;
1050
- border-top-color: transparent;
1051
- animation: spin 1s linear infinite;
1052
- }
1053
-
1054
- .tokenizer-info-error {
1055
- color: #f87171;
1056
- font-size: 0.9rem;
1057
- text-align: center;
1058
- padding: 1rem;
1059
- }
1060
-
1061
- @media (max-width: 768px) {
1062
- .header {
1063
- flex-direction: column;
1064
- align-items: stretch;
1065
- gap: 1rem;
1066
- }
1067
-
1068
- .model-selector {
1069
- width: 100%;
1070
- }
1071
-
1072
- .stats-grid {
1073
- grid-template-columns: 1fr;
1074
- }
1075
-
1076
- .tokenizer-info-tooltip {
1077
- width: 250px;
1078
- }
1079
- }
1080
- </style>
1081
- </head>
1082
- <body>
1083
- <!-- Hidden File Drop Zone that appears when dragging files -->
1084
- <div id="fileDropZone" class="file-drop-zone">
1085
- <div class="drop-indicator">
1086
- <div class="file-icon">📄</div>
1087
- <p>Drop your file here</p>
1088
- </div>
1089
- </div>
1090
-
1091
- <!-- File upload icon in bottom left corner -->
1092
- <div id="fileUploadIcon" class="file-upload-icon">
1093
- <span>📎</span>
1094
- </div>
1095
- <p class="file-info" id="fileInfo"></p>
1096
-
1097
- <div class="container">
1098
- <div class="header">
1099
- <div class="title-section">
1100
- <h1 class="title">Token Visualizer</h1>
1101
- <p class="subtitle">Advanced tokenization analysis and visualization</p>
1102
- </div>
1103
- <div class="model-selector">
1104
- <div class="model-selector-header">
1105
- <div class="model-type-toggle">
1106
- <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
1107
- <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
1108
- </div>
1109
- </div>
1110
- <div id="predefinedModelSelector">
1111
- <div style="position: relative;">
1112
- <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
1113
- <!-- TOOLTIP MOVED HERE -->
1114
- <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
1115
- <div id="tokenizerInfoContent">
1116
- <div class="tokenizer-info-loading">
1117
- <div class="tokenizer-info-spinner"></div>
1118
- </div>
1119
- </div>
1120
- </div>
1121
- <!-- SELECT NOW COMES AFTER ICON AND TOOLTIP -->
1122
- <select id="modelSelect" name="model">
1123
- {% for model_id, info in models.items() %}
1124
- <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
1125
- {{ info.alias }}
1126
- </option>
1127
- {% endfor %}
1128
- </select>
1129
- </div>
1130
- </div>
1131
- <div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
1132
- <div style="position: relative;">
1133
- <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
1134
- <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
1135
- <div id="customTokenizerInfoContent">
1136
- <div class="tokenizer-info-loading">
1137
- <div class="tokenizer-info-spinner"></div>
1138
- </div>
1139
- </div>
1140
- </div>
1141
- <input type="text" id="customModelInput" class="custom-model-input"
1142
- placeholder="Enter HuggingFace model path"
1143
- value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
1144
- </div>
1145
- <span class="custom-model-help">?</span>
1146
- <div class="tooltip">
1147
- Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3").
1148
- For Korean, you might use "beomi/KoAlpaca-Polyglot-12.8B" or "skt/kogpt2-base-v2", etc.
1149
- The model must have a tokenizer available and be accessible.
1150
- </div>
1151
- <div class="model-badge" id="modelSuccessBadge">Loaded</div>
1152
- </div>
1153
- </div>
1154
- </div>
1155
-
1156
- <div class="error-message" id="errorMessage">{{ error }}</div>
1157
-
1158
- <div class="input-section">
1159
- <form id="analyzeForm" method="POST" enctype="multipart/form-data">
1160
- <textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
1161
- <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
1162
- <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
1163
- <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
1164
- <input type="file" name="file" id="fileInput" style="display: none;">
1165
- <div class="button-container">
1166
- <button type="submit" id="analyzeButton">Analyze Text</button>
1167
- </div>
1168
- </form>
1169
- </div>
1170
-
1171
- <div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
1172
- <div class="card">
1173
- <h2 class="card-title">Token Visualization</h2>
1174
- <div class="preview-notice" id="previewNotice">
1175
- Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
1176
- </div>
1177
- <div class="token-container" id="tokenContainer">
1178
- {% if token_data %}
1179
- {% for token in token_data.tokens %}
1180
- <span class="token"
1181
- style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
1182
- title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
1183
- {{ token.display }}
1184
- </span>
1185
- {% if token.newline %}<br>{% endif %}
1186
- {% endfor %}
1187
- {% endif %}
1188
- </div>
1189
- <button class="expand-button" id="expandButton">Show More</button>
1190
- <div class="display-limit-notice" id="displayLimitNotice">
1191
- Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
1192
- </div>
1193
- </div>
1194
-
1195
- <div class="stats-grid">
1196
- <div class="stat-card">
1197
- <div class="stat-title">Total Tokens</div>
1198
- <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
1199
- <div class="stat-description">
1200
- <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
1201
- (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
1202
- </div>
1203
- </div>
1204
- <div class="stat-card">
1205
- <div class="stat-title">Token Types</div>
1206
- <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
1207
- <div class="stat-description">special tokens</div>
1208
- </div>
1209
- <div class="stat-card">
1210
- <div class="stat-title">Whitespace</div>
1211
- <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
1212
- <div class="stat-description">
1213
- spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
1214
- newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
1215
- </div>
1216
- </div>
1217
- <div class="stat-card">
1218
- <div class="stat-title">Token Length</div>
1219
- <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
1220
- <div class="stat-description">
1221
- median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
1222
- ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
1223
- </div>
1224
- </div>
1225
- <div class="stat-card">
1226
- <div class="stat-title">Compression</div>
1227
- <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
1228
- <div class="stat-description">characters per token</div>
1229
- </div>
1230
- </div>
1231
- </div>
1232
- </div>
1233
- <a href="https://huggingface.co/spaces/barttee/tokenizers" target="_blank" class="watermark">
1234
- @barttee/tokenizers
1235
- </a>
1236
-
1237
- <script>
1238
- $(document).ready(function() {
1239
- // File handling variables
1240
- let currentFile = null;
1241
- let originalTextContent = null;
1242
- let lastUploadedFileName = null;
1243
- let fileJustUploaded = false; // Flag to prevent immediate detachment
1244
- let currentModelType = "{{ model_type if model_type else 'predefined' }}";
1245
- let currentTokenizerInfo = null;
1246
-
1247
- // Try to parse tokenizer info if available from server
1248
- try {
1249
- currentTokenizerInfo = {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }};
1250
- if (currentTokenizerInfo) {
1251
- updateTokenizerInfoDisplay(currentTokenizerInfo, currentModelType === 'custom');
1252
- }
1253
- } catch(e) {
1254
- console.error("Error parsing tokenizer info:", e);
1255
- }
1256
-
1257
- // Show error if exists
1258
- if ("{{ error }}".length > 0) {
1259
- showError("{{ error }}");
1260
- }
1261
-
1262
- // Setup model type based on initial state
1263
- if (currentModelType === "custom") {
1264
- $('.toggle-option').removeClass('active');
1265
- $('.custom-toggle').addClass('active');
1266
- $('#predefinedModelSelector').hide();
1267
- $('#customModelSelector').show();
1268
- }
1269
-
1270
- // Show success badge if custom model loaded successfully
1271
- if (currentModelType === "custom" && !("{{ error }}".length > 0)) {
1272
- $('#modelSuccessBadge').addClass('show');
1273
- setTimeout(() => {
1274
- $('#modelSuccessBadge').removeClass('show');
1275
- }, 3000);
1276
- }
1277
-
1278
- // Toggle between predefined and custom model inputs
1279
- $('.toggle-option').click(function() {
1280
- const modelType = $(this).data('type');
1281
- $('.toggle-option').removeClass('active');
1282
- $(this).addClass('active');
1283
- currentModelType = modelType;
1284
-
1285
- if (modelType === 'predefined') {
1286
- $('#predefinedModelSelector').show();
1287
- $('#customModelSelector').hide();
1288
- $('#modelTypeInput').val('predefined');
1289
- // Set the model input value to the selected predefined model
1290
- $('#modelInput').val($('#modelSelect').val());
1291
- } else {
1292
- $('#predefinedModelSelector').hide();
1293
- $('#customModelSelector').show();
1294
- $('#modelTypeInput').val('custom');
1295
- }
1296
-
1297
- // Clear tokenizer info if switching models
1298
- if (modelType === 'predefined') {
1299
- $('#tokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1300
- fetchTokenizerInfo($('#modelSelect').val(), false);
1301
- } else {
1302
- $('#customTokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1303
- // Only fetch if there's a custom model value
1304
- const customModel = $('#customModelInput').val();
1305
- if (customModel) {
1306
- fetchTokenizerInfo(customModel, true);
1307
- }
1308
- }
1309
- });
1310
-
1311
- // Update hidden input when custom model input changes
1312
- $('#customModelInput').on('input', function() {
1313
- $('#customModelInputHidden').val($(this).val());
1314
- });
1315
-
1316
- function showError(message) {
1317
- const errorDiv = $('#errorMessage');
1318
- errorDiv.text(message);
1319
- errorDiv.show();
1320
- setTimeout(() => errorDiv.fadeOut(), 5000);
1321
- }
1322
-
1323
- // Function to update tokenizer info display in tooltip
1324
- function updateTokenizerInfoDisplay(info, isCustom = false) {
1325
- const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
1326
- let htmlContent = '';
1327
-
1328
- if (info.error) {
1329
- $(targetSelector).html(`<div class="tokenizer-info-error">${info.error}</div>`);
1330
- return;
1331
- }
1332
-
1333
- // Start building the tooltip content
1334
- htmlContent = `<div class="tokenizer-info-header">Tokenizer Details</div>
1335
- <div class="tokenizer-info-grid">`;
1336
-
1337
- // Dictionary size
1338
- if (info.vocab_size) {
1339
- htmlContent += `
1340
- <div class="tokenizer-info-item">
1341
- <span class="tokenizer-info-label">Dictionary Size</span>
1342
- <span class="tokenizer-info-value">${info.vocab_size.toLocaleString()}</span>
1343
- </div>`;
1344
- }
1345
-
1346
- // Tokenizer type
1347
- if (info.tokenizer_type) {
1348
- htmlContent += `
1349
- <div class="tokenizer-info-item">
1350
- <span class="tokenizer-info-label">Tokenizer Type</span>
1351
- <span class="tokenizer-info-value">${info.tokenizer_type}</span>
1352
- </div>`;
1353
- }
1354
-
1355
- // Max length
1356
- if (info.model_max_length) {
1357
- htmlContent += `
1358
- <div class="tokenizer-info-item">
1359
- <span class="tokenizer-info-label">Max Length</span>
1360
- <span class="tokenizer-info-value">${info.model_max_length.toLocaleString()}</span>
1361
- </div>`;
1362
- }
1363
-
1364
- htmlContent += `</div>`; // Close tokenizer-info-grid
1365
-
1366
- // Special tokens section
1367
- if (info.special_tokens && Object.keys(info.special_tokens).length > 0) {
1368
- htmlContent += `
1369
- <div class="tokenizer-info-item" style="margin-top: 0.75rem;">
1370
- <span class="tokenizer-info-label">Special Tokens</span>
1371
- <div class="special-tokens-container">`;
1372
-
1373
- // Add each special token
1374
- for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
1375
- // Properly escape HTML special characters
1376
- const escapedValue = tokenValue
1377
- .replace(/&/g, '&amp;')
1378
- .replace(/</g, '&lt;')
1379
- .replace(/>/g, '&gt;')
1380
- .replace(/"/g, '&quot;')
1381
- .replace(/'/g, '&#039;');
1382
-
1383
- htmlContent += `
1384
- <div class="special-token-item">
1385
- <span class="token-name">${tokenName}:</span>
1386
- <span class="token-value">${escapedValue}</span>
1387
- </div>`;
1388
- }
1389
-
1390
- htmlContent += `
1391
- </div>
1392
- </div>`;
1393
- }
1394
-
1395
- $(targetSelector).html(htmlContent);
1396
- }
1397
-
1398
- // Function to fetch tokenizer info
1399
- function fetchTokenizerInfo(modelId, isCustom = false) {
1400
- if (!modelId) return;
1401
-
1402
- const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';
1403
- $(targetSelector).html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
1404
-
1405
- $.ajax({
1406
- url: '/tokenizer-info',
1407
- method: 'GET',
1408
- data: {
1409
- model_id: modelId,
1410
- is_custom: isCustom
1411
- },
1412
- success: function(response) {
1413
- if (response.error) {
1414
- $(targetSelector).html(`<div class="tokenizer-info-error">${response.error}</div>`);
1415
- } else {
1416
- currentTokenizerInfo = response;
1417
- updateTokenizerInfoDisplay(response, isCustom);
1418
- }
1419
- },
1420
- error: function(xhr) {
1421
- $(targetSelector).html('<div class="tokenizer-info-error">Failed to load tokenizer information</div>');
1422
- }
1423
- });
1424
- }
1425
-
1426
- function updateResults(data) {
1427
- $('#results').show();
1428
-
1429
- // Update tokens
1430
- const tokenContainer = $('#tokenContainer');
1431
- tokenContainer.empty();
1432
- data.tokens.forEach(token => {
1433
- const span = $('<span>')
1434
- .addClass('token')
1435
- .css({
1436
- 'background-color': token.colors.background,
1437
- 'color': token.colors.text
1438
- })
1439
- // Include token id in the tooltip on hover
1440
- .attr('title', `Original token: ${token.original} | Token ID: ${token.token_id}`)
1441
- .text(token.display);
1442
-
1443
- tokenContainer.append(span);
1444
- if (token.newline) {
1445
- tokenContainer.append('<br>');
1446
- }
1447
- });
1448
-
1449
- // Update display limit notice
1450
- if (data.display_limit_reached) {
1451
- $('#displayLimitNotice').show();
1452
- $('#totalTokenCount').text(data.total_tokens);
1453
- } else {
1454
- $('#displayLimitNotice').hide();
1455
- }
1456
-
1457
- // Update preview notice
1458
- if (data.preview_only) {
1459
- $('#previewNotice').show();
1460
- } else {
1461
- $('#previewNotice').hide();
1462
- }
1463
-
1464
- // Update basic stats
1465
- $('#totalTokens').text(data.stats.basic_stats.total_tokens);
1466
- $('#uniqueTokens').text(`${data.stats.basic_stats.unique_tokens} unique`);
1467
- $('#uniquePercentage').text(data.stats.basic_stats.unique_percentage);
1468
- $('#specialTokens').text(data.stats.basic_stats.special_tokens);
1469
- $('#spaceTokens').text(data.stats.basic_stats.space_tokens);
1470
- $('#spaceCount').text(data.stats.basic_stats.space_tokens);
1471
- $('#newlineCount').text(data.stats.basic_stats.newline_tokens);
1472
- $('#compressionRatio').text(data.stats.basic_stats.compression_ratio);
1473
-
1474
- // Update length stats
1475
- $('#avgLength').text(data.stats.length_stats.avg_length);
1476
- $('#medianLength').text(data.stats.length_stats.median_length);
1477
- $('#stdDev').text(data.stats.length_stats.std_dev);
1478
-
1479
- // Update tokenizer info if available
1480
- if (data.tokenizer_info) {
1481
- currentTokenizerInfo = data.tokenizer_info;
1482
- updateTokenizerInfoDisplay(data.tokenizer_info, currentModelType === 'custom');
1483
- }
1484
- }
1485
-
1486
- // Handle text changes to detach file
1487
- $('#textInput').on('input', function() {
1488
- if (fileJustUploaded) {
1489
- fileJustUploaded = false;
1490
- return;
1491
- }
1492
-
1493
- const currentText = $(this).val();
1494
- const fileInput = document.getElementById('fileInput');
1495
-
1496
- if (fileInput.files.length > 0 && originalTextContent !== null) {
1497
- const isMajorChange =
1498
- currentText.length < originalTextContent.length * 0.8 ||
1499
- (currentText.length > 0 &&
1500
- currentText !== originalTextContent.substring(0, currentText.length) &&
1501
- currentText.substring(0, Math.min(20, currentText.length)) !==
1502
- originalTextContent.substring(0, Math.min(20, originalTextContent.length)));
1503
-
1504
- if (isMajorChange) {
1505
- detachFile();
1506
- }
1507
- }
1508
- });
1509
-
1510
- function detachFile() {
1511
- // Clear the file input
1512
- $('#fileInput').val('');
1513
- // Hide file info
1514
- $('#fileInfo').fadeOut(300);
1515
- // Reset the original content tracker
1516
- originalTextContent = $('#textInput').val();
1517
- // Reset last uploaded filename
1518
- lastUploadedFileName = null;
1519
- }
1520
-
1521
- // For model changes
1522
- $('#modelSelect').change(function() {
1523
- const selectedModel = $(this).val();
1524
- $('#modelInput').val(selectedModel);
1525
-
1526
- // Fetch tokenizer info for the selected model
1527
- fetchTokenizerInfo(selectedModel, false);
1528
-
1529
- // If text exists, submit the form
1530
- if ($('#textInput').val().trim()) {
1531
- $('#analyzeForm').submit();
1532
- }
1533
- });
1534
-
1535
- // File drop handling
1536
- const fileDropZone = $('#fileDropZone');
1537
- const fileUploadIcon = $('#fileUploadIcon');
1538
-
1539
- ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
1540
- fileDropZone[0].addEventListener(eventName, preventDefaults, false);
1541
- document.body.addEventListener(eventName, preventDefaults, false);
1542
- });
1543
-
1544
- function preventDefaults(e) {
1545
- e.preventDefault();
1546
- e.stopPropagation();
1547
- }
1548
-
1549
- document.addEventListener('dragenter', showDropZone, false);
1550
- document.addEventListener('dragover', showDropZone, false);
1551
-
1552
- fileDropZone[0].addEventListener('dragleave', hideDropZone, false);
1553
- fileDropZone[0].addEventListener('drop', hideDropZone, false);
1554
-
1555
- function showDropZone(e) {
1556
- fileDropZone.addClass('active');
1557
- }
1558
-
1559
- function hideDropZone() {
1560
- fileDropZone.removeClass('active');
1561
- }
1562
-
1563
- fileDropZone[0].addEventListener('drop', handleDrop, false);
1564
-
1565
- fileUploadIcon.on('click', function() {
1566
- const input = document.createElement('input');
1567
- input.type = 'file';
1568
- input.onchange = e => {
1569
- handleFiles(e.target.files);
1570
- };
1571
- input.click();
1572
- });
1573
-
1574
- function handleFiles(files) {
1575
- if (files.length) {
1576
- const file = files[0];
1577
- currentFile = file;
1578
- lastUploadedFileName = file.name;
1579
- fileJustUploaded = true;
1580
-
1581
- $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300);
1582
-
1583
- $('#fileDetach').on('click', function(e) {
1584
- e.stopPropagation();
1585
- detachFile();
1586
- return false;
1587
- });
1588
-
1589
- const dataTransfer = new DataTransfer();
1590
- dataTransfer.items.add(file);
1591
- document.getElementById('fileInput').files = dataTransfer.files;
1592
-
1593
- const reader = new FileReader();
1594
- reader.onload = function(e) {
1595
- const previewText = e.target.result.slice(0, 8096);
1596
- $('#textInput').val(previewText);
1597
-
1598
- setTimeout(() => {
1599
- originalTextContent = previewText;
1600
- $('#analyzeForm').submit();
1601
- }, 50);
1602
- };
1603
- reader.readAsText(file, 'utf-8');
1604
- }
1605
- }
1606
-
1607
- function formatFileSize(bytes) {
1608
- if (bytes < 1024) return bytes + ' bytes';
1609
- else if (bytes < 1048576) return (bytes / 1024).toFixed(1) + ' KB';
1610
- else return (bytes / 1048576).toFixed(1) + ' MB';
1611
- }
1612
-
1613
- $('#analyzeForm').on('submit', function(e) {
1614
- e.preventDefault();
1615
-
1616
- if (!fileJustUploaded) {
1617
- const textInput = $('#textInput').val();
1618
- const fileInput = document.getElementById('fileInput');
1619
-
1620
- if (fileInput.files.length > 0 &&
1621
- originalTextContent !== null &&
1622
- textInput !== originalTextContent &&
1623
- textInput.length < originalTextContent.length * 0.8) {
1624
- detachFile();
1625
- }
1626
- } else {
1627
- fileJustUploaded = false;
1628
- }
1629
-
1630
- if (currentModelType === 'custom') {
1631
- $('#customModelInputHidden').val($('#customModelInput').val());
1632
- } else {
1633
- $('#modelInput').val($('#modelSelect').val());
1634
- }
1635
-
1636
- const formData = new FormData(this);
1637
- $('#analyzeButton').prop('disabled', true);
1638
-
1639
- $.ajax({
1640
- url: '/',
1641
- method: 'POST',
1642
- data: formData,
1643
- processData: false,
1644
- contentType: false,
1645
- success: function(response) {
1646
- if (response.error) {
1647
- showError(response.error);
1648
- } else {
1649
- updateResults(response);
1650
-
1651
- if (currentModelType === 'custom') {
1652
- $('#modelSuccessBadge').addClass('show');
1653
- setTimeout(() => {
1654
- $('#modelSuccessBadge').removeClass('show');
1655
- }, 3000);
1656
- }
1657
- }
1658
- },
1659
- error: function(xhr) {
1660
- showError(xhr.responseText || 'An error occurred while processing the text');
1661
- },
1662
- complete: function() {
1663
- $('#analyzeButton').prop('disabled', false);
1664
- }
1665
- });
1666
- });
1667
-
1668
- $('#expandButton').click(function() {
1669
- const container = $('#tokenContainer');
1670
- const isExpanded = container.hasClass('expanded');
1671
-
1672
- container.toggleClass('expanded');
1673
- $(this).text(isExpanded ? 'Show More' : 'Show Less');
1674
- });
1675
-
1676
- if (currentModelType === 'predefined') {
1677
- fetchTokenizerInfo($('#modelSelect').val(), false);
1678
- } else if ($('#customModelInput').val()) {
1679
- fetchTokenizerInfo($('#customModelInput').val(), true);
1680
- }
1681
-
1682
- $('#customModelInput').on('change', function() {
1683
- const modelValue = $(this).val();
1684
- if (modelValue) {
1685
- fetchTokenizerInfo(modelValue, true);
1686
- }
1687
- });
1688
- });
1689
- </script>
1690
- </body>
1691
- </html>
1692
- """
1693
-
1694
@app.route('/tokenizer-info', methods=['GET'])
def tokenizer_info():
    """
    Return tokenizer metadata for a model as JSON, without tokenizing any text.

    Query parameters:
        model_id:  identifier handed to ``load_tokenizer``.
        is_custom: sent by the client, but it does not influence the lookup;
                   the identifier is passed to ``load_tokenizer`` unchanged
                   for predefined and custom models alike.

    Responses:
        200 with the info object returned by ``load_tokenizer``;
        400 with ``{"error": ...}`` when ``model_id`` is missing or the
            loader reports an error;
        500 with ``{"error": ...}`` when loading raises.
    """
    requested_model = request.args.get('model_id', '')
    if not requested_model:
        return jsonify({"error": "No model ID provided"}), 400

    try:
        # load_tokenizer yields (tokenizer, info, error); only the last two
        # are needed to build the response.
        _tokenizer, details, load_error = load_tokenizer(requested_model)
        if load_error:
            return jsonify({"error": load_error}), 400
        return jsonify(details)
    except Exception as exc:
        return jsonify({"error": f"Failed to get tokenizer info: {str(exc)}"}), 500
1722
-
1723
@app.route('/', methods=['GET', 'POST'])
def index():
    """
    Render the tokenizer page and handle analysis requests.

    GET renders the page. POST analyses either an uploaded file (form field
    ``file``) or, when no file is attached, the ``text`` form field.

    The model is chosen from the ``model``, ``custom_model`` and
    ``model_type`` values, read from the query string first and the form
    second. ``model_type == 'predefined'`` selects ``model``; anything else
    selects ``custom_model``.

    Requests carrying ``X-Requested-With: XMLHttpRequest`` receive JSON:
    the ``process_text`` result on success, or ``{"error": ...}`` with
    status 400 on failure. All other requests receive the rendered page.

    Uploads are written to a server-generated temporary file inside
    ``UPLOAD_FOLDER`` instead of a path built from the client-supplied
    filename. This keeps a crafted filename (for example one containing
    ``../``) from writing outside the upload folder and keeps concurrent
    uploads with the same name from overwriting each other. The temporary
    file is always removed, including when saving or reading it fails.
    """
    # Function-scope import so this block does not depend on a module-level
    # import of tempfile.
    import tempfile

    text = ""
    token_data = None
    error_message = ""
    failed = False
    ajax_response = None

    selected_model = request.args.get('model', request.form.get('model', 'llama4'))
    custom_model = request.args.get('custom_model', request.form.get('custom_model', ''))
    model_type = request.args.get('model_type', request.form.get('model_type', 'predefined'))

    # Determine which model to use based on model_type
    model_to_use = selected_model if model_type == 'predefined' else custom_model
    is_ajax = request.headers.get('X-Requested-With') == 'XMLHttpRequest'

    if request.method == 'POST':
        if 'file' in request.files and request.files['file'].filename:
            uploaded_file = request.files['file']
            # The client-supplied filename is never used to build a path.
            fd, file_path = tempfile.mkstemp(dir=app.config['UPLOAD_FOLDER'])
            try:
                with os.fdopen(fd, 'wb') as spool:
                    uploaded_file.save(spool)

                # Read a small preview of the file (UTF-8) for the text box.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    text = f.read(8096)

                # Process the file fully
                token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
                if is_ajax:
                    # Serialised inside the try so a serialisation failure is
                    # reported like any other processing error.
                    ajax_response = jsonify(token_data)
            except Exception as e:
                error_message = str(e)
                failed = True
            finally:
                # Single cleanup point for both the success and failure paths.
                if os.path.exists(file_path):
                    os.remove(file_path)
        else:
            # Regular text processing
            text = request.form.get('text', '')
            if text:
                try:
                    token_data = process_text(text, model_to_use)
                    if is_ajax:
                        ajax_response = jsonify(token_data)
                except Exception as e:
                    error_message = str(e)
                    failed = True

        if failed:
            if is_ajax:
                return jsonify({"error": error_message}), 400
            # The error page is rendered without token data.
            token_data = None
        elif ajax_response is not None:
            return ajax_response

    return render_template_string(
        HTML_TEMPLATE,
        text=text,
        token_data=token_data,
        models=TOKENIZER_MODELS,
        selected_model=selected_model,
        custom_model=custom_model,
        model_type=model_type,
        error=error_message
    )
1814
-
1815
-
1816
if __name__ == "__main__":
    # Script entry point: serve on all network interfaces at port 7860 with
    # Flask's debug mode switched off.
    app.run(debug=False, port=7860, host='0.0.0.0')