
Ruurd committed · Commit f86092a · 1 Parent(s): f5bf8f9

Highlight noised tokens

Files changed (1): app.py (+17 -13)
app.py CHANGED
@@ -62,14 +62,13 @@ def noisify_answer(input_ids, answer_start, threshold=1.0, eot_weight=1.0, clust
     num_to_noise = int(threshold * answer_len)
 
     if num_to_noise == 0:
-        return noised
+        return noised, []
 
     mixed_probs = token_probabilities.copy()
     mixed_probs[eot_token_id] *= eot_weight
     mixed_probs /= mixed_probs.sum()
 
-    # Determine number of clusters and average cluster size
-    num_clusters = max(1, int((1 - clustering) * num_to_noise))  # fewer clusters if more intensity
+    num_clusters = max(1, int((1 - clustering) * num_to_noise))
     cluster_size = max(1, int(num_to_noise / num_clusters))
 
     noised_indices = set()
@@ -79,15 +78,13 @@ def noisify_answer(input_ids, answer_start, threshold=1.0, eot_weight=1.0, clust
         span_end = min(len(noised), span_start + cluster_size)
         noised_indices.update(range(span_start, span_end))
 
-    # Trim in case we overshot due to overlapping clusters
     noised_indices = sorted(list(noised_indices))[:num_to_noise]
 
     noise = rng.choice(np.arange(vocab_size), size=len(noised_indices), p=mixed_probs)
     for idx, val in zip(noised_indices, noise):
         noised[idx] = val
 
-    return noised
-
+    return noised, noised_indices
 
 
 # Add new noising function
@@ -165,7 +162,9 @@ def diffusion_chat(question, eot_weight, max_it, sharpness, noise_clipping, use_
     input_ids = input_ids[:256]
 
     ori_input_tokens = input_ids
-    current_tokens = noisify_answer(ori_input_tokens, answer_start, threshold=1.0, eot_weight=eot_weight)
+    current_tokens, just_noised_indices = noisify_answer(
+        ori_input_tokens, answer_start, threshold=1.0, eot_weight=eot_weight, clustering=clustering
+    )
     prev_decoded_tokens = []
     last_tokens = []
 
@@ -178,14 +177,19 @@ def diffusion_chat(question, eot_weight, max_it, sharpness, noise_clipping, use_
         decoded_tokens = tokenizer.convert_ids_to_tokens(decoded_ids)
         filtered_tokens = [tok for tok in decoded_tokens if tokenizer.convert_tokens_to_ids(tok) != eot_token_id]
         filtered_prev_tokens = [tok for tok in prev_decoded_tokens if tokenizer.convert_tokens_to_ids(tok) != eot_token_id] if prev_decoded_tokens else []
-
+        just_noised_indices = []
         if filtered_prev_tokens:
             highlighted = []
-            for tok_new, tok_old in zip(filtered_tokens, filtered_prev_tokens):
-                if tok_new != tok_old:
-                    highlighted.append(f'<span style="color:green">{tokenizer.convert_tokens_to_string([tok_new])}</span>')
+            for i, tok in enumerate(decoded_tokens):
+                token_str = tokenizer.convert_tokens_to_string([tok])
+
+                abs_idx = answer_start + i
+                if abs_idx in just_noised_indices:
+                    highlighted.append(f'<span style="color:red">{token_str}</span>')
+                elif prev_decoded_tokens and i < len(prev_decoded_tokens) and tok != prev_decoded_tokens[i]:
+                    highlighted.append(f'<span style="color:green">{token_str}</span>')
                 else:
-                    highlighted.append(tokenizer.convert_tokens_to_string([tok_new]))
+                    highlighted.append(token_str)
         else:
             highlighted = [tokenizer.convert_tokens_to_string([tok]) for tok in filtered_tokens]
 
@@ -203,7 +207,7 @@ def diffusion_chat(question, eot_weight, max_it, sharpness, noise_clipping, use_
         if use_confidence_noising:
             current_tokens = confidence_guided_noising(generated_tokens, answer_start, confidences, threshold, eot_weight, noise_clipping)
         else:
-            current_tokens = noisify_answer(generated_tokens, answer_start, threshold=threshold, eot_weight=eot_weight, clustering=clustering)
+            current_tokens, just_noised_indices = noisify_answer(generated_tokens, answer_start, threshold=threshold, eot_weight=eot_weight, clustering=clustering)
 
         time.sleep(0.01)
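For context, here is a minimal, self-contained sketch of the updated noisify_answer contract: it now returns both the noised sequence and the indices it replaced, so the UI can paint exactly those tokens red. The answer_len computation, the span-placement line, and the token_probabilities / eot_token_id parameters are assumptions filled in around the visible hunks (those lines sit outside the diff), not the Space's exact code.

import numpy as np

rng = np.random.default_rng(0)

def noisify_answer(input_ids, answer_start, token_probabilities, eot_token_id,
                   threshold=1.0, eot_weight=1.0, clustering=0.5):
    # Copy the sequence; only positions from answer_start onward get noised.
    noised = list(input_ids)
    answer_len = len(noised) - answer_start  # assumed; not visible in the hunks
    num_to_noise = int(threshold * answer_len)
    if num_to_noise == 0:
        return noised, []

    # Reweight the end-of-text token, then renormalize into a distribution.
    mixed_probs = token_probabilities.copy()
    mixed_probs[eot_token_id] *= eot_weight
    mixed_probs /= mixed_probs.sum()

    # Higher clustering -> fewer clusters -> longer contiguous noise spans.
    num_clusters = max(1, int((1 - clustering) * num_to_noise))
    cluster_size = max(1, int(num_to_noise / num_clusters))

    noised_indices = set()
    for _ in range(num_clusters):
        # Assumed span placement; the loop header lies above the visible hunk.
        span_start = int(rng.integers(answer_start, len(noised)))
        span_end = min(len(noised), span_start + cluster_size)
        noised_indices.update(range(span_start, span_end))

    # Trim in case overlapping clusters overshot the noise budget.
    noised_indices = sorted(noised_indices)[:num_to_noise]

    # Sample replacement tokens and report which positions were touched.
    noise = rng.choice(np.arange(len(mixed_probs)), size=len(noised_indices), p=mixed_probs)
    for idx, val in zip(noised_indices, noise):
        noised[idx] = int(val)
    return noised, noised_indices

probs = np.full(100, 1.0 / 100)  # toy uniform unigram distribution
tokens, red = noisify_answer(list(range(30)), answer_start=20,
                             token_probabilities=probs, eot_token_id=0,
                             threshold=0.5, eot_weight=0.1, clustering=0.8)
# red might be [23, 24, 25, 26, 27]: one contiguous span of noised positions

To see the cluster arithmetic concretely: with num_to_noise = 10 and clustering = 0.8, num_clusters = max(1, int(0.2 * 10)) = 2 and cluster_size = int(10 / 2) = 5, i.e. the noise lands as two contiguous five-token spans rather than ten scattered positions.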
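The diffusion_chat loop then uses those indices to color the streamed tokens. Below is a reduced sketch of the highlighting rule, operating on plain token strings rather than tokenizer IDs (a hypothetical helper, not the committed code): red marks positions that noisify_answer just re-noised, green marks tokens that changed since the previous decode, and everything else is left plain.

def highlight(decoded, previous, just_noised_indices, answer_start):
    # Red: position was just replaced by noise.
    # Green: token differs from the previous iteration's decode.
    parts = []
    for i, tok in enumerate(decoded):
        if answer_start + i in just_noised_indices:
            parts.append(f'<span style="color:red">{tok}</span>')
        elif i < len(previous) and tok != previous[i]:
            parts.append(f'<span style="color:green">{tok}</span>')
        else:
            parts.append(tok)
    return "".join(parts)

print(highlight(["The", " cat", " sat"], ["The", " dog", " sat"],
                just_noised_indices={2}, answer_start=0))
# The<span style="color:green"> cat</span><span style="color:red"> sat</span>

In the committed code the same rule runs inside the display loop, with token_str produced by tokenizer.convert_tokens_to_string and abs_idx = answer_start + i matching the absolute indices that noisify_answer returns.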