import itertools
import pickle


# Import and download the NLTK data needed for tokenization.
import nltk
from nltk.translate.bleu_score import sentence_bleu

nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases

# Import the ROUGE metric implementation.
from rouge import Rouge

rouge = Rouge()

from datasets import load_dataset
import streamlit as st

# Stream the CC-MAIN-2024-10 snapshot of FineWeb; pass name="sample-10BT" instead to use the smaller 10BT sample.
fw = load_dataset("HuggingFaceFW/fineweb", name="CC-MAIN-2024-10", split="train", streaming=True)


# Define helper functions for character-level accuracy and precision.
def char_accuracy(true_output, model_output):
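    # Example (illustrative): char_accuracy("dlroW olleH", "dlroW olleh") == 10/11,
    # since only the last of the eleven character positions differs.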
    # Compare matching characters in corresponding positions.
    matches = sum(1 for c1, c2 in zip(true_output, model_output) if c1 == c2)
    # Account for any extra characters in either string.
    total = max(len(true_output), len(model_output))
    return matches / total if total > 0 else 1.0


def char_precision(true_output, model_output):
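    # Example (illustrative): char_precision("abcdef", "abcd") == 4/4 == 1.0, whereas
    # char_accuracy("abcdef", "abcd") == 4/6, since accuracy divides by the longer string.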
    # Precision is matching characters divided by the length of the model's output.
    matches = sum(1 for c1, c2 in zip(true_output, model_output) if c1 == c2)
    return matches / len(model_output) if len(model_output) > 0 else 0.0


# Initialize Streamlit app
st.title("Model Evaluation App")
st.write("This app evaluates a model's ability to reverse input text character by character.")

# Parameters
word_threshold = st.sidebar.number_input("Word Threshold", min_value=1, value=100, step=10)
num_samples = st.sidebar.number_input("Number of Samples", min_value=1, value=1, step=1)

# Get samples
samples = list(itertools.islice(fw, num_samples))
acc = []
pres = []
bleu = []
rouges = []

for sample_idx, x in enumerate(samples):
    words = x["text"].split(" ")
    # Evaluate the text in consecutive chunks of word_threshold words.
    for n in range(len(words) // word_threshold):
        inp = words[word_threshold * n: word_threshold * (n + 1)]
        inp = " ".join(inp).replace("\n", "")

        # Display the input text
        st.subheader("Input Text")
        st.write(inp)

        prompt = (
                "You are a helpful assistant that echoes the user's input, but backwards, "
                "do not simply rearrange the words, reverse the user's input down to the character "
                "(e.g. reverse Hello World to dlroW olleH). Surround the backwards version of the "
                "user's input with <back> </back> tags. " + inp
        )
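        # The prompt above is what should be sent to the model under evaluation; paste
        # the model's full response (including the <back> tags) into the text box below.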

        # Ground truth: reverse the input (character by character)
        true_output = inp[::-1]
        st.subheader("True Output")
        st.write(true_output)

        # Collect the model's output for this chunk: run the prompt above through the
        # model being evaluated and paste its full response here.
        model_output_full = st.text_input(
            "Model Output:", "", key=f"model_output_{sample_idx}_{n}"
        )

        # Extract the text between <back> and </back> tags, falling back to the raw
        # response if either tag is missing.
        tag1 = model_output_full.find("<back>")
        tag2 = model_output_full.find("</back>")
        if tag1 != -1 and tag2 != -1:
            model_output = model_output_full[tag1 + len("<back>"): tag2]
        else:
            model_output = model_output_full
        st.subheader("Model Output")
        st.write(model_output)

        # The ROUGE implementation raises on an empty hypothesis, so wait until a
        # model output has been entered before scoring this chunk.
        if not model_output.strip():
            st.markdown("---")
            continue

        # Tokenize both outputs for BLEU calculation
        reference_tokens = nltk.word_tokenize(true_output)
        candidate_tokens = nltk.word_tokenize(model_output)

        # Compute BLEU score (using the single reference)
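        # Note: on short chunks BLEU is often 0 because higher-order n-grams rarely
        # match; nltk's SmoothingFunction can be passed via smoothing_function if needed.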
        bleu_score = sentence_bleu([reference_tokens], candidate_tokens)
        st.write("**BLEU Score:**", bleu_score)

        # Compute ROUGE scores
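        # rouge.get_scores returns a one-element list holding a dict of rouge-1,
        # rouge-2 and rouge-l scores, each with recall (r), precision (p) and F1 (f).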
        rouge_scores = rouge.get_scores(model_output, true_output)
        st.write("**ROUGE Scores:**")
        st.json(rouge_scores)

        # Compute character-level accuracy and precision
        accuracy_metric = char_accuracy(true_output, model_output)
        precision_metric = char_precision(true_output, model_output)
        st.write("**Character Accuracy:**", accuracy_metric)
        st.write("**Character Precision:**", precision_metric)

        st.markdown("---")

        # Append metrics to lists
        acc.append(accuracy_metric)
        pres.append(precision_metric)
        bleu.append(bleu_score)
        rouges.append(rouge_scores)

# Allow the user to download the metrics
if st.button("Download Metrics"):
    with open('accuracy.pkl', 'wb') as file:
        pickle.dump(acc, file)
    with open('precision.pkl', 'wb') as file:
        pickle.dump(pres, file)
    with open('bleu.pkl', 'wb') as file:
        pickle.dump(bleu, file)
    with open('rouge.pkl', 'wb') as file:
        pickle.dump(rouges, file)
    st.success("Metrics saved successfully!")

    # Provide download links (read the files into memory so the handles are closed).
    with open('accuracy.pkl', 'rb') as file:
        st.download_button('Download Accuracy Metrics', data=file.read(), file_name='accuracy.pkl')
    with open('precision.pkl', 'rb') as file:
        st.download_button('Download Precision Metrics', data=file.read(), file_name='precision.pkl')
    with open('bleu.pkl', 'rb') as file:
        st.download_button('Download BLEU Metrics', data=file.read(), file_name='bleu.pkl')
    with open('rouge.pkl', 'rb') as file:
        st.download_button('Download ROUGE Metrics', data=file.read(), file_name='rouge.pkl')