Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,56 +1,237 @@
|
|
1 |
-
|
2 |
-
import nltk
|
3 |
-
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize, TweetTokenizer
|
4 |
-
from nltk.tokenize.treebank import TreebankWordDetokenizer
|
5 |
-
from nltk.tokenize import NLTKWordTokenizer
|
6 |
-
nltk.download('punkt')
|
7 |
-
# Test cases for NLTKWordTokenizer
|
8 |
-
s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
|
9 |
-
print(word_tokenize(s1))
|
10 |
|
11 |
-
|
12 |
-
print(word_tokenize(s2))
|
13 |
|
14 |
-
|
15 |
-
print(word_tokenize(s3))
|
16 |
|
17 |
-
|
18 |
-
|
|
|
|
|
19 |
|
20 |
-
|
21 |
-
print(word_tokenize(s5))
|
22 |
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
25 |
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
|
30 |
-
|
31 |
|
32 |
-
|
33 |
-
print(word_tokenize(s9))
|
34 |
|
35 |
-
|
36 |
-
print(word_tokenize(s10))
|
37 |
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
|
43 |
-
(24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
|
44 |
-
(40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
|
45 |
-
(60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
|
46 |
-
print(list(NLTKWordTokenizer().span_tokenize(s)) == expected)
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
|
|
|
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
st.markdown("""
|
|
|
4 |
|
5 |
+
# Thank you for contributing an eval! ♥️
|
|
|
6 |
|
7 |
+
🚨 Please make sure your PR follows these guidelines, __failure to follow
|
8 |
+
the guidelines below will result in the PR being closed automatically__.
|
9 |
+
Note that even if the criteria are met, that does not guarantee the PR
|
10 |
+
will be merged nor GPT-4 access granted. 🚨
|
11 |
|
12 |
+
__PLEASE READ THIS__:
|
|
|
13 |
|
14 |
+
In order for a PR to be merged, it must fail on GPT-4. We are aware that
|
15 |
+
right now, users do not have access, so you will not be able to tell if
|
16 |
+
the eval fails or not. Please run your eval with GPT-3.5-Turbo, but keep
|
17 |
+
in mind as we run the eval, if GPT-4 gets higher than 90% on the eval,
|
18 |
+
we will likely reject since GPT-4 is already capable of completing the
|
19 |
+
task.
|
20 |
|
21 |
+
We plan to roll out a way for users submitting evals to see the eval
|
22 |
+
performance on GPT-4 soon. Stay tuned! Until then, you will not be able
|
23 |
+
to see the eval performance on GPT-4. We encourage partial PRs with
|
24 |
+
~5-10 examples that we can then run the evals on and share the results
|
25 |
+
with you so you know how your eval does with GPT-4 before writing all
|
26 |
+
100 examples.
|
27 |
|
28 |
+
## Eval details 📑
|
29 |
+
### Eval name
|
30 |
|
31 |
+
which-is-heavier
|
|
|
32 |
|
33 |
+
### Eval description
|
|
|
34 |
|
35 |
+
This evaluation tests the physical reasoning of GPT by asking which of
|
36 |
+
two quantities is heavier, where the quantities are assigned explicit
|
37 |
+
weights (e.g., "5 kilograms" or "2 pounds") and there is a clear answer
|
38 |
+
(e.g., Q: "Is 5 pounds of tissue paper heavier than 3 pounds of
|
39 |
+
granite?" A: "Yes"). The catch is that, in each example, the heavier
|
40 |
+
quantity is always associated with an item that is generally thought of
|
41 |
+
as being light (e.g., feathers, tissue paper, cotton balls) while the
|
42 |
+
lighter quantity is always associated with an item that is generally
|
43 |
+
thought of as being heavy (e.g., granite, cast iron, plutonium). Humans
|
44 |
+
can easily achieve 100% on this task, but they have to cognitively
|
45 |
+
ignore what comprises the quantities and focus on just their weights.
|
46 |
|
47 |
+
### What makes this a useful eval?
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
+
ChatGPT 3.5 does a decent job of comparing weights (i.e., "Is 5 pounds
|
50 |
+
greater than 3 pounds?" "Yes") as well as understanding common
|
51 |
+
attributions of "light" and "heavy" to everyday objects like feathers
|
52 |
+
and anvils, but when combining these two into a single comparison, it
|
53 |
+
appears that the model's bias towards these colloquial attributes makes
|
54 |
+
it difficult for the model to perform well.
|
55 |
|
56 |
+
What's interesting is that the errors are not even logically consistent.
|
57 |
+
Consider the following example (taken from the ChatGPT dialogue UX/UI
|
58 |
+
using GPT-4):
|
59 |
|
60 |
+
<img width="715" alt="Screen Shot 2023-03-21 at 4 45 44 PM"
|
61 |
+
src="https://user-images.githubusercontent.com/11773823/226827152-f78c0fb4-1136-433c-9d55-65eb6c6c5407.png">
|
62 |
+
|
63 |
+
According to GPT-4:
|
64 |
+
- 15 pounds of hydrogen is heavier than 10 pounds of plutonium
|
65 |
+
- 20 pounds of hydrogen is **not** heavier than 10 pounds of plutonium
|
66 |
+
- 30 pounds of hydrogen is heavier than 10 pounds of plutonium
|
67 |
+
|
68 |
+
GPT-4 appears to do better than GPT-3.5 in informal testing, but still
|
69 |
+
fails on simple cases and gives inconsistent results.
|
70 |
+
|
71 |
+
This task exposes how adversarial red herrings can potentially hamper
|
72 |
+
performance in quantitative/physical reasoning tasks such as weight
|
73 |
+
comparison.
|
74 |
+
|
75 |
+
The other nice aspect of this task is that one can generate these
|
76 |
+
examples programmatically. I can submit a dataset of thousands rather
|
77 |
+
than just a few hundred if preferred.
|
78 |
+
|
79 |
+
## Criteria for a good eval ✅
|
80 |
+
|
81 |
+
Below are some of the criteria we look for in a good eval. In general,
|
82 |
+
we are seeking cases where the model does not do a good job despite
|
83 |
+
being capable of generating a good response (note that there are some
|
84 |
+
things large language models cannot do, so those would not make good
|
85 |
+
evals).
|
86 |
+
|
87 |
+
Your eval should be:
|
88 |
+
|
89 |
+
- [x] Thematically consistent: The eval should be thematically
|
90 |
+
consistent. We'd like to see a number of prompts all demonstrating some
|
91 |
+
particular failure mode. For example, we can create an eval on cases
|
92 |
+
where the model fails to reason about the physical world.
|
93 |
+
- [x] Contains failures where a human can do the task, but either GPT-4
|
94 |
+
or GPT-3.5-Turbo could not.
|
95 |
+
- [x] Includes good signal around what is the right behavior. This means
|
96 |
+
either a correct answer for `Basic` evals or the `Fact` Model-graded
|
97 |
+
eval, or an exhaustive rubric for evaluating answers for the `Criteria`
|
98 |
+
Model-graded eval.
|
99 |
+
- [x] Include at least 100 high quality examples (it is okay to only
|
100 |
+
contribute 5-10 meaningful examples and have us test them with GPT-4
|
101 |
+
before adding all 100)
|
102 |
+
|
103 |
+
If there is anything else that makes your eval worth including, please
|
104 |
+
document it below.
|
105 |
+
|
106 |
+
### Unique eval value
|
107 |
+
|
108 |
+
> Insert what makes your eval high quality that was not mentioned above.
|
109 |
+
(Not required)
|
110 |
+
|
111 |
+
## Eval structure 🏗️
|
112 |
+
|
113 |
+
Your eval should
|
114 |
+
- [x] Check that your data is in `evals/registry/data/{name}`
|
115 |
+
- [x] Check that your yaml is registered at
|
116 |
+
`evals/registry/evals/{name}.yaml`
|
117 |
+
- [x] Ensure you have the right to use the data you submit via this eval
|
118 |
+
|
119 |
+
(For now, we will only be approving evals that use one of the existing
|
120 |
+
eval classes. You may still write custom eval classes for your own
|
121 |
+
cases, and we may consider merging them in the future.)
|
122 |
+
|
123 |
+
## Final checklist 👀
|
124 |
+
|
125 |
+
### Submission agreement
|
126 |
+
|
127 |
+
By contributing to Evals, you are agreeing to make your evaluation logic
|
128 |
+
and data available under the same MIT license as this repository. You must have
|
129 |
+
adequate rights to upload any data used in an Eval. OpenAI reserves the
|
130 |
+
right to use this data in future service improvements to our product.
|
131 |
+
Contributions to OpenAI Evals will be subject to our usual Usage
|
132 |
+
Policies (https://platform.openai.com/docs/usage-policies).
|
133 |
+
|
134 |
+
- [x] I agree that my submission will be made available under an MIT
|
135 |
+
license and complies with OpenAI's usage policies.
|
136 |
+
|
137 |
+
### Email address validation
|
138 |
+
|
139 |
+
If your submission is accepted, we will be granting GPT-4 access to a
|
140 |
+
limited number of contributors. Access will be given to the email
|
141 |
+
address associated with the merged pull request.
|
142 |
+
|
143 |
+
- [x] I acknowledge that GPT-4 access will only be granted, if
|
144 |
+
applicable, to the email address used for my merged pull request.
|
145 |
+
|
146 |
+
### Limited availability acknowledgement
|
147 |
+
|
148 |
+
We know that you might be excited to contribute to OpenAI's mission,
|
149 |
+
help improve our models, and gain access to GPT-4. However, due to the
|
150 |
+
requirements mentioned above and high volume of submissions, we will not
|
151 |
+
be able to accept all submissions and thus not grant everyone who opens
|
152 |
+
a PR GPT-4 access. We know this is disappointing, but we hope to set the
|
153 |
+
right expectation before you open this PR.
|
154 |
+
|
155 |
+
- [x] I understand that opening a PR, even if it meets the requirements
|
156 |
+
above, does not guarantee the PR will be merged nor GPT-4 access
|
157 |
+
granted.
|
158 |
+
|
159 |
+
### Submit eval
|
160 |
+
|
161 |
+
- [x] I have filled out all required fields in the evals PR form
|
162 |
+
- [x] (Ignore if not submitting code) I have run `pip install
|
163 |
+
pre-commit; pre-commit install` and have verified that `black`, `isort`,
|
164 |
+
and `autoflake` are running when I commit and push
|
165 |
+
|
166 |
+
Failure to fill out all required fields will result in the PR being
|
167 |
+
closed.
|
168 |
+
|
169 |
+
### Eval JSON data
|
170 |
+
|
171 |
+
Since we are using Git LFS, we are asking eval submitters to add in as
|
172 |
+
many Eval Samples (at least 5) from their contribution here:
|
173 |
+
|
174 |
+
<details>
|
175 |
+
<summary>View evals in JSON</summary>
|
176 |
+
|
177 |
+
### Eval
|
178 |
+
```jsonl
|
179 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
180 |
+
assistant."}, {"role": "user", "content": "Is 2 kilograms of eggshells
|
181 |
+
heavier than 1 kilogram of gold bars? Answer Yes or No"}], "ideal":
|
182 |
+
"Yes"}
|
183 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
184 |
+
assistant."}, {"role": "user", "content": "Is 9 pounds of soap foam
|
185 |
+
heavier than 5 pounds of iron chains? Answer Yes or No"}], "ideal":
|
186 |
+
"Yes"}
|
187 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
188 |
+
assistant."}, {"role": "user", "content": "Is 15 pounds of confetti
|
189 |
+
heavier than 10 pounds of rebar? Answer Yes or No"}], "ideal": "Yes"}
|
190 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
191 |
+
assistant."}, {"role": "user", "content": "Is 1 pound of steel beams
|
192 |
+
heavier than 2 pounds of dust particles? Answer Yes or No"}], "ideal":
|
193 |
+
"No"}
|
194 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
195 |
+
assistant."}, {"role": "user", "content": "Is 10 pounds of cast iron
|
196 |
+
heavier than 20 pounds of flakes? Answer Yes or No"}], "ideal": "No"}
|
197 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
198 |
+
assistant."}, {"role": "user", "content": "Is 10 kilograms of granite
|
199 |
+
slabs heavier than 20 kilograms of balloons? Answer Yes or No"}],
|
200 |
+
"ideal": "No"}
|
201 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
202 |
+
assistant."}, {"role": "user", "content": "Is 10 kilograms of bricks
|
203 |
+
heavier than 15 kilograms of dust particles? Answer Yes or No"}],
|
204 |
+
"ideal": "No"}
|
205 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
206 |
+
assistant."}, {"role": "user", "content": "Is 1 pound of bronze heavier
|
207 |
+
than 2 pounds of snowflakes? Answer Yes or No"}], "ideal": "No"}
|
208 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
209 |
+
assistant."}, {"role": "user", "content": "Is 1 pound of cast iron
|
210 |
+
heavier than 3 pounds of spider silk? Answer Yes or No"}], "ideal":
|
211 |
+
"No"}
|
212 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
213 |
+
assistant."}, {"role": "user", "content": "Is 12 pounds of hydrogen
|
214 |
+
heavier than 10 pounds of palladium? Answer Yes or No"}], "ideal":
|
215 |
+
"Yes"}
|
216 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
217 |
+
assistant."}, {"role": "user", "content": "Is 6 kilograms of feathers
|
218 |
+
heavier than 5 kilograms of gold bars? Answer Yes or No"}], "ideal":
|
219 |
+
"Yes"}
|
220 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
221 |
+
assistant."}, {"role": "user", "content": "Is 7 kilograms of eggshells
|
222 |
+
heavier than 5 kilograms of bronze? Answer Yes or No"}], "ideal": "Yes"}
|
223 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
224 |
+
assistant."}, {"role": "user", "content": "Is 10 kilograms of lead
|
225 |
+
heavier than 20 kilograms of floating seeds? Answer Yes or No"}],
|
226 |
+
"ideal": "No"}
|
227 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
228 |
+
assistant."}, {"role": "user", "content": "Is 30 pounds of feathers
|
229 |
+
heavier than 10 pounds of rebar? Answer Yes or No"}], "ideal": "Yes"}
|
230 |
+
{"input": [{"role": "system", "content": "You are a helpful
|
231 |
+
assistant."}, {"role": "user", "content": "Is 2 pounds of pencil
|
232 |
+
shavings heavier than 1 pound of iron chains? Answer Yes or No"}],
|
233 |
+
"ideal": "Yes"}
|
234 |
+
```
|
235 |
+
</details>
|
236 |
+
|
237 |
+
""")
|