Update app.py #17
by vesjanamimini · opened

app.py CHANGED
@@ -1,359 +1,24 @@
-import os
-from typing import List, Optional
-from pydantic import BaseModel, Field
-import gradio as gr
-from datasets import load_dataset
-from huggingface_hub import InferenceClient
-import black
-
-
-# Inference client used to grade submissions
-client = InferenceClient("Qwen/Qwen2.5-Coder-32B-Instruct")
-
-
-# Load questions from Hugging Face dataset
-EXAM_MAX_QUESTIONS = int(os.getenv("EXAM_MAX_QUESTIONS", 1))
-EXAM_DATASET_ID = "agents-course/smolagents-quiz-data"
-
-#
-ds = load_dataset(EXAM_DATASET_ID, split="train", download_mode="force_redownload")
-quiz_data = list(ds)
-if EXAM_MAX_QUESTIONS:
-    quiz_data = quiz_data[:EXAM_MAX_QUESTIONS]
-
-# Check if dataset has image feature
-HAS_IMAGE_FEATURE = "image" in ds.features
-
-
-class CriterionFeedback(BaseModel):
-    """Structured feedback for a single assessment criterion"""
-
-    criterion: str = Field(..., description="The assessment criterion being evaluated")
-    met: bool = Field(..., description="Whether the criterion was met")
-    explanation: str = Field(
-        ..., description="Detailed explanation of how well the criterion was met"
-    )
-    improvement_tips: Optional[str] = Field(
-        None, description="Specific tips for improvement if needed"
-    )
-
-
-class CodeFeedback(BaseModel):
-    """Structured feedback for code submission"""
-
-    overall_feedback: str = Field(
-        ..., description="Overall assessment of the code solution"
-    )
-    criteria_feedback: List[CriterionFeedback] = Field(
-        ..., description="Detailed feedback for each assessment criterion"
-    )
-
-
-def format_python_code(code: str) -> str:
-    """Format Python code using black."""
-    try:
-        return black.format_str(code, mode=black.Mode())
-    except Exception as e:
-        gr.Warning(f"Code formatting failed: {str(e)}")
-        return code
-
-
-EVALUATION_TEMPLATE = """Evaluate this Python code solution:
-
-Challenge:
-{challenge}
-
-Reference Solution:
-```python
-{solution}
-```
-
-Student's Solution:
-```python
-{student_code}
-```
-
-Assessment Criteria:
-{criteria}
-
-Approach:
-Be highly tolerant of differences in approach, as long as they meet the Assessment Criteria.
-
-Provide detailed feedback on how well each criterion was met."""
-
-
-def check_code(
-    user_code: str, solution: str, challenge: str, assessment_criteria: List[str]
-) -> dict:
-    """
-    Use an LLM to evaluate the user's code solution and provide structured feedback.
-    """
-    # Format both user code and solution
-    formatted_user_code = format_python_code(user_code)
-    formatted_solution = format_python_code(solution)
-
-    # Format criteria as bullet points
-    criteria_text = "\n".join(f"- {c}" for c in assessment_criteria)
-
-    # Fill the template
-    prompt = EVALUATION_TEMPLATE.format(
-        challenge=challenge,
-        solution=formatted_solution,
-        student_code=formatted_user_code,
-        criteria=criteria_text,
-    )
-
-    try:
-        # Get structured feedback using a grammar built from the Pydantic model's schema
-        response = client.text_generation(
-            prompt=prompt,
-            grammar={
-                "type": "json_object",
-                "value": CodeFeedback.model_json_schema(),
-            },
-        )
-
-        # Parse response into Pydantic model
-        feedback = CodeFeedback.model_validate_json(response)
-
-        # Format the feedback for display
-        formatted_feedback = [
-            f"### Overall Assessment\n{feedback.overall_feedback}\n\n"
-        ]
-
-        for cf in feedback.criteria_feedback:
-            tip = cf.improvement_tips or ""
-            tip_text = f"\n💡 Tip: {tip}" if tip else ""
-
-            formatted_feedback.append(
-                f"### {cf.criterion}\n"
-                f"{'✅' if cf.met else '❌'} {cf.explanation}"
-                f"{tip_text}\n"
-            )
-
-        return {"feedback": "\n".join(formatted_feedback)}
-
-    except Exception as e:
-        gr.Warning(f"Error generating feedback: {str(e)}")
-        return {"feedback": "Unable to generate detailed feedback due to an error."}
-
-
-def on_user_logged_in(token: gr.OAuthToken | None):
-    """
-    Handle user login state.
-    On a valid token, hide the login button and reveal the Start button while keeping Next hidden.
-    Also clear the question text, code input, status, and image.
-    """
-    if token is not None:
-        return (
-            gr.update(visible=False),  # login_btn hidden
-            gr.update(visible=True),  # start_btn shown
-            gr.update(visible=False),  # next_btn hidden
-            "",  # Clear question_text
-            gr.update(value="", visible=False),  # Clear code_input
-            "",  # Clear status_text
-            gr.update(value="", visible=False),  # Clear question_image
-        )
-    else:
-        return (
-            gr.update(visible=True),  # login_btn visible
-            gr.update(visible=False),  # start_btn hidden
-            gr.update(visible=False),  # next_btn hidden
-            "",
-            gr.update(value="", visible=False),
-            "",
-            gr.update(value="", visible=False),
-        )
-
-
-def handle_quiz(question_idx, user_answers, submitted_code, is_start):
-    """Handle quiz state and progression"""
-    if is_start:
-        question_idx = 0
-    else:
-        # If not the first question and there's a submission, store it
-        if question_idx < len(quiz_data) and submitted_code.strip():
-            current_q = quiz_data[question_idx]
-            # Format the submitted code before checking
-            formatted_code = format_python_code(submitted_code)
-            feedback_dict = check_code(
-                formatted_code,
-                current_q["solution"],
-                current_q["challenge"],
-                current_q["assessment_criteria"],
-            )
-            user_answers.append(
-                {
-                    "challenge": current_q["challenge"],
-                    "submitted_code": formatted_code,
-                    "correct_solution": current_q["solution"],
-                    "assessment_criteria": current_q["assessment_criteria"],
-                    "feedback": feedback_dict["feedback"],
-                }
-            )
-        question_idx += 1
-
-    # If we've reached the end, show final results
-    if question_idx >= len(quiz_data):
-        results_text = """## Code Review Complete! 📚
-This feedback should help you improve your skills.
-
-⛔️ The feedback uses Qwen/Qwen2.5-Coder-32B-Instruct to compare your response to a gold
-standard solution. As we know, LLMs are not perfect. You should compare your work against
-the assessment criteria if you doubt the feedback.
-
-Here's your detailed feedback:"""
-
-        for idx, answer in enumerate(user_answers):
-            # Format assessment criteria as bullet points
-            criteria_bullets = "\n".join(
-                f"- {c}" for c in answer["assessment_criteria"]
-            )
-
-            # Build the results text piece by piece
-            results_text += (
-                f"### Question {idx + 1}: {answer['challenge']}\n\n"
-                "#### Your Solution:\n```python\n"
-                f"{answer['submitted_code']}\n```\n\n"
-                "#### Reference Solution:\n```python\n"
-                f"{answer['correct_solution']}\n```\n\n"
-                "#### Assessment Criteria:\n"
-                f"{criteria_bullets}\n\n"
-                "#### Feedback:\n"
-                f"{answer['feedback']}\n\n"
-                "---\n\n"
-            )
-
-        return (
-            "",  # question_text cleared
-            gr.update(value="", visible=False),  # hide code_input
-            "Review your feedback below to improve your coding skills!",
-            question_idx,  # updated question index
-            user_answers,  # accumulated answers
-            gr.update(visible=False),  # start_btn hidden
-            gr.update(visible=False),  # next_btn hidden
-            gr.update(value=results_text, visible=True),  # final_markdown
-            gr.update(visible=False),  # question_image hidden
-        )
-    else:
-        # Show the next question
-        q = quiz_data[question_idx]
-        # Format assessment criteria as bullet points
-        criteria_bullets = "\n".join(f"- {c}" for c in q["assessment_criteria"])
-        challenge_text = (
-            f"## Question {question_idx + 1}\n\n"
-            f"### Challenge:\n{q['challenge']}\n\n"
-            "### Assessment Criteria:\n"
-            f"{criteria_bullets}"
-        )
-
-        # Only show image if the feature exists and the question has an image
-        show_image = HAS_IMAGE_FEATURE and q.get("image") is not None
-        image_update = gr.update(
-            value=q.get("image") if show_image else None, visible=show_image
-        )
-
-        return (
-            challenge_text,  # question_text
-            gr.update(value=q["placeholder"], visible=True),  # code_input
-            "Submit your solution and click 'Next' to continue.",
-            question_idx,  # updated question_idx
-            user_answers,  # user_answers
-            gr.update(visible=False),  # start_btn hidden
-            gr.update(visible=True),  # next_btn visible
-            gr.update(visible=False),  # final_markdown hidden
-            image_update,  # question_image
-        )
-
-
-with gr.Blocks() as demo:
-    demo.title = f"Coding Quiz: {EXAM_DATASET_ID}"
-    # State variables
-    question_idx = gr.State(value=0)
-    user_answers = gr.State(value=[])
-
-    with gr.Row(variant="compact"):
-        intro_text = """
-## Welcome to the smolagents code reviewer
-
-This application will review your smolagents code and provide feedback on your solutions. This exercise is not reviewed or certified! It's about trying out smolagents for the first time.
-
-ℹ️ Log in first, then click 'Start' to begin. Complete each coding challenge and click 'Next' to proceed. You'll get feedback on your solutions at the end."""
-        intro_text = gr.Markdown(intro_text)
-    with gr.Row(variant="panel"):
-        with gr.Column():
-            question_text = gr.Markdown("")
-            question_image = gr.Image(
-                label="Question Image",
-                visible=True if HAS_IMAGE_FEATURE else False,
-                type="pil",
-            )  # Add image component
-        with gr.Column():
-            code_input = gr.Code(
-                language="python", label="Your Solution", visible=False
-            )
-
-    with gr.Row(variant="compact"):
-        status_text = gr.Markdown("")
-
-    with gr.Row(variant="compact"):
-        login_btn = gr.LoginButton()
-        start_btn = gr.Button("Start")
-        next_btn = gr.Button("Next ⏭️", visible=False)
-
-    with gr.Row(variant="compact"):
-        final_markdown = gr.Markdown("", visible=False)
-
-    login_btn.click(
-        fn=on_user_logged_in,
-        inputs=None,
-        outputs=[
-            login_btn,
-            start_btn,
-            next_btn,
-            question_text,
-            code_input,
-            status_text,
-            question_image,
-        ],
-    )
-
-    start_btn.click(
-        fn=handle_quiz,
-        inputs=[question_idx, user_answers, code_input, gr.State(True)],
-        outputs=[
-            question_text,  # Markdown with question text
-            code_input,  # Code input field
-            status_text,  # Status text (instructions/status messages)
-            question_idx,  # Updated question index (state)
-            user_answers,  # Updated user answers (state)
-            start_btn,  # Update for start button (will be hidden)
-            next_btn,  # Update for next button (shown for in-progress quiz)
-            final_markdown,  # Final results markdown (hidden until quiz ends)
-            question_image,  # Image update for the quiz question
-        ],
-    )
-
-    next_btn.click(
-        fn=handle_quiz,
-        inputs=[question_idx, user_answers, code_input, gr.State(False)],
-        outputs=[
-            question_text,
-            code_input,
-            status_text,
-            question_idx,
-            user_answers,
-            start_btn,
-            next_btn,
-            final_markdown,
-            question_image,
-        ],
-    )
-
-if __name__ == "__main__":
-    demo.launch()
+import os
+
+from smolagents import DuckDuckGoSearchTool, LiteLLMModel, ToolCallingAgent
+from e2b import Sandbox
+
+model = LiteLLMModel(
+    model_id="groq/meta-llama/llama-4-maverick-17b-128e-instruct",
+    api_key=os.environ["GROQ_API_KEY"],  # read from the environment; never commit API keys
+)
+
+# Create a sandbox; CPU, memory, timeout, etc. are configurable
+sandbox = Sandbox(api_key=os.environ["E2B_API_KEY"])
+
+# Create the web search agent
+web_agent = ToolCallingAgent(
+    tools=[DuckDuckGoSearchTool()],  # web search tool
+    model=model,
+    max_steps=5,  # cap the number of reasoning/tool-call steps
+    name="web_browser",  # identifier-style name, as expected for managed agents
+    description="Searches the web and reports what it finds",
+)
+
+with sandbox:
+    result = web_agent.run("What are some recent breakthroughs in AI?")
+    print(result)
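
One thing worth flagging in the added script: the E2B sandbox is opened but nothing ever runs inside it. `web_agent.run(...)` executes on the host, so the `with sandbox:` block only starts and then tears down an idle sandbox. If the intent is to sandbox the agent's execution, smolagents can drive E2B through its code executor instead. The sketch below is a minimal, unverified variant under two assumptions: that smolagents' `executor_type="e2b"` option is available (it applies to `CodeAgent`, which executes generated Python, not to `ToolCallingAgent`), and that `GROQ_API_KEY` and `E2B_API_KEY` are exported in the environment.

```python
import os

from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel

# Same Groq-hosted model as in the PR, with the key read from the environment.
model = LiteLLMModel(
    model_id="groq/meta-llama/llama-4-maverick-17b-128e-instruct",
    api_key=os.environ["GROQ_API_KEY"],
)

# CodeAgent writes and executes Python to solve the task; with
# executor_type="e2b", that generated code runs in an E2B sandbox
# (the executor picks up E2B_API_KEY from the environment) rather
# than on the host machine.
agent = CodeAgent(
    tools=[DuckDuckGoSearchTool()],
    model=model,
    executor_type="e2b",
)

print(agent.run("What are some recent breakthroughs in AI?"))
```

With this arrangement the script no longer manages a `Sandbox` object at all; the executor owns the sandbox lifecycle, so there is no idle sandbox to create or tear down by hand.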