Update app.py
Browse files
app.py
CHANGED
@@ -329,178 +329,106 @@
|
|
329 |
# demo.launch(share=True)
|
330 |
|
331 |
import os
|
332 |
-
import
|
333 |
-
import io
|
334 |
-
import uuid
|
335 |
-
import contextlib
|
336 |
import gradio as gr
|
337 |
-
|
338 |
-
import shutil
|
339 |
|
340 |
-
#
|
341 |
-
|
342 |
|
343 |
-
|
344 |
-
from vision_agent.models import AgentMessage
|
345 |
-
|
346 |
-
#############################################
|
347 |
-
# GLOBAL INITIALIZATION
|
348 |
-
#############################################
|
349 |
-
|
350 |
-
# Create a unique temporary directory for saved images
|
351 |
-
TEMP_DIR = "temp_images"
|
352 |
-
if not os.path.exists(TEMP_DIR):
|
353 |
-
os.makedirs(TEMP_DIR)
|
354 |
-
|
355 |
-
# Initialize VisionAgentCoderV2 with verbose logging so the generated code has detailed print outputs.
|
356 |
-
agent = VisionAgentCoderV2(verbose=True)
|
357 |
-
|
358 |
-
#############################################
|
359 |
-
# UTILITY: SAVE UPLOADED IMAGE TO A TEMP FILE
|
360 |
-
#############################################
|
361 |
-
|
362 |
-
def save_uploaded_image(image):
|
363 |
"""
|
364 |
-
|
365 |
-
|
366 |
"""
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
376 |
|
377 |
-
def
|
378 |
"""
|
379 |
-
|
380 |
-
|
381 |
-
Returns a list of the extracted filenames.
|
382 |
"""
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
#############################################
|
389 |
|
390 |
-
def
|
391 |
"""
|
392 |
-
|
393 |
-
|
394 |
-
|
|
|
395 |
"""
|
396 |
-
# Parse the code for image filenames saved via save_image
|
397 |
-
filenames = parse_saved_image_filenames(code_str)
|
398 |
-
|
399 |
-
# Capture stdout using a StringIO buffer
|
400 |
-
buf = io.StringIO()
|
401 |
-
with contextlib.redirect_stdout(buf):
|
402 |
-
# IMPORTANT: Here we exec the generated code.
|
403 |
-
exec(code_str, globals(), locals())
|
404 |
-
|
405 |
-
# Gather all printed output
|
406 |
-
output = buf.getvalue()
|
407 |
-
|
408 |
-
# Check which of the parsed filenames exist on disk (prepend TEMP_DIR if needed)
|
409 |
-
existing_images = []
|
410 |
-
for fn in filenames:
|
411 |
-
# If filename is not an absolute path, assume it is in TEMP_DIR
|
412 |
-
if not os.path.isabs(fn):
|
413 |
-
fn = os.path.join(TEMP_DIR, fn)
|
414 |
-
if os.path.exists(fn):
|
415 |
-
existing_images.append(fn)
|
416 |
-
return output, existing_images
|
417 |
-
|
418 |
-
#############################################
|
419 |
-
# CHAT FUNCTION: PROCESS USER PROMPT & IMAGE
|
420 |
-
#############################################
|
421 |
-
|
422 |
-
def chat(prompt, image, history):
|
423 |
-
"""
|
424 |
-
When the user sends a prompt and optionally an image, do the following:
|
425 |
-
1. Save the image to a temp file.
|
426 |
-
2. Use VisionAgentCoderV2 to generate code for the task.
|
427 |
-
3. Execute the generated code, capturing its stdout logs and any saved image files.
|
428 |
-
4. Append the logs and image gallery info to the conversation history.
|
429 |
-
"""
|
430 |
-
# Validate that an image was provided.
|
431 |
if image is None:
|
432 |
-
|
433 |
-
return history, None
|
434 |
-
|
435 |
-
# Save the uploaded image for use in the generated code.
|
436 |
-
image_path = save_uploaded_image(image)
|
437 |
-
|
438 |
-
# Generate the code with VisionAgent using the user prompt and the image filename.
|
439 |
-
code_context = agent.generate_code(
|
440 |
-
[
|
441 |
-
AgentMessage(
|
442 |
-
role="user",
|
443 |
-
content=prompt,
|
444 |
-
media=[image_path]
|
445 |
-
)
|
446 |
-
]
|
447 |
-
)
|
448 |
|
449 |
-
#
|
450 |
-
|
451 |
-
|
452 |
-
# Run the generated code and capture output and any saved images.
|
453 |
-
stdout_text, image_files = run_and_capture_with_images(generated_code)
|
454 |
|
455 |
-
#
|
456 |
-
|
457 |
-
if image_files:
|
458 |
-
response_text += "\n**Saved Images:** " + ", ".join(image_files)
|
459 |
-
else:
|
460 |
-
response_text += "\nNo images were saved by the generated code."
|
461 |
|
462 |
-
#
|
463 |
-
|
464 |
|
465 |
-
#
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
|
|
|
|
471 |
|
|
|
472 |
with gr.Blocks() as demo:
|
473 |
-
gr.Markdown("# VisionAgent
|
474 |
gr.Markdown(
|
475 |
"""
|
476 |
-
|
477 |
-
|
478 |
-
|
|
|
|
|
|
|
|
|
|
|
479 |
"""
|
480 |
)
|
481 |
|
482 |
with gr.Row():
|
483 |
-
|
484 |
-
|
485 |
-
prompt_input = gr.Textbox(label="Enter Prompt", placeholder="e.g., Count the number of cacao oranges in the image")
|
486 |
-
submit_btn = gr.Button("Send")
|
487 |
-
with gr.Column(scale=5):
|
488 |
-
image_input = gr.Image(label="Upload Image", type="numpy")
|
489 |
|
490 |
-
|
491 |
|
492 |
-
|
493 |
-
|
494 |
|
495 |
-
|
496 |
-
def user_chat_wrapper(prompt, image, history):
|
497 |
-
history = history or []
|
498 |
-
history, image_files = chat(prompt, image, history)
|
499 |
-
return history, image_files
|
500 |
-
|
501 |
-
submit_btn.click(fn=user_chat_wrapper, inputs=[prompt_input, image_input, chatbot], outputs=[chatbot, gallery])
|
502 |
-
|
503 |
-
clear_btn.click(lambda: ([], None), None, [chatbot, gallery])
|
504 |
|
505 |
demo.launch()
|
506 |
|
|
|
|
329 |
# demo.launch(share=True)
|
330 |
|
331 |
import os
|
332 |
+
import openai
|
|
|
|
|
|
|
333 |
import gradio as gr
|
334 |
+
import vision_agent.tools as T
|
|
|
335 |
|
336 |
+
# Set your OpenAI API key (ensure the environment variable is set or replace with your key)
|
337 |
+
openai.api_key = os.getenv("OPENAI_API_KEY", "your-openai-api-key-here")
|
338 |
|
339 |
+
def get_single_prompt(user_input):
    """
    Rephrase the user's free-form input into a single, concise prompt for
    object detection, using OpenAI.

    The returned prompt is guaranteed to contain no question marks, since the
    downstream detector expects a declarative phrase.

    Args:
        user_input: Raw text typed by the user; may be empty/whitespace.

    Returns:
        A concise detection prompt string with all "?" characters removed.
    """
    # Fall back to a generic task when the user typed nothing useful.
    if not user_input.strip():
        user_input = "Detect objects in the image"

    prompt_instruction = (
        f"Based on the following user input, generate a single, concise prompt for object detection. "
        f"Do not include any question marks in the output. "
        f"User input: \"{user_input}\""
    )

    # FIX: the legacy `openai.Completion.create` endpoint and the
    # `text-davinci-003` model were removed/retired by OpenAI. Use the Chat
    # Completions API instead; the module-level client picks up
    # `openai.api_key` set at import time.
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt_instruction}],
        max_tokens=50,
        n=1,
        temperature=0.3,
    )
    generated_prompt = response.choices[0].message.content.strip()
    # Belt-and-braces: strip any question marks the model still emitted.
    generated_prompt = generated_prompt.replace("?", "")
    return generated_prompt
|
365 |
|
366 |
+
def is_count_query(user_input):
    """
    Return True when the user's input reads like a counting request.

    Detection is a simple case-insensitive substring scan for common
    counting phrases ("count", "how many", "number of", ...).

    Args:
        user_input: Raw text typed by the user.

    Returns:
        True if any counting phrase occurs in the input, else False.
    """
    counting_phrases = ("count", "how many", "number of", "total", "get me a count")
    lowered = user_input.lower()
    return any(phrase in lowered for phrase in counting_phrases)
|
|
|
376 |
|
377 |
+
def process_question_and_detect(user_input, image):
    """
    End-to-end handler wired to the Gradio "Detect and Count" button.

    1. Rephrases the user's input into a single concise prompt (no "?")
       via OpenAI.
    2. Runs VisionAgent object detection with that prompt.
    3. Overlays the detection bounding boxes on the image.
    4. If the input looks like a counting request, also reports the number
       of detected objects.

    Args:
        user_input: Raw text typed by the user.
        image: The uploaded image (numpy array from Gradio), or None.

    Returns:
        A (visualized_image, details_text) pair; (None, message) when no
        image was provided.
    """
    if image is None:
        return None, "Please upload an image."

    # Turn the user's chatter into a detector-friendly prompt.
    generated_prompt = get_single_prompt(user_input)

    # Detect with the generated prompt, then draw the boxes on the image.
    dets = T.agentic_object_detection(generated_prompt, image)
    viz = T.overlay_bounding_boxes(image, dets)

    # Only mention a count when the user actually asked for one.
    count_text = ""
    if is_count_query(user_input):
        count_text = f"Detected {len(dets)} objects."

    output_text = f"Generated prompt: {generated_prompt}\n{count_text}"
    return viz, output_text
|
404 |
|
405 |
+
# Build the Gradio interface.
|
406 |
# Build and launch the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# VisionAgent Object Detection and Counting App")
    gr.Markdown(
        """
        Enter your input (for example:
        - "What is the number of fruit in my image?"
        - "How many bicycles can you see?"
        - "Get me a count of my bottles")
        and upload an image.

        The app uses OpenAI to generate a single, concise prompt for object detection (without question marks),
        then runs the detection. If your input implies a counting request, it will also display the count of detected objects.
        """
    )

    # Input row: free-form text next to the image uploader.
    with gr.Row():
        user_input = gr.Textbox(label="Enter your input", placeholder="Type your input here...")
        image_input = gr.Image(label="Upload Image", type="numpy")

    submit_btn = gr.Button("Detect and Count")

    # Outputs: annotated image plus the generated prompt / count details.
    output_image = gr.Image(label="Detection Result")
    output_text = gr.Textbox(label="Output Details")

    submit_btn.click(
        fn=process_question_and_detect,
        inputs=[user_input, image_input],
        outputs=[output_image, output_text],
    )

demo.launch()
|
433 |
|
434 |
+
|