Update app.py
Browse files
app.py
CHANGED
@@ -329,178 +329,106 @@
|
|
329 |
# demo.launch(share=True)
|
330 |
|
331 |
import os
|
332 |
-
import
|
333 |
-
import io
|
334 |
-
import uuid
|
335 |
-
import contextlib
|
336 |
import gradio as gr
|
337 |
-
|
338 |
-
import shutil
|
339 |
|
340 |
-
#
|
341 |
-
|
342 |
|
343 |
-
|
344 |
-
from vision_agent.models import AgentMessage
|
345 |
-
|
346 |
-
#############################################
|
347 |
-
# GLOBAL INITIALIZATION
|
348 |
-
#############################################
|
349 |
-
|
350 |
-
# Create a unique temporary directory for saved images
|
351 |
-
TEMP_DIR = "temp_images"
|
352 |
-
if not os.path.exists(TEMP_DIR):
|
353 |
-
os.makedirs(TEMP_DIR)
|
354 |
-
|
355 |
-
# Initialize VisionAgentCoderV2 with verbose logging so the generated code has detailed print outputs.
|
356 |
-
agent = VisionAgentCoderV2(verbose=True)
|
357 |
-
|
358 |
-
#############################################
|
359 |
-
# UTILITY: SAVE UPLOADED IMAGE TO A TEMP FILE
|
360 |
-
#############################################
|
361 |
-
|
362 |
-
def save_uploaded_image(image):
|
363 |
"""
|
364 |
-
|
365 |
-
|
366 |
"""
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
376 |
|
377 |
-
def
|
378 |
"""
|
379 |
-
|
380 |
-
|
381 |
-
Returns a list of the extracted filenames.
|
382 |
"""
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
#############################################
|
389 |
|
390 |
-
def
|
391 |
"""
|
392 |
-
|
393 |
-
|
394 |
-
|
|
|
395 |
"""
|
396 |
-
# Parse the code for image filenames saved via save_image
|
397 |
-
filenames = parse_saved_image_filenames(code_str)
|
398 |
-
|
399 |
-
# Capture stdout using a StringIO buffer
|
400 |
-
buf = io.StringIO()
|
401 |
-
with contextlib.redirect_stdout(buf):
|
402 |
-
# IMPORTANT: Here we exec the generated code.
|
403 |
-
exec(code_str, globals(), locals())
|
404 |
-
|
405 |
-
# Gather all printed output
|
406 |
-
output = buf.getvalue()
|
407 |
-
|
408 |
-
# Check which of the parsed filenames exist on disk (prepend TEMP_DIR if needed)
|
409 |
-
existing_images = []
|
410 |
-
for fn in filenames:
|
411 |
-
# If filename is not an absolute path, assume it is in TEMP_DIR
|
412 |
-
if not os.path.isabs(fn):
|
413 |
-
fn = os.path.join(TEMP_DIR, fn)
|
414 |
-
if os.path.exists(fn):
|
415 |
-
existing_images.append(fn)
|
416 |
-
return output, existing_images
|
417 |
-
|
418 |
-
#############################################
|
419 |
-
# CHAT FUNCTION: PROCESS USER PROMPT & IMAGE
|
420 |
-
#############################################
|
421 |
-
|
422 |
-
def chat(prompt, image, history):
|
423 |
-
"""
|
424 |
-
When the user sends a prompt and optionally an image, do the following:
|
425 |
-
1. Save the image to a temp file.
|
426 |
-
2. Use VisionAgentCoderV2 to generate code for the task.
|
427 |
-
3. Execute the generated code, capturing its stdout logs and any saved image files.
|
428 |
-
4. Append the logs and image gallery info to the conversation history.
|
429 |
-
"""
|
430 |
-
# Validate that an image was provided.
|
431 |
if image is None:
|
432 |
-
|
433 |
-
return history, None
|
434 |
-
|
435 |
-
# Save the uploaded image for use in the generated code.
|
436 |
-
image_path = save_uploaded_image(image)
|
437 |
-
|
438 |
-
# Generate the code with VisionAgent using the user prompt and the image filename.
|
439 |
-
code_context = agent.generate_code(
|
440 |
-
[
|
441 |
-
AgentMessage(
|
442 |
-
role="user",
|
443 |
-
content=prompt,
|
444 |
-
media=[image_path]
|
445 |
-
)
|
446 |
-
]
|
447 |
-
)
|
448 |
|
449 |
-
#
|
450 |
-
|
451 |
-
|
452 |
-
# Run the generated code and capture output and any saved images.
|
453 |
-
stdout_text, image_files = run_and_capture_with_images(generated_code)
|
454 |
|
455 |
-
#
|
456 |
-
|
457 |
-
if image_files:
|
458 |
-
response_text += "\n**Saved Images:** " + ", ".join(image_files)
|
459 |
-
else:
|
460 |
-
response_text += "\nNo images were saved by the generated code."
|
461 |
|
462 |
-
#
|
463 |
-
|
464 |
|
465 |
-
#
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
|
|
|
|
471 |
|
|
|
472 |
with gr.Blocks() as demo:
|
473 |
-
gr.Markdown("# VisionAgent
|
474 |
gr.Markdown(
|
475 |
"""
|
476 |
-
|
477 |
-
|
478 |
-
|
|
|
|
|
|
|
|
|
|
|
479 |
"""
|
480 |
)
|
481 |
|
482 |
with gr.Row():
|
483 |
-
|
484 |
-
|
485 |
-
prompt_input = gr.Textbox(label="Enter Prompt", placeholder="e.g., Count the number of cacao oranges in the image")
|
486 |
-
submit_btn = gr.Button("Send")
|
487 |
-
with gr.Column(scale=5):
|
488 |
-
image_input = gr.Image(label="Upload Image", type="numpy")
|
489 |
|
490 |
-
|
491 |
|
492 |
-
|
493 |
-
|
494 |
|
495 |
-
|
496 |
-
def user_chat_wrapper(prompt, image, history):
|
497 |
-
history = history or []
|
498 |
-
history, image_files = chat(prompt, image, history)
|
499 |
-
return history, image_files
|
500 |
-
|
501 |
-
submit_btn.click(fn=user_chat_wrapper, inputs=[prompt_input, image_input, chatbot], outputs=[chatbot, gallery])
|
502 |
-
|
503 |
-
clear_btn.click(lambda: ([], None), None, [chatbot, gallery])
|
504 |
|
505 |
demo.launch()
|
506 |
|
|
|
|
329 |
# demo.launch(share=True)
|
330 |
|
331 |
import os
|
332 |
+
import openai
|
|
|
|
|
|
|
333 |
import gradio as gr
|
334 |
+
import vision_agent.tools as T
|
|
|
335 |
|
336 |
+
# Set your OpenAI API key (ensure the environment variable is set or replace with your key)
|
337 |
+
openai.api_key = os.getenv("OPENAI_API_KEY", "your-openai-api-key-here")
|
338 |
|
339 |
+
def get_single_prompt(user_input):
    """
    Rephrase the user's free-form input into a single, concise prompt for
    object detection, using OpenAI.

    The returned prompt is guaranteed to contain no question marks, since the
    downstream detector expects a declarative phrase.

    Args:
        user_input: Raw text typed by the user; may be empty/whitespace.

    Returns:
        A concise detection prompt string with all "?" characters removed.
    """
    # Fall back to a generic task when the user typed nothing useful.
    if not user_input.strip():
        user_input = "Detect objects in the image"

    prompt_instruction = (
        f"Based on the following user input, generate a single, concise prompt for object detection. "
        f"Do not include any question marks in the output. "
        f"User input: \"{user_input}\""
    )

    # FIX: the legacy `openai.Completion.create` endpoint and the
    # `text-davinci-003` model were removed/retired by OpenAI. Use the Chat
    # Completions API instead; the module-level client picks up
    # `openai.api_key` set at import time.
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt_instruction}],
        max_tokens=50,
        n=1,
        temperature=0.3,
    )
    generated_prompt = response.choices[0].message.content.strip()
    # Belt-and-braces: strip any question marks the model still emitted.
    generated_prompt = generated_prompt.replace("?", "")
    return generated_prompt
|
365 |
|
366 |
+
def is_count_query(user_input):
    """
    Return True when the user's input reads like a counting request.

    Detection is a simple case-insensitive substring scan for common
    counting phrases ("count", "how many", "number of", ...).

    Args:
        user_input: Raw text typed by the user.

    Returns:
        True if any counting phrase occurs in the input, else False.
    """
    counting_phrases = ("count", "how many", "number of", "total", "get me a count")
    lowered = user_input.lower()
    return any(phrase in lowered for phrase in counting_phrases)
|
|
|
376 |
|
377 |
+
def process_question_and_detect(user_input, image):
    """
    End-to-end handler wired to the Gradio "Detect and Count" button.

    1. Rephrases the user's input into a single concise prompt (no "?")
       via OpenAI.
    2. Runs VisionAgent object detection with that prompt.
    3. Overlays the detection bounding boxes on the image.
    4. If the input looks like a counting request, also reports the number
       of detected objects.

    Args:
        user_input: Raw text typed by the user.
        image: The uploaded image (numpy array from Gradio), or None.

    Returns:
        A (visualized_image, details_text) pair; (None, message) when no
        image was provided.
    """
    if image is None:
        return None, "Please upload an image."

    # Turn the user's chatter into a detector-friendly prompt.
    generated_prompt = get_single_prompt(user_input)

    # Detect with the generated prompt, then draw the boxes on the image.
    dets = T.agentic_object_detection(generated_prompt, image)
    viz = T.overlay_bounding_boxes(image, dets)

    # Only mention a count when the user actually asked for one.
    count_text = ""
    if is_count_query(user_input):
        count_text = f"Detected {len(dets)} objects."

    output_text = f"Generated prompt: {generated_prompt}\n{count_text}"
    return viz, output_text
|
404 |
|
405 |
+
# Build the Gradio interface.
|
406 |
# Build and launch the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# VisionAgent Object Detection and Counting App")
    gr.Markdown(
        """
        Enter your input (for example:
        - "What is the number of fruit in my image?"
        - "How many bicycles can you see?"
        - "Get me a count of my bottles")
        and upload an image.

        The app uses OpenAI to generate a single, concise prompt for object detection (without question marks),
        then runs the detection. If your input implies a counting request, it will also display the count of detected objects.
        """
    )

    # Input row: free-form text next to the image uploader.
    with gr.Row():
        user_input = gr.Textbox(label="Enter your input", placeholder="Type your input here...")
        image_input = gr.Image(label="Upload Image", type="numpy")

    submit_btn = gr.Button("Detect and Count")

    # Outputs: annotated image plus the generated prompt / count details.
    output_image = gr.Image(label="Detection Result")
    output_text = gr.Textbox(label="Output Details")

    submit_btn.click(
        fn=process_question_and_detect,
        inputs=[user_input, image_input],
        outputs=[output_image, output_text],
    )

demo.launch()
|
433 |
|
434 |
+
|