update
- app.py +41 -26
- eagle_vl/serve/chat_utils.py +42 -20
- eagle_vl/serve/inference.py +32 -13
app.py CHANGED
@@ -39,7 +39,7 @@ logger = configure_logger()
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, default="…
+    parser.add_argument("--model", type=str, default="Eagle2.5-VL-8B-Preview")
     parser.add_argument(
         "--local-path",
         type=str,
@@ -57,7 +57,7 @@ def fetch_model(model_name: str):
     if args.local_path:
         model_path = args.local_path
     else:
-        model_path = f"…
+        model_path = f"NVEagle/{args.model}"
 
     if model_name in DEPLOY_MODELS:
         model_info = DEPLOY_MODELS[model_name]
@@ -100,6 +100,7 @@ def predict(
     temperature,
     max_length_tokens,
     max_context_length_tokens,
+    video_nframes,
     chunk_size: int = 512,
 ):
     """
@@ -116,18 +117,7 @@ def predict(
         max_context_length_tokens (int): The max context length tokens.
         chunk_size (int): The chunk size.
     """
-
-    try:
-        logger.info("fetching model")
-        model, processor = fetch_model(args.model)
-        logger.info("model fetched")
-        if text == "":
-            yield chatbot, history, "Empty context."
-            return
-    except KeyError:
-        logger.info("no model found")
-        yield [[text, "No Model Found"]], [], "No Model Found"
-        return
+
 
     if images is None:
         images = []
@@ -136,15 +126,33 @@ def predict(
     pil_images = []
     for img_or_file in images:
         try:
+            logger.info(f"img_or_file: {img_or_file}")
             # load as pil image
             if isinstance(images, Image.Image):
                 pil_images.append(img_or_file)
-            …
+            elif isinstance(img_or_file, str):
+                if img_or_file.endswith((".mp4", ".mov", ".avi", ".webm")):
+                    pil_images.append(img_or_file)
+                else:
+                    image = Image.open(img_or_file.name).convert("RGB")
+                    pil_images.append(image)
         except Exception as e:
             print(f"Error loading image: {e}")
 
+
+    print("running the prediction function")
+    try:
+        logger.info("fetching model")
+        model, processor = fetch_model(args.model)
+        logger.info("model fetched")
+        if text == "":
+            yield chatbot, history, "Empty context."
+            return
+    except KeyError:
+        logger.info("no model found")
+        yield [[text, "No Model Found"]], [], "No Model Found"
+        return
+
     # generate prompt
     conversation = generate_prompt_with_history(
         text,
@@ -166,6 +174,7 @@ def predict(
         max_length=max_length_tokens,
         temperature=temperature,
         top_p=top_p,
+        video_nframes=video_nframes,
     ):
         full_response += x
         response = strip_stop_words(full_response, stop_words)
@@ -174,12 +183,12 @@ def predict(
 
         yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..."
 
-    if last_image is not None:
-        vg_image = parse_ref_bbox(response, last_image)
-        if vg_image is not None:
-            vg_base64 = pil_to_base64(vg_image, "vg", max_size=800, min_size=400)
-            gradio_chatbot_output[-1][1] += vg_base64
-            yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..."
+    # if last_image is not None:
+    #     vg_image = parse_ref_bbox(response, last_image)
+    #     if vg_image is not None:
+    #         vg_base64 = pil_to_base64(vg_image, "vg", max_size=800, min_size=400)
+    #         gradio_chatbot_output[-1][1] += vg_base64
+    #         yield gradio_chatbot_output, to_gradio_history(conversation), "Generating..."
 
     logger.info("flushed result to gradio")
 
@@ -202,6 +211,7 @@ def retry(
     temperature,
     max_length_tokens,
     max_context_length_tokens,
+    video_nframes,
     chunk_size: int = 512,
 ):
     """
@@ -226,6 +236,7 @@ def retry(
         temperature,
         max_length_tokens,
         max_context_length_tokens,
+        video_nframes,
         chunk_size,
     )
 
@@ -265,9 +276,10 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
             with gr.Column():
                 # add note no more than 2 images once
                 # gr.Markdown("Note: you can upload no more than 2 images once")
-                upload_images = gr.Files(file_types=["image"], show_label=True)
+                upload_images = gr.Files(file_types=["image", "video"], show_label=True)
                 gallery = gr.Gallery(columns=[3], height="200px", show_label=True)
                 upload_images.change(preview_images, inputs=upload_images, outputs=gallery)
+
         # Parameter Setting Tab for control the generation parameters
         with gr.Tab(label="Parameter Setting"):
             top_p = gr.Slider(minimum=-0, maximum=1.0, value=1.0, step=0.05, interactive=True, label="Top-p")
@@ -280,7 +292,9 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
             max_context_length_tokens = gr.Slider(
                 minimum=512, maximum=16384, value=4096, step=64, interactive=True, label="Max Context Length Tokens"
             )
-
+            video_nframes = gr.Slider(
+                minimum=1, maximum=128, value=16, step=1, interactive=True, label="Video Nframes"
+            )
             show_images = gr.HTML(visible=False)
 
         gr.Examples(
@@ -298,6 +312,7 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
         temperature,
         max_length_tokens,
         max_context_length_tokens,
+        video_nframes
     ]
     output_widgets = [chatbot, history, status_display]
 
@@ -336,7 +351,7 @@ def main(args: argparse.Namespace):
     demo.queue().launch(
         favicon_path=favicon_path,
         server_name=args.ip,
-        server_port=args.port
+        server_port=args.port,
     )
 
 
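For context, the new `video_nframes` slider reaches `predict` and `retry` purely by position: Gradio passes every component in the `inputs` list to the callback as positional arguments, so the slider's slot in the list must line up with the `video_nframes` parameter in each signature. A minimal, self-contained sketch of that wiring pattern (hypothetical component names, not code from this repo):

import gradio as gr

def predict(text, video_nframes, chunk_size: int = 512):
    # Components arrive positionally, in the order of the `inputs` list;
    # trailing parameters with defaults (chunk_size) can stay unwired.
    return f"text={text!r}, nframes={video_nframes}, chunk={chunk_size}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    video_nframes = gr.Slider(minimum=1, maximum=128, value=16, step=1, label="Video Nframes")
    output = gr.Textbox(label="Output")
    gr.Button("Run").click(predict, inputs=[prompt, video_nframes], outputs=output)

demo.queue().launch()
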
eagle_vl/serve/chat_utils.py CHANGED
@@ -13,7 +13,7 @@ import gradio as gr
 import torch
 import os
 from .utils import pil_to_base64
-
+import mimetypes
 IMAGE_TOKEN = "<image>"
 logger = logging.getLogger("gradio_logger")
 
@@ -324,6 +324,7 @@ def convert_conversation_to_prompts(conversation: Conversation):
     Convert the conversation to prompts.
     """
     conv_prompts = []
+
     last_image = None
 
     messages = conversation.messages
@@ -342,34 +343,55 @@ def convert_conversation_to_prompts(conversation: Conversation):
 
 
 def to_gradio_chatbot(conversation: Conversation) -> list:
-    """Convert the conversation to gradio chatbot format."""
+    """Convert the conversation to gradio chatbot format, supporting images and video."""
     ret = []
     for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
+        # User message
         if i % 2 == 0:
-            if …
+            if isinstance(msg, tuple):
+                msg_text, media = copy.deepcopy(msg)
+                media_str = ""
+
+                # Handle list of media items
+                if isinstance(media, list):
+                    items = media
+                else:
+                    items = [media]
+
+                for j, item in enumerate(items):
+                    # If string path, determine type
+                    if isinstance(item, str):
+                        mime, _ = mimetypes.guess_type(item)
+                        with open(item, "rb") as f:
+                            data = f.read()
+                        b64 = base64.b64encode(data).decode()
+
+                        if mime and mime.startswith("image/"):
+                            media_str += (
+                                f'<img src="data:{mime};base64,{b64}" '
+                                f'alt="user upload image_{j}" '
+                                f'style="max-width:300px;height:auto;" />'
+                            )
+                        elif mime and mime.startswith("video/"):
+                            media_str += (
+                                f'<video controls '
+                                f'style="max-width:300px;height:auto;" '
+                                f'src="data:{mime};base64,{b64}"></video>'
                             )
                         else:
-            …
+                            # Fallback to link
+                            media_str += f'<a href="{item}" target="_blank">{item}</a>'
 
-            …
+                    # If PIL image
+                    else:
+                        media_str += pil_to_base64(item, f"user upload image_{j}", max_size=800, min_size=400)
+
+                msg = media_str + msg_text
 
+            # Append user side
             ret.append([msg, None])
         else:
+            # Assistant side, fill previous tuple
            ret[-1][-1] = msg
     return ret
 
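The rewritten `to_gradio_chatbot` above inlines uploads into the chat HTML as base64 data URIs, using the extension-based `mimetypes.guess_type` to pick an `<img>` tag for images, a `<video>` tag for videos, and a plain link as the fallback. The same pattern as a standalone sketch (hypothetical helper name):

import base64
import mimetypes

def media_to_html(path: str, idx: int = 0) -> str:
    # Guess the MIME type from the file extension only, e.g. "video/mp4".
    mime, _ = mimetypes.guess_type(path)
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    if mime and mime.startswith("image/"):
        return f'<img src="data:{mime};base64,{b64}" alt="upload_{idx}" style="max-width:300px;height:auto;" />'
    if mime and mime.startswith("video/"):
        return f'<video controls style="max-width:300px;height:auto;" src="data:{mime};base64,{b64}"></video>'
    # Unknown type: fall back to a plain link.
    return f'<a href="{path}" target="_blank">{path}</a>'

Base64 encoding inflates the payload by roughly a third over the raw bytes, so inlining long videos makes the stored chat state correspondingly heavy.
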
eagle_vl/serve/inference.py CHANGED
@@ -12,7 +12,7 @@ from transformers import (
     StoppingCriteriaList,
     TextIteratorStreamer,
 )
-
+from PIL import Image
 from .chat_utils import Conversation, get_conv_template
 
 logger = logging.getLogger(__name__)
@@ -91,6 +91,7 @@ class StoppingCriteriaSub(StoppingCriteria):
 def preprocess(
     messages: list[dict],
     processor,
+    video_nframes: int = 16,
 ):
     """
     Build messages from the conversations and images.
@@ -110,12 +111,28 @@ def preprocess(
         if "images" in message:
             per_round_images = message["images"]
             for image in per_round_images:
-                …
+                if isinstance(image, Image.Image):
+                    record["content"].append(
+                        {
+                            "type": "image",
+                            "image": image,
+                        }
+                    )
+                elif isinstance(image, str) and image.endswith((".jpeg", ".jpg", ".png", ".gif")):
+                    record["content"].append(
+                        {
+                            "type": "image",
+                            "image": image,
+                        }
+                    )
+                elif isinstance(image, str) and image.endswith((".mp4", ".mov", ".avi", ".webm")):
+                    record["content"].append(
+                        {
+                            "type": "video",
+                            "video": image,
+                            "nframes": video_nframes,
+                        }
+                    )
         if 'content' in message:
             record["content"].append(
                 {
@@ -148,12 +165,12 @@ def preprocess(
         formatted_answer.count(processor.image_token) == 0
     ), f"there should be no {processor.image_token} in the assistant's reply, but got {messages}"
 
-    print(f"messages = {results}")
+    # print(f"messages = {results}")
     text = processor.apply_chat_template(results, add_generation_prompt=False)
-    print(f"raw text = {text}")
+    # print(f"raw text = {text}")
+
+    image_inputs, video_inputs, video_kwargs = processor.process_vision_info(results, return_video_kwargs=True)
 
-    image_inputs, video_inputs = processor.process_vision_info(results)
-
     inputs = processor(
         images=image_inputs,
         videos=video_inputs,
@@ -161,6 +178,7 @@ def preprocess(
         return_tensors="pt",
         padding=True,
         truncation=True,
+        videos_kwargs=video_kwargs,
     )
     return inputs
 
@@ -176,10 +194,11 @@ def eagle_vl_generate(
     temperature: float = 1.0,
     top_p: float = 1.0,
     chunk_size: int = -1,
+    video_nframes: int = 16,
 ):
     # convert conversation to inputs
     print(f"conversations = {conversations}")
-    inputs = preprocess(conversations, processor=processor)
+    inputs = preprocess(conversations, processor=processor, video_nframes=video_nframes)
     inputs = inputs.to(model.device)
 
     return generate(
@@ -202,7 +221,7 @@ def generate(
     temperature: float = 0,
     top_p: float = 0.95,
     stop_words: List[str] = [],
-    chunk_size: int = -1
+    chunk_size: int = -1
 ):
     """Stream the text output from the multimodality model with prompt and image inputs."""
     tokenizer = processor.tokenizer