#!/usr/bin/env python3

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, Qwen2VLForConditionalGeneration
from utils import image_to_base64, rescale_bounding_boxes, draw_bounding_boxes, florence_draw_bboxes
from qwen_vl_utils import process_vision_info
import re
import base64
import os

llms = {
    "Qwen2-1.5B":     {"model": "Qwen/Qwen2-1.5B-Instruct", "prefix": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    "Qwen2-3B":       {"model": "Qwen/Qwen2-3B-Instruct",   "prefix": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    "Qwen2-7B":       {"model": "Qwen/Qwen2-7B-Instruct",   "prefix": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    "Qwen2.5-1.5B":   {"model": "Qwen/Qwen2.5-1.5B-Instruct", "prefix": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    "Qwen2.5-3B":     {"model": "Qwen/Qwen2.5-3B-Instruct",   "prefix": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    "DeepSeek-Coder-1.3B": {"model": "deepseek-ai/deepseek-coder-1.3b-instruct", "prefix": "You are a helpful assistant."},
    "DeepSeek-r1-Qwen-1.5B": {"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "prefix": "You are a helpful assistant."},

}

vlms = {
    "Florence-2-base":   {"model": "microsoft/Florence-2-base", "prefix": "help me"},
    "Florence-2-large":  {"model": "microsoft/Florence-2-large", "prefix": "help me"},
    "Qwen2-vl-2B":   {"model": "Qwen/Qwen2-VL-2B-Instruct", "prefix": "You are a helpfull assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] whith the values beeing scaled to 1000 by 1000 pixels. When there are more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."},
    "Qwen2-vl-7B":   {"model": "Qwen/Qwen2-VL-7B-Instruct", "prefix": "You are a helpfull assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] whith the values beeing scaled to 1000 by 1000 pixels. When there are more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."},
    "Qwen2.5-vl-3B": {"model": "Qwen/Qwen2.5-VL-3B-Instruct", "prefix": "You are a helpfull assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] whith the values beeing scaled to 1000 by 1000 pixels. When there are more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."}
}

# Florence-2 task prompt tokens; the task dropdown is only shown when a Florence model is selected
tasks = ["<OD>", "<OCR>", "<CAPTION>", "<OCR_WITH_REGION>"]

def get_image_base64(image_path):
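    """Return the base64-encoded contents of the image at image_path."""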
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string

# Resolve the logo under ./assets relative to this file and embed it as base64 for the page header
current_dir = os.path.dirname(os.path.abspath(__file__))
image_path = os.path.join(current_dir, "assets", "hailo_logo.gif")
image_base64 = get_image_base64(image_path)

def run_llm(text_input, model_id="Qwen2-1.5B", prefix=None):
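    """Run one turn of the selected instruction-tuned LLM.

    The tokenizer and model are reloaded from the Hugging Face hub on every call,
    and the global `messages` list keeps the conversation history between calls.
    """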
    global messages
    tokenizer = AutoTokenizer.from_pretrained(llms[model_id]["model"], trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(llms[model_id]["model"], trust_remote_code=True)
    
    # Use the provided prefix if available, otherwise fall back to the default
    system_prefix = prefix if prefix is not None else llms[model_id]["prefix"]
    
    if not messages:
        messages = [
            {"role": "system", "content": system_prefix},
            {"role": "user", "content": text_input},
        ]
    else:
        messages.append({"role": "user", "content": text_input})
    

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
    )

    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Keep the assistant reply in the history so follow-up turns see the full conversation.
    messages.append({"role": "assistant", "content": response})

    return response

def run_vlm(image, text_input, model_id="Qwen2-vl-2B", prompt="<OD>", custom_prefix=None):
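    """Run the selected vision-language model on an image.

    Qwen2-VL models are prompted via a chat template and their output is parsed
    for [xmin, ymin, xmax, ymax] bounding boxes; Florence-2 models are prompted
    with a task token (e.g. <OD>) and post-processed by their processor.
    """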
    if "Qwen" in model_id:
        model = Qwen2VLForConditionalGeneration.from_pretrained(vlms[model_id]["model"], torch_dtype="auto", device_map="auto")
    else:
        model = AutoModelForCausalLM.from_pretrained(vlms[model_id]["model"], trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(vlms[model_id]["model"], trust_remote_code=True)
   
    if "Qwen" in model_id:
        # Use custom prefix if provided, otherwise use default from vlms dictionary
        prefix_to_use = custom_prefix if custom_prefix is not None else vlms[model_id]["prefix"]
        
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": f"data:image;base64,{image_to_base64(image)}"},
                    {"type": "text", "text": prefix_to_use},
                    {"type": "text", "text": text_input},
                ],
            }
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
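        # qwen_vl_utils.process_vision_info pulls the image (and any video) entries out of the chat messages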
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)
        generated_ids = model.generate(**inputs, max_new_tokens=256)
        generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        print(output_text)
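        # The system prefix asks Qwen2-VL for boxes on a 1000x1000 grid, so parse every
        # [xmin, ymin, xmax, ymax] group and rescale it to the real image dimensions.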
        pattern = r'\[\s*([.\d]+)\s*,\s*([.\d]+)\s*,\s*([.\d]+)\s*,\s*([.\d]+)\s*\]'
        matches = re.findall(pattern, str(output_text))
        parsed_boxes = [[float(num) for num in match] for match in matches]
        scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)
        print(scaled_boxes)
        draw = draw_bounding_boxes(image, scaled_boxes)
    else:
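        # Florence-2 takes a plain text prompt: the task token (e.g. <OD>) followed by the user text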
        messages = prompt + text_input
        inputs = processor(text=messages, images=image, return_tensors="pt").to(model.device)
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed_answer = processor.post_process_generation(
            generated_text, 
            task=prompt, 
            image_size=(image.width, image.height)
        )
        print(parsed_answer)
        if prompt == '<OD>':
            parsed_boxes = parsed_answer['<OD>']['bboxes']
            draw = florence_draw_bboxes(image, parsed_answer)
            output_text = "None"
        elif prompt == '<OCR>':
            output_text = parsed_answer['<OCR>']
            draw = image
            parsed_boxes = None
        else:
            # Fallback for the remaining Florence-2 tasks (e.g. <CAPTION>, <OCR_WITH_REGION>)
            # so the function always returns defined values.
            output_text = str(parsed_answer.get(prompt, parsed_answer))
            draw = image
            parsed_boxes = None
    
    return output_text, parsed_boxes, draw

# Global conversation history for the LLM tab; filled by run_llm and cleared by reset_conversation
messages = list()
def reset_conversation():
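    """Clear the LLM conversation history."""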
    global messages
    messages = list()

def update_task_dropdown(model):
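    """Show the task dropdown only for Florence models and refresh the prefix textbox for the selected VLM."""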
    if "Florence" in model:
        return [gr.Dropdown(visible=True), gr.Textbox(value=vlms[model]["prefix"])]
    elif model in vlms:
        return [gr.Dropdown(visible=False), gr.Textbox(value=vlms[model]["prefix"])]
    return [gr.Dropdown(visible=False), gr.Textbox(value="")]

def update_prefix_llm(model):
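    """Update the prefix textbox with the default system prompt of the selected LLM."""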
    if model in llms:
        return gr.Textbox(value=llms[model]["prefix"], visible=True)
    return gr.Textbox(visible=True)

with gr.Blocks() as demo:
    gr.Markdown(
    f"""
    <div style="display: flex; align-items: center; gap: 10px;">
        <img src="data:image/gif;base64,{image_base64}" height="40px" style="margin-right: 10px;">
        <h1 style="margin: 0;">LLM & VLM Demo</h1>
    </div>
    
    Pick one of the LLMs or VLMs below to try out the different models.

    <u>Note</u>: the first use of any model takes longer because its weights must be downloaded.
    """)
    with gr.Tab(label="LLM"):
        with gr.Row():
            with gr.Column():
                model_selector = gr.Dropdown(choices=list(llms.keys()), label="Model", value="Qwen2-1.5B")
                text_input = gr.Textbox(label="User Prompt")
                prefix_input = gr.Textbox(label="Prefix", value=llms["Qwen2.5-1.5B"]["prefix"])
                submit_btn = gr.Button(value="Submit", variant='primary')
                reset_btn = gr.Button(value="Reset conversation", variant='stop')
            with gr.Column():
                model_output_text = gr.Textbox(label="Model Output Text")
            model_selector.change(update_prefix_llm, inputs=model_selector, outputs=prefix_input)

        submit_btn.click(run_llm, 
                         [text_input, model_selector, prefix_input], 
                         [model_output_text])
        
        reset_btn.click(reset_conversation)
                        
    with gr.Tab(label="VLM (WIP)"):
        # Taken from https://huggingface.co/spaces/maxiw/Qwen2-VL-Detection/blob/main/app.py
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Image", type="pil", scale=2, height=400)
                model_selector = gr.Dropdown(choices=list(vlms.keys()), label="Model", value="Qwen2-vl-2B")
                task_select = gr.Dropdown(choices=tasks, label="Task", value="<OD>")
                text_input = gr.Textbox(label="User Prompt")
                prefix_input = gr.Textbox(label="Prefix")
                submit_btn = gr.Button(value="Submit", variant='primary')
            with gr.Column():
                model_output_text = gr.Textbox(label="Model Output Text")
                parsed_boxes = gr.Textbox(label="Parsed Boxes")
                annotated_image = gr.Image(label="Annotated Image", scale=2, height=400)
        
            model_selector.change(update_task_dropdown, 
                                inputs=model_selector, 
                                outputs=[task_select, prefix_input])


        submit_btn.click(run_vlm, 
                        [input_img, text_input, model_selector, task_select, prefix_input], 
                        [model_output_text, parsed_boxes, annotated_image])

        

if __name__ == "__main__":
    demo.launch()