import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import scipy.io.wavfile as wavfile
from transformers import pipeline

# Load the text-to-speech and object-detection pipelines (model weights download on first use)
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# Compute IoU between one box (dict with xmin/ymin/xmax/ymax) and an
# array of boxes (N x 4, in [xmin, ymin, xmax, ymax] order)
def compute_iou(box1, boxes):
    x1 = np.maximum(box1['xmin'], boxes[:, 0])
    y1 = np.maximum(box1['ymin'], boxes[:, 1])
    x2 = np.minimum(box1['xmax'], boxes[:, 2])
    y2 = np.minimum(box1['ymax'], boxes[:, 3])

    intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
    box1_area = (box1['xmax'] - box1['xmin']) * (box1['ymax'] - box1['ymin'])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    union = box1_area + boxes_area - intersection
    return intersection / union
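
# Quick sanity check (illustrative values, not part of the app): two boxes of
# area 100 overlapping in a 5x5 region have IoU = 25 / (100 + 100 - 25):
#   compute_iou({'xmin': 0, 'ymin': 0, 'xmax': 10, 'ymax': 10},
#               np.array([[5, 5, 15, 15]]))
#   -> array([0.14285714])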

# Non-Maximum Suppression: greedily keep the highest-scoring detection and
# drop any remaining detection that overlaps it by more than iou_threshold
def nms(detections, iou_threshold=0.5):
    if len(detections) == 0:
        return []

    boxes = np.array([[d['box']['xmin'], d['box']['ymin'], d['box']['xmax'], d['box']['ymax']] for d in detections])
    scores = np.array([d['score'] for d in detections])
    indices = np.argsort(scores)[::-1]

    keep = []
    while len(indices) > 0:
        current = indices[0]
        keep.append(current)
        rest = indices[1:]

        ious = compute_iou({
            'xmin': boxes[current, 0],
            'ymin': boxes[current, 1],
            'xmax': boxes[current, 2],
            'ymax': boxes[current, 3]
        }, boxes[rest])
        
        indices = rest[np.where(ious < iou_threshold)[0]]

    return [detections[i] for i in keep]
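
# Example (illustrative): two heavily overlapping detections; their IoU is
# 81/119 ~= 0.68 > 0.5, so NMS keeps only the higher-scoring one:
#   nms([{'score': 0.9, 'box': {'xmin': 0, 'ymin': 0, 'xmax': 10, 'ymax': 10}},
#        {'score': 0.6, 'box': {'xmin': 1, 'ymin': 1, 'xmax': 11, 'ymax': 11}}])
#   -> [{'score': 0.9, 'box': {...}}]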

# Function to synthesize speech from text and save it as a WAV file
def generate_audio(text):
    # The TTS pipeline returns {"audio": array of shape (1, n_samples), "sampling_rate": int}
    speech = narrator(text)
    wavfile.write("output.wav", rate=speech["sampling_rate"], data=speech["audio"][0])
    return "output.wav"

# Function to read and summarize detected objects
def read_objects(detection_objects):
    object_counts = {}
    for detection in detection_objects:
        label = detection['label']
        object_counts[label] = object_counts.get(label, 0) + 1

    if not object_counts:
        return "This picture contains no recognizable objects."

    response = "This picture contains"
    labels = list(object_counts.keys())
    for i, label in enumerate(labels):
        response += f" {object_counts[label]} {label}"
        if object_counts[label] > 1:
            response += "s"  # naive pluralization (e.g. "persons")
        if i < len(labels) - 2:
            response += ","
        elif i == len(labels) - 2:
            response += " and"
    response += "."
    return response
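
# e.g. counts of {'cat': 2, 'dog': 1} yield:
#   "This picture contains 2 cats and 1 dog."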

# Function to draw bounding boxes on the image
def draw_bounding_boxes(image, detections):
    draw_image = image.copy()
    draw = ImageDraw.Draw(draw_image)
    font = ImageFont.load_default()

    for detection in detections:
        box = detection['box']
        xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)

        label = detection['label']
        score = detection['score']
        text = f"{label}: {score:.2f}"
        text_size = draw.textbbox((xmin, ymin), text, font=font)
        draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
        draw.text((xmin, ymin), text, fill="white", font=font)

    return draw_image

# Main function to process the image
def detect_object(image):
    detections = object_detector(image)

    # Apply confidence threshold and NMS
    confidence_threshold = 0.5
    filtered_detections = [d for d in detections if d['score'] > confidence_threshold]
    filtered_detections = nms(filtered_detections)

    processed_image = draw_bounding_boxes(image, filtered_detections)
    description_text = read_objects(filtered_detections)
    processed_audio = generate_audio(description_text)
    return processed_image, processed_audio
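
# Example (illustrative; "photo.jpg" is a placeholder path):
#   processed_img, wav_path = detect_object(Image.open("photo.jpg"))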

app_description = """
Upload an image to detect objects and hear a natural language description.
### Credits:
Developed by Taizun S
"""

# Google Analytics script
ga_script = """
<script async src="https://www.googletagmanager.com/gtag/js?id=G-WEYXHDZ3GQ"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  gtag('config', 'G-WEYXHDZ3GQ');
</script>
"""

# Use Gradio Blocks to organize the layout. The analytics snippet is passed
# via the `head` parameter (Gradio 4+), since <script> tags placed inside
# gr.HTML are generally not executed by the browser.
with gr.Blocks(head=ga_script) as demo:
    gr.Markdown(app_description)  # Render the app description as Markdown
    
    # Define the Interface components within Blocks
    gr.Interface(
        fn=detect_object,
        inputs=gr.Image(label="Upload an Image", type="pil"),
        outputs=[
            gr.Image(label="Processed Image", type="pil"),
            gr.Audio(label="Generated Audio")
        ],
        title="Multi-Object Detection with Audio Narration",
    )

# Launch the Blocks interface
demo.launch()