Harsh1306 committed
Commit f6697b8 · verified · 1 Parent(s): 91e509c

Upload 7 files


# ImageRecogniserConversationalChatbot
The app is designed to identify objects in images and then answer questions about those objects through a conversational chatbot interface. By bridging computer vision and natural language understanding, it serves as a versatile tool for applications including education, tourism, and general information retrieval.
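In a nutshell, the flow implemented in `app.py` (added below) is: caption the image with a vision model, then pass that caption as context to a chat LLM. A condensed, hedged sketch of that flow — the model names and Groq client usage are taken from this commit, while the `answer_about_image` helper is illustrative only:

```python
# Condensed sketch of the pipeline in this commit, not the full app.
from transformers import pipeline
from groq import Groq  # reads GROQ_API_KEY from the environment

captioner = pipeline("image-to-text", model="microsoft/git-large-textcaps")

def answer_about_image(img, question):
    # 1) Identify what is in the image via an image-to-text caption.
    caption = captioner(img)[0]["generated_text"]
    # 2) Answer the question with the caption supplied as context.
    response = Groq().chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": "Answer questions about an image. Caption: " + caption},
            {"role": "user", "content": question},
        ],
    )
    return response.choices[0].message.content
```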

LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Harsh Sanga
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,14 +1,2 @@
- ---
- title: ImageRecogineserConversationalBot
- emoji: 🌍
- colorFrom: blue
- colorTo: blue
- sdk: streamlit
- sdk_version: 1.44.1
- app_file: app.py
- pinned: false
- license: mit
- short_description: ImageRecogniserConversationalChatbot
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # ImageRecogniserConversationalChatbot
+ The app is designed to identify objects in images and then answer questions about those objects through a conversational chatbot interface. By bridging computer vision and natural language understanding, it serves as a versatile tool for applications including education, tourism, and general information retrieval.
__pycache__/GeneriCaptioner.cpython-312.pyc ADDED
Binary file (655 Bytes)

__pycache__/final_captioner.cpython-312.pyc ADDED
Binary file (7.09 kB)
app.py ADDED
@@ -0,0 +1,144 @@
+ import os
+ import json
+ import streamlit as st
+ from groq import Groq
+ from PIL import Image, UnidentifiedImageError, ExifTags
+ import requests
+ from io import BytesIO
+ from transformers import pipeline
+ from final_captioner import generate_final_caption
+ import hashlib
+
+ # Streamlit page title
+ st.title("PicSamvaad : Image Conversational Chatbot")
+
+ # # Load configuration
+ # working_dir = os.path.dirname(os.path.abspath(__file__))
+ # config_data = json.load(open(f"{working_dir}/config.json"))
+ # GROQ_API_KEY = config_data["GROQ_API_KEY"]
+
+ # The Groq client reads GROQ_API_KEY from the environment
+ # (e.g., a Hugging Face Space secret), so no explicit assignment is needed.
+ client = Groq()
+
+ # Sidebar for image upload and URL input
+ with st.sidebar:
+     st.header("Upload Image or Enter URL")
+
+     uploaded_file = st.file_uploader(
+         "Upload an image to chat...", type=["jpg", "jpeg", "png"]
+     )
+     url = st.text_input("Or enter a valid image URL...")
+
+ image = None
+ error_message = None
+
+
+ def correct_image_orientation(img):
+     # Rotate the image according to its EXIF Orientation tag, if present.
+     try:
+         for orientation in ExifTags.TAGS.keys():
+             if ExifTags.TAGS[orientation] == "Orientation":
+                 break
+         exif = img._getexif()
+         if exif is not None:
+             orientation = exif[orientation]
+             if orientation == 3:
+                 img = img.rotate(180, expand=True)
+             elif orientation == 6:
+                 img = img.rotate(270, expand=True)
+             elif orientation == 8:
+                 img = img.rotate(90, expand=True)
+     except (AttributeError, KeyError, IndexError):
+         pass
+     return img
+
+
+ def get_image_hash(image):
+     # Generate a unique hash for the image so a new upload can be detected
+     img_bytes = image.tobytes()
+     return hashlib.md5(img_bytes).hexdigest()
+
+
+ # Check if a new image or URL has been provided and reset chat history
+ if "last_uploaded_hash" not in st.session_state:
+     st.session_state.last_uploaded_hash = None
+
+ if uploaded_file is not None:
+     image = Image.open(uploaded_file)
+     image_hash = get_image_hash(image)
+
+     if st.session_state.last_uploaded_hash != image_hash:
+         st.session_state.chat_history = []  # Clear chat history
+         st.session_state.last_uploaded_hash = image_hash  # Update last uploaded hash
+
+     image = correct_image_orientation(image)
+     st.image(image, caption="Uploaded Image.", use_column_width=True)
+
+ elif url:
+     try:
+         response = requests.get(url)
+         response.raise_for_status()  # Check if the request was successful
+         image = Image.open(BytesIO(response.content))
+         image_hash = get_image_hash(image)
+
+         if st.session_state.last_uploaded_hash != image_hash:
+             st.session_state.chat_history = []  # Clear chat history
+             st.session_state.last_uploaded_hash = image_hash  # Update last uploaded hash
+
+         image = correct_image_orientation(image)
+         st.image(image, caption="Image from URL.", use_column_width=True)
+     except (requests.exceptions.RequestException, UnidentifiedImageError):
+         image = None
+         error_message = (
+             "Error: The provided URL is invalid or the image could not be loaded. "
+             "Some image URLs do not work; try uploading the downloaded image instead."
+         )
+
+ caption = ""
+ if image is not None:
+     caption += generate_final_caption(image)
+     st.write("ChatBot : " + caption)
+
+ # Display error message if any
+ if error_message:
+     st.error(error_message)
+
+ # Initialize chat history in Streamlit session state if not already present
+ if "chat_history" not in st.session_state:
+     st.session_state.chat_history = []
+
+ # Display chat history
+ for message in st.session_state.chat_history:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ # Input field for the user's message
+ user_prompt = st.chat_input("Ask the Chatbot about the image...")
+
+ if user_prompt:
+     st.chat_message("user").markdown(user_prompt)
+     st.session_state.chat_history.append({"role": "user", "content": user_prompt})
+
+     # Send the user's message to the LLM and get a response
+     messages = [
+         {
+             "role": "system",
+             "content": "You are a helpful, accurate image conversational assistant. "
+             "You don't hallucinate, and your answers are very precise and have a "
+             "positive approach. The caption of the image is: " + caption,
+         },
+         *st.session_state.chat_history,
+     ]
+
+     response = client.chat.completions.create(
+         model="llama-3.1-8b-instant", messages=messages
+     )
+
+     assistant_response = response.choices[0].message.content
+     st.session_state.chat_history.append(
+         {"role": "assistant", "content": assistant_response}
+     )
+
+     # Display the LLM's response
+     with st.chat_message("assistant"):
+         st.markdown(assistant_response)
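Note on key handling: the `config.json` loading above is commented out, so the Groq client must find `GROQ_API_KEY` in the environment (on a Hugging Face Space, a repository secret is exposed that way). A minimal local-run sketch, assuming you supply your own key — the placeholder value and launcher script are hypothetical, not part of this commit:

```python
# Hypothetical local launcher; on Spaces the secret is injected automatically.
import os
import subprocess

os.environ.setdefault("GROQ_API_KEY", "YOUR-GROQ-KEY")  # placeholder, not a real key
subprocess.run(["streamlit", "run", "app.py"], check=True)
```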
final_captioner.py ADDED
@@ -0,0 +1,254 @@
+ from tensorflow.keras.preprocessing import image
+ import tensorflow as tf
+ from tensorflow.keras.models import load_model
+ import numpy as np
+ from transformers import pipeline
+ import gdown
+ import os
+
+ # Generic captioner: Microsoft's GIT model produces a free-form caption.
+ git_pipe = pipeline("image-to-text", model="microsoft/git-large-textcaps")
+
+ # Specialist classifiers are downloaded from Google Drive on first run.
+ flower_output = "Flower_classifier.h5"
+ flower_model_id = "1AlBunIPDg4HYYCqhcHtOiXxnPFhmsoSn"
+ flower_url = f"https://drive.google.com/uc?id={flower_model_id}"
+ if not os.path.exists(flower_output):
+     gdown.download(flower_url, flower_output, quiet=False)
+ flower_model = load_model(flower_output)
+ flower_model.summary()
+
+ bird_output = "Bird_classifier.h5"
+ bird_model_id = "1a6vqFERbrr_Cw-NyBqVHG7fsjU2-xKJ4"
+ bird_url = f"https://drive.google.com/uc?id={bird_model_id}"
+ if not os.path.exists(bird_output):
+     gdown.download(bird_url, bird_output, quiet=False)
+ bird_model = load_model(bird_output)
+ bird_model.summary()
+
+ dog_output = "DogClassifier.h5"
+ dog_model_id = "1UFn1NGVtP5rhvcWnAANQ_4E9YRJvDEad"
+ dog_url = f"https://drive.google.com/uc?id={dog_model_id}"
+ if not os.path.exists(dog_output):
+     gdown.download(dog_url, dog_output, quiet=False)
+ dog_model = load_model(dog_output)
+ dog_model.summary()
+
+ landmark_output = "LandmarkClassifierV5.h5"
+ landmark_model_id = "1PXixJsrUaVcHEEC-jDlv4tHT2qrCrf5c"
+ landmark_url = f"https://drive.google.com/uc?id={landmark_model_id}"
+ if not os.path.exists(landmark_output):
+     gdown.download(landmark_url, landmark_output, quiet=False)
+ landmark_model = load_model(landmark_output)
+ landmark_model.summary()
+
+ # Class labels, in the order each model was trained on.
+ dog_list = [
+     "Bulldog",
+     "Chihuahua (dog breed)",
+     "Dobermann",
+     "German Shepherd",
+     "Golden Retriever",
+     "Husky",
+     "Labrador Retriever",
+     "Pomeranian dog",
+     "Pug",
+     "Rottweiler",
+     "Street dog",
+ ]
+ flower_list = [
+     "Jasmine",
+     "Lavender",
+     "Lily",
+     "Lotus",
+     "Orchid",
+     "Rose",
+     "Sunflower",
+     "Tulip",
+     "daisy",
+     "dandelion",
+ ]
+ bird_list = [
+     "Crow",
+     "Eagle",
+     "Flamingo",
+     "Hummingbird",
+     "Parrot",
+     "Peacock",
+     "Pigeon",
+     "Sparrow",
+     "Swan",
+ ]
+ landmark_list = [
+     "The Agra Fort",
+     "Ajanta Caves",
+     "Alai Darwaza",
+     "Amarnath Temple",
+     "The Amber Fort",
+     "Basilica of Bom Jesus",
+     "Brihadisvara Temple",
+     "Charar-e-Sharief shrine",
+     "Charminar",
+     "Chhatrapati Shivaji Terminus",
+     "Chota Imambara",
+     "Dal Lake",
+     "The Elephanta Caves",
+     "Ellora Caves",
+     "Fatehpur Sikri",
+     "Gateway of India",
+     "Ghats in Varanasi",
+     "Gol Gumbaz",
+     "Golden Temple",
+     "Group of Monuments at Mahabalipuram",
+     "Hampi",
+     "Hawa Mahal",
+     "Humayun's Tomb",
+     "The India gate",
+     "Iron Pillar",
+     "Jagannath Temple, Puri",
+     "Jageshwar",
+     "Jama Masjid",
+     "Jamali Kamali Tomb",
+     "Jantar Mantar, Jaipur",
+     "Jantar Mantar, New Delhi",
+     "Kedarnath Temple",
+     "Khajuraho Temple",
+     "Konark Sun Temple",
+     "Mahabodhi Temple",
+     "Meenakshi Temple",
+     "Nalanda mahavihara",
+     "Parliament House, New Delhi",
+     "Qutb Minar",
+     "Qutb Minar Complex",
+     "Ram Mandir",
+     "Rani ki Vav",
+     "Rashtrapati Bhavan",
+     "The Red Fort",
+     "Sanchi",
+     "Supreme Court of India",
+     "Swaminarayan Akshardham (Delhi)",
+     "Taj Hotels",
+     "The Lotus Temple",
+     "The Mysore Palace",
+     "The Statue of Unity",
+     "The Taj Mahal",
+     "Vaishno Devi Temple",
+     "Venkateswara Temple, Tirumala",
+     "Victoria Memorial, Kolkata",
+     "Vivekananda Rock Memorial",
+ ]
+
+
+ def identify_dog(img):
+     # Preprocess the image
+     img = img.resize((224, 224))
+     img_array = image.img_to_array(img)
+     img_array = np.expand_dims(img_array, axis=0)
+     img_array /= 255.0
+
+     # Get predictions and map the most probable class to its label
+     predictions = dog_model.predict(img_array)
+     predicted_class_index = np.argmax(predictions[0])
+     return dog_list[predicted_class_index]
+
+
+ def identify_flower(img):
+     # Preprocess the image
+     img = img.resize((224, 224))
+     img_array = image.img_to_array(img)
+     img_array = np.expand_dims(img_array, axis=0)
+     img_array /= 255.0
+
+     # Get predictions and map the most probable class to its label
+     predictions = flower_model.predict(img_array)
+     predicted_class_index = np.argmax(predictions[0])
+     return flower_list[predicted_class_index]
+
+
+ def identify_bird(img):
+     # Preprocess the image
+     img = img.resize((224, 224))
+     img_array = image.img_to_array(img)
+     img_array = np.expand_dims(img_array, axis=0)
+     img_array /= 255.0
+
+     # Get predictions and map the most probable class to its label
+     predictions = bird_model.predict(img_array)
+     predicted_class_index = np.argmax(predictions[0])
+     return bird_list[predicted_class_index]
+
+
+ def identify_landmark(img):
+     # Preprocess the image
+     img = img.resize((224, 224))
+     img_array = image.img_to_array(img)
+     img_array = np.expand_dims(img_array, axis=0)
+     img_array /= 255.0
+
+     # Get predictions and map the most probable class to its label
+     predictions = landmark_model.predict(img_array)
+     predicted_class_index = np.argmax(predictions[0])
+     return landmark_list[predicted_class_index]
+
+
+ def generate_final_caption(image):
+     # Caption the image with the generic GIT pipeline.
+     caption_dict = git_pipe(image)
+     caption = caption_dict[0]["generated_text"]
+     image = image.resize((256, 256))
+
+     # Trim trailing text-reading phrases from the caption.
+     phrases_to_cut = ["with the word", "that says"]
+     for phrase in phrases_to_cut:
+         index = caption.find(phrase)
+         if index != -1:
+             caption = caption[:index].strip()
+
+     # Route to a specialist classifier based on caption keywords.
+     if (
+         "building" in caption.lower()
+         or "monument" in caption.lower()
+         or "tower" in caption.lower()
+     ):
+         caption += "\nThe landmark is : " + identify_landmark(image)
+     elif "flower" in caption.lower() or "flowers" in caption.lower():
+         caption += "\nThe Flower is : " + identify_flower(image)
+     elif "dog" in caption.lower() or "puppy" in caption.lower():
+         caption += "\nThe Dog is : " + identify_dog(image)
+     elif "birds" in caption.lower() or "bird" in caption.lower():
+         caption += "\nThe Bird is : " + identify_bird(image)
+     return caption
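A quick check of the keyword routing above: if the generic GIT caption mentions a building, monument, tower, flower, dog, or bird, the matching specialist classifier's label is appended to the caption. A hedged usage sketch — the image path is hypothetical, and it assumes the Google Drive model downloads succeeded:

```python
# Minimal smoke test for final_captioner.generate_final_caption.
from PIL import Image
from final_captioner import generate_final_caption

img = Image.open("sample_rose.jpg").convert("RGB")  # hypothetical test image
print(generate_final_caption(img))
# Expected shape of the output, e.g.:
# "a pink flower on a green bush\nThe Flower is : Rose"
```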
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ streamlit
+ pillow
+ requests
+ matplotlib
+ tensorflow
+ transformers
+ torch
+ tf-keras
+ easygoogletranslate
+ groq
+ gdown