Implemented Google Search and Images Functionality
- .DS_Store +0 -0
- Search/__pycache__/gettyimages.cpython-312.pyc +0 -0
- Search/__pycache__/main.cpython-312.pyc +0 -0
- Search/__pycache__/useragentka.cpython-312.pyc +0 -0
- Search/gettyimages.py +21 -0
- Search/main.py +163 -0
- Search/useragentka.py +20 -0
- __pycache__/app.cpython-312.pyc +0 -0
- __pycache__/config.cpython-312.pyc +0 -0
- __pycache__/prompts.cpython-312.pyc +0 -0
- app.py +36 -80
- logs.json +0 -1
- test.py +18 -0
- utils/__pycache__/logger.cpython-312.pyc +0 -0
.DS_Store
ADDED
Binary file (8.2 kB)

Search/__pycache__/gettyimages.cpython-312.pyc
ADDED
Binary file (1.03 kB)

Search/__pycache__/main.cpython-312.pyc
ADDED
Binary file (6.96 kB)

Search/__pycache__/useragentka.cpython-312.pyc
ADDED
Binary file (1.68 kB)
Search/gettyimages.py
ADDED
@@ -0,0 +1,21 @@
from curl_cffi import requests
from bs4 import BeautifulSoup


def get_images(query):
    # Fetch the Getty Images editorial search page for the query,
    # impersonating Chrome so the request is not blocked.
    res = requests.get(
        f'https://www.gettyimages.in/search/2/image?phrase={query}&family=editorial',
        impersonate='chrome110'
    )

    soup = BeautifulSoup(res.text, 'html.parser')

    images = soup.find_all('img')

    results = []

    for image in images:
        src = image.get('src', '')
        print(src)
        # Keep only images actually hosted on Getty's media CDN.
        if src.startswith('https://media.gettyimages.com'):
            results.append({'src': src, 'alt': image.get('alt', ''), 'class': ''})

    return results
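A quick way to sanity-check Search/gettyimages.py in isolation is a small driver script like the one below. It is only an illustrative sketch, not part of the commit; the query string is made up, and it assumes curl_cffi and beautifulsoup4 are installed and that it is run from the repository root so the Search package is importable.

# Illustrative sketch (not part of the commit): call get_images() directly.
from Search.gettyimages import get_images

if __name__ == "__main__":
    images = get_images("northern lights")  # example query, chosen arbitrarily
    print(f"collected {len(images)} Getty-hosted images")
    for item in images[:3]:
        print(item["src"], "-", item["alt"])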
Search/main.py
ADDED
@@ -0,0 +1,163 @@
"""googlesearch is a Python library for searching Google, easily."""
from time import sleep
from bs4 import BeautifulSoup
from requests import get
from urllib.parse import unquote  # to decode the url
from Search.useragentka import get_useragent
from curl_cffi import requests as curlreq
from Search.gettyimages import get_images


def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
    resp = get(
        url="https://www.google.com/search",
        headers={
            "User-Agent": get_useragent(),
            "Accept": "*/*"
        },
        params={
            "q": term,
            "num": results + 2,  # Prevents multiple requests
            "hl": lang,
            "start": start,
            "safe": safe,
            "gl": region,
        },
        proxies=proxies,
        timeout=timeout,
        verify=ssl_verify,
        cookies={
            'CONSENT': 'PENDING+987',  # Bypasses the consent page
            'SOCS': 'CAESHAgBEhIaAB',
        }
    )
    resp.raise_for_status()
    return resp


class SearchResult:
    def __init__(self, url, title, description):
        self.url = url
        self.title = title
        self.description = description

    def __repr__(self):
        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"


def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
    """Search the Google search engine"""

    # Proxy setup
    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None

    start = start_num
    fetched_results = 0
    fetched_links = set()
    results_list = []
    image_results = []  # New list for image results

    while fetched_results < num_results:
        # Send request
        resp = _req(term, num_results - start,
                    lang, start, proxies, timeout, safe, ssl_verify, region)

        # Parse
        soup = BeautifulSoup(resp.text, "html.parser")
        result_block = soup.find_all("div", class_="ezO2md")
        new_results = 0

        # Collect every <img> tag on the results page
        try:
            all_images = soup.find_all("img")
            for img in all_images:
                img_src = img.get("src") or img.get("data-src")
                if img_src:
                    # Handle base64 images
                    if img_src.startswith("data:image"):
                        image_results.append({
                            "src": img_src,  # Already base64 encoded
                            "alt": img.get("alt", ""),
                            "class": img.get("class", []),
                        })
                    # Handle regular image URLs
                    elif img_src.startswith("http"):
                        image_results.append({
                            "src": img_src,
                            "alt": img.get("alt", ""),
                            "class": img.get("class", []),
                        })
        except Exception as e:
            print(f"Error parsing images: {str(e)}")

        for result in result_block:
            link_tag = result.find("a", href=True)
            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
            description_tag = result.find("span", class_="FrIlee")

            if link_tag and title_tag and description_tag:
                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
                if link in fetched_links and unique:
                    continue
                fetched_links.add(link)
                title = title_tag.text if title_tag else ""
                description = description_tag.text if description_tag else ""

                # Only get page_text if advanced mode and we haven't gotten any yet
                if advanced and not any('page_text' in result for result in results_list):
                    try:
                        page_scrape = curlreq.get(link, impersonate='chrome110')
                        page_scrape.encoding = 'utf-8'
                        page_soup = BeautifulSoup(page_scrape.text, "html.parser")

                        # Try multiple strategies to find main content
                        main_content = (
                            page_soup.find(['article', 'main']) or
                            page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
                            page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
                            page_soup.find('div', {'role': 'main'}) or
                            page_soup.body
                        )
                        if main_content:
                            # Remove unwanted elements
                            for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
                                element.decompose()
                            # Extract text with better cleaning
                            text = main_content.get_text(separator=' ', strip=True)
                            text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
                            page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
                        else:
                            page_text = ""
                    except Exception as e:
                        print(f"Error scraping {link}: {str(e)}")
                        page_text = ""
                else:
                    page_text = ""

                fetched_results += 1
                new_results += 1

                if advanced:
                    results_list.append({
                        "link": link,
                        "title": title,
                        "description": description,
                        "page_text": page_text,
                    })
                else:
                    results_list.append(link)

                if fetched_results >= num_results:
                    break

        if new_results == 0:
            break

        start += 10
        sleep(sleep_interval)

    # Fall back to Getty Images when Google returned no usable images
    if not image_results:
        images = get_images(term)
        return {"results": results_list, "images": images}
    else:
        return {"results": results_list, "images": image_results}
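Search/main.py exposes a single entry point, search(), which returns a dict with 'results' and 'images' keys; with advanced=True each result also carries a page_text excerpt. The sketch below, not part of the commit, shows how the module could be exercised directly (example query, same import-path assumptions as above).

# Illustrative sketch (not part of the commit): inspect the shape search() returns.
from Search.main import search

data = search("large language models", num_results=3, advanced=True)  # example query
for result in data["results"]:
    print(result["title"], "->", result["link"])
print(f'{len(data["images"])} image entries collected')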
Search/useragentka.py
ADDED
@@ -0,0 +1,20 @@
import random

def get_useragent():
    """
    Generates a random user agent string mimicking the format of various software versions.

    The user agent string is composed of:
    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9

    Returns:
        str: A randomly generated user agent string.
    """
    lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
    libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
    ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
    openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
    return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
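get_useragent() only fabricates a Lynx-style User-Agent header, so a one-line check, sketched below and not part of the commit, is enough to see the format it produces; output varies per call because the version numbers are random.

# Illustrative sketch (not part of the commit): print one generated User-Agent string.
from Search.useragentka import get_useragent

print(get_useragent())  # e.g. Lynx/2.8.1 libwww-FM/2.14 SSL-MM/1.4 OpenSSL/1.3.2 (illustrative)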
__pycache__/app.cpython-312.pyc
ADDED
Binary file (11.7 kB)

__pycache__/config.cpython-312.pyc
ADDED
Binary file (2.61 kB)

__pycache__/prompts.cpython-312.pyc
ADDED
Binary file (2.57 kB)
app.py
CHANGED
@@ -15,13 +15,14 @@ from fastapi.templating import Jinja2Templates
from pathlib import Path
from collections import Counter, defaultdict
from utils.logger import log_request
+from Search.main import search

app = FastAPI()

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
-    allow_origins=["
+    allow_origins=["http://localhost:8080", "https://www.chipling.xyz"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],

@@ -222,95 +223,50 @@ async def chat(request: ChatRequest):
    log_request("/chat", selected_generator.__name__)
    return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')

-async def generate_modules(request: Request):
-        return {"error": "searchQuery is required"}
-            'role': 'system',
-            'content': [{
-                'type': 'text',
-                'text': system_prompt
-            }]
-        },
-        {
-            'role': 'user',
-            'content': [{
-                'type': 'text',
-                'text': search_query
-            }]
-        }
-    ]
-        'max_tokens': None,
-        'temperature': 0.7,
-        'top_p': 0.7,
-        'top_k': 50,
-        'repetition_penalty': 1,
-        'stream_tokens': True,
-        'stop': ['<|eot_id|>', '<|eom_id|>'],
-        'messages': current_messages,
-        'stream': True,
-    }
-    selected_generator = random.choice([groqgenerate])
-    return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
-@app.post("/
-async def
-    log_request("/generate-topics", search_query)
-    system_prompt = ChiplingPrompts.generateTopics(search_query)
-            'role': 'system',
-            'content': [{
-                'type': 'text',
-                'text': system_prompt
-            }]
-        },
-        {
-            'role': 'user',
-            'content': [{
-                'type': 'text',
-                'text': search_query
-            }]
-        }
-    ]
-        'max_tokens': None,
-        'temperature': 0.7,
-        'top_p': 0.7,
-        'top_k': 50,
-        'repetition_penalty': 1,
-        'stream_tokens': True,
-        'stop': ['<|eot_id|>', '<|eom_id|>'],
-        'messages': current_messages,
-        'stream': True,
-    }
-    return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
+@app.post("/fetch-images")
+async def fetch_images(request: Request):
    data = await request.json()
+    query = data.get("query", "")
+    num_results = data.get("num_results", 5)
+    lang = data.get("lang", "en")
+    advanced = data.get("advanced", False)

+    # Call the search function
+    results = search(query, num_results=num_results, lang=lang, advanced=advanced)

+    # Log the request
+    log_request("/fetch-images", query)

+    return results['images']

+@app.post("/fetch-links")
+async def fetch_links(request: Request):
+    data = await request.json()
+    query = data.get("query", "")
+    num_results = data.get("num_results", 5)
+    lang = data.get("lang", "en")
+    advanced = data.get("advanced", False)

+    # Call the search function
+    results = search(query, num_results=num_results, lang=lang, advanced=advanced)

+    # Log the request
+    log_request("/fetch-links", query)

+    return results['results']

+@app.post("/fetch-google")
+async def fetch_google(request: Request):
    data = await request.json()
+    query = data.get("query", "")
+    num_results = data.get("num_results", 5)
+    lang = data.get("lang", "en")
+    advanced = data.get("advanced", True)

+    # Call the search function
+    results = search(query, num_results=num_results, lang=lang, advanced=advanced)

+    # Log the request
+    log_request("/fetch-google", query)

+    return results
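The three new routes accept the same JSON body (query, num_results, lang, advanced). test.py below exercises /fetch-images; an equivalent sketch for /fetch-google, not part of the commit and assuming the app is running locally on port 8000, would look like this:

# Illustrative sketch (not part of the commit): call the new /fetch-google route.
import requests

resp = requests.post(
    "http://localhost:8000/fetch-google",
    json={"query": "transformer architectures", "num_results": 3, "advanced": True},  # example payload
)
resp.raise_for_status()
data = resp.json()
print(len(data["results"]), "results,", len(data["images"]), "images")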
logs.json
CHANGED
@@ -1 +0,0 @@
-[]
test.py
ADDED
@@ -0,0 +1,18 @@
import requests

url = "http://localhost:8000/fetch-images"  # or your deployed URL

payload = {
    "query": "sunset beach",
    "num_results": 5,
    "lang": "en",
    "advanced": False
}

response = requests.post(url, json=payload)

if response.ok:
    results = response.json()
    print("Fetched Images:", results)
else:
    print("Error:", response.status_code, response.text)
utils/__pycache__/logger.cpython-312.pyc
ADDED
Binary file (1.26 kB)