Maouu committed on
Commit 208a601 · 1 Parent(s): dd32b74

Implemented Google Search and Images Functionality

.DS_Store ADDED
Binary file (8.2 kB). View file
 
Search/__pycache__/gettyimages.cpython-312.pyc ADDED
Binary file (1.03 kB). View file
 
Search/__pycache__/main.cpython-312.pyc ADDED
Binary file (6.96 kB). View file
 
Search/__pycache__/useragentka.cpython-312.pyc ADDED
Binary file (1.68 kB). View file
 
Search/gettyimages.py ADDED
@@ -0,0 +1,21 @@
+ from curl_cffi import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import quote_plus
+
+ def get_images(query):
+     # Fetch Getty Images' editorial search results, impersonating a Chrome TLS fingerprint
+     res = requests.get(
+         f'https://www.gettyimages.in/search/2/image?family=editorial&phrase={quote_plus(query)}',
+         impersonate='chrome110',
+     )
+
+     soup = BeautifulSoup(res.text, 'html.parser')
+
+     results = []
+     for image in soup.find_all('img'):
+         src = image.get('src', '')
+         # Keep only images served from Getty's media CDN
+         if src.startswith('https://media.gettyimages.com'):
+             results.append({'src': src, 'alt': image.get('alt', ''), 'class': ''})
+
+     return results
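
For reference, a minimal sketch of exercising this helper on its own (assuming the repository root is on the import path and outbound network access is available; the query is illustrative, not part of the commit):

    import json
    from Search.gettyimages import get_images

    # Each entry is a dict with 'src', 'alt' and 'class' keys, as built above
    for item in get_images("northern lights")[:3]:
        print(json.dumps(item, indent=2))
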
Search/main.py ADDED
@@ -0,0 +1,163 @@
+ """googlesearch is a Python library for searching Google, easily."""
+ from time import sleep
+ from bs4 import BeautifulSoup
+ from requests import get
+ from urllib.parse import unquote  # to decode the url
+ from Search.useragentka import get_useragent
+ from curl_cffi import requests as curlreq
+ from Search.gettyimages import get_images
+
+ def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+     resp = get(
+         url="https://www.google.com/search",
+         headers={
+             "User-Agent": get_useragent(),
+             "Accept": "*/*"
+         },
+         params={
+             "q": term,
+             "num": results + 2,  # Prevents multiple requests
+             "hl": lang,
+             "start": start,
+             "safe": safe,
+             "gl": region,
+         },
+         proxies=proxies,
+         timeout=timeout,
+         verify=ssl_verify,
+         cookies={
+             'CONSENT': 'PENDING+987',  # Bypasses the consent page
+             'SOCS': 'CAESHAgBEhIaAB',
+         }
+     )
+     resp.raise_for_status()
+     return resp
+
+
+ class SearchResult:
+     def __init__(self, url, title, description):
+         self.url = url
+         self.title = title
+         self.description = description
+
+     def __repr__(self):
+         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+
+ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+     """Search the Google search engine"""
+
+     # Proxy setup
+     proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+
+     start = start_num
+     fetched_results = 0
+     fetched_links = set()
+     results_list = []
+     image_results = []  # New list for image results
+
+     while fetched_results < num_results:
+         # Send request
+         resp = _req(term, num_results - start,
+                     lang, start, proxies, timeout, safe, ssl_verify, region)
+
+         # Parse
+         soup = BeautifulSoup(resp.text, "html.parser")
+         result_block = soup.find_all("div", class_="ezO2md")
+         new_results = 0
+
+         # Find all images on the page
+         try:
+             all_images = soup.find_all("img")  # every <img> tag on the results page
+             for img in all_images:
+                 img_src = img.get("src") or img.get("data-src")
+                 if img_src:
+                     # Handle base64 images
+                     if img_src.startswith("data:image"):
+                         image_results.append({
+                             "src": img_src,  # Already base64 encoded
+                             "alt": img.get("alt", ""),
+                             "class": img.get("class", []),
+                         })
+                     # Handle regular image URLs
+                     elif img_src.startswith("http"):
+                         image_results.append({
+                             "src": img_src,
+                             "alt": img.get("alt", ""),
+                             "class": img.get("class", []),
+                         })
+         except Exception as e:
+             print(f"Error parsing images: {str(e)}")
+
+         for result in result_block:
+             link_tag = result.find("a", href=True)
+             title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
+             description_tag = result.find("span", class_="FrIlee")
+
+             if link_tag and title_tag and description_tag:
+                 link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
+                 if link in fetched_links and unique:
+                     continue
+                 fetched_links.add(link)
+                 title = title_tag.text if title_tag else ""
+                 description = description_tag.text if description_tag else ""
+
+                 # Only get page_text if advanced mode and we haven't gotten any yet
+                 if advanced and not any('page_text' in r for r in results_list):
+                     try:
+                         page_scrape = curlreq.get(link, impersonate='chrome110')
+                         page_scrape.encoding = 'utf-8'
+                         page_soup = BeautifulSoup(page_scrape.text, "html.parser")
+
+                         # Try multiple strategies to find main content
+                         main_content = (
+                             page_soup.find(['article', 'main']) or
+                             page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
+                             page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
+                             page_soup.find('div', {'role': 'main'}) or
+                             page_soup.body
+                         )
+                         if main_content:
+                             # Remove unwanted elements
+                             for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
+                                 element.decompose()
+                             # Extract text with better cleaning
+                             text = main_content.get_text(separator=' ', strip=True)
+                             text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
+                             page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
+                         else:
+                             page_text = ""
+                     except Exception as e:
+                         print(f"Error scraping {link}: {str(e)}")
+                         page_text = ""
+                 else:
+                     page_text = ""
+
+
+                 fetched_results += 1
+                 new_results += 1
+
+                 if advanced:
+                     results_list.append({
+                         "link": link,
+                         "title": title,
+                         "description": description,
+                         "page_text": page_text,
+                     })
+                 else:
+                     results_list.append(link)
+
+                 if fetched_results >= num_results:
+                     break
+
+         if new_results == 0:
+             break
+
+         start += 10
+         sleep(sleep_interval)
+
+     if not image_results:
+         images = get_images(term)
+         return {"results": results_list, "images": images}
+     else:
+         return {"results": results_list, "images": image_results}
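
Similarly, a minimal sketch of calling search() directly (same import-path and network assumptions; the term and counts are illustrative). The function returns a dict with 'results' and 'images' keys, falling back to Getty Images when Google yields no usable image URLs:

    from Search.main import search

    out = search("solar eclipse 2024", num_results=3, advanced=False)
    print(out["results"])      # plain links; dicts with page_text when advanced=True
    print(len(out["images"]))  # images scraped from Google, or the Getty fallback
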
Search/useragentka.py ADDED
@@ -0,0 +1,20 @@
+ import random
+
+ def get_useragent():
+     """
+     Generates a random user agent string mimicking the format of various software versions.
+
+     The user agent string is composed of:
+     - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
+     - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
+     - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
+     - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
+
+     Returns:
+         str: A randomly generated user agent string.
+     """
+     lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
+     libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
+     ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
+     openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
+     return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
__pycache__/app.cpython-312.pyc ADDED
Binary file (11.7 kB). View file
 
__pycache__/config.cpython-312.pyc ADDED
Binary file (2.61 kB). View file
 
__pycache__/prompts.cpython-312.pyc ADDED
Binary file (2.57 kB). View file
 
app.py CHANGED
@@ -15,13 +15,14 @@ from fastapi.templating import Jinja2Templates
 from pathlib import Path
 from collections import Counter, defaultdict
 from utils.logger import log_request
+ from Search.main import search

 app = FastAPI()

 # Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
-     allow_origins=["*"],
+     allow_origins=["http://localhost:8080", "https://www.chipling.xyz"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
@@ -222,95 +223,50 @@ async def chat(request: ChatRequest):
     log_request("/chat", selected_generator.__name__)
     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')

-
- @app.post("/generate-modules")
- async def generate_modules(request: Request):
+ @app.post("/fetch-images")
+ async def fetch_images(request: Request):
     data = await request.json()
-     search_query = data.get("searchQuery")
+     query = data.get("query", "")
+     num_results = data.get("num_results", 5)
+     lang = data.get("lang", "en")
+     advanced = data.get("advanced", False)

-     log_request("/generate-modules", search_query)
+     # Call the search function
+     results = search(query, num_results=num_results, lang=lang, advanced=advanced)

+     # Log the request
+     log_request("/fetch-images", query)

-     if not search_query:
-         return {"error": "searchQuery is required"}
+     return results['images']

-     system_prompt = ChiplingPrompts.generateModules(search_query)
+ @app.post("/fetch-links")
+ async def fetch_links(request: Request):
+     data = await request.json()
+     query = data.get("query", "")
+     num_results = data.get("num_results", 5)
+     lang = data.get("lang", "en")
+     advanced = data.get("advanced", False)

-     current_messages = [
-         {
-             'role': 'system',
-             'content': [{
-                 'type': 'text',
-                 'text': system_prompt
-             }]
-         },
-         {
-             'role': 'user',
-             'content': [{
-                 'type': 'text',
-                 'text': search_query
-             }]
-         }
-     ]
+     # Call the search function
+     results = search(query, num_results=num_results, lang=lang, advanced=advanced)

-     json_data = {
-         'model': "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-         'max_tokens': None,
-         'temperature': 0.7,
-         'top_p': 0.7,
-         'top_k': 50,
-         'repetition_penalty': 1,
-         'stream_tokens': True,
-         'stop': ['<|eot_id|>', '<|eom_id|>'],
-         'messages': current_messages,
-         'stream': True,
-     }
-     selected_generator = random.choice([groqgenerate])
-     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
+     # Log the request
+     log_request("/fetch-links", query)

+     return results['results']

- @app.post("/generate-topics")
- async def generate_topics(request: Request):
+ @app.post("/fetch-google")
+ async def fetch_google(request: Request):
     data = await request.json()
-     search_query = data.get("searchQuery")
-
-     if not search_query:
-         return {"error": "searchQuery is required"}
-
-     log_request("/generate-topics", search_query)
-
-
-     system_prompt = ChiplingPrompts.generateTopics(search_query)
+     query = data.get("query", "")
+     num_results = data.get("num_results", 5)
+     lang = data.get("lang", "en")
+     advanced = data.get("advanced", True)

-     current_messages = [
-         {
-             'role': 'system',
-             'content': [{
-                 'type': 'text',
-                 'text': system_prompt
-             }]
-         },
-         {
-             'role': 'user',
-             'content': [{
-                 'type': 'text',
-                 'text': search_query
-             }]
-         }
-     ]
+     # Call the search function
+     results = search(query, num_results=num_results, lang=lang, advanced=advanced)

-     json_data = {
-         'model': "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-         'max_tokens': None,
-         'temperature': 0.7,
-         'top_p': 0.7,
-         'top_k': 50,
-         'repetition_penalty': 1,
-         'stream_tokens': True,
-         'stop': ['<|eot_id|>', '<|eom_id|>'],
-         'messages': current_messages,
-         'stream': True,
-     }
+     # Log the request
+     log_request("/fetch-google", query)

-     selected_generator = random.choice([groqgenerate, generate])
-     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
+     return results
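
For completeness, a sketch of exercising the other two new routes (assuming the app runs locally on port 8000, as in test.py below; the query payload is illustrative):

    import requests

    payload = {"query": "quantum computing", "num_results": 5, "lang": "en", "advanced": False}
    links = requests.post("http://localhost:8000/fetch-links", json=payload).json()
    full = requests.post("http://localhost:8000/fetch-google", json=payload).json()
    print(links)        # list of result links from /fetch-links
    print(full.keys())  # /fetch-google returns both 'results' and 'images'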
 
logs.json CHANGED
@@ -1 +0,0 @@
- []
 
test.py ADDED
@@ -0,0 +1,18 @@
+ import requests
+
+ url = "http://localhost:8000/fetch-images"  # or your deployed URL
+
+ payload = {
+     "query": "sunset beach",
+     "num_results": 5,
+     "lang": "en",
+     "advanced": False
+ }
+
+ response = requests.post(url, json=payload)
+
+ if response.ok:
+     results = response.json()
+     print("Fetched Images:", results)
+ else:
+     print("Error:", response.status_code, response.text)
utils/__pycache__/logger.cpython-312.pyc ADDED
Binary file (1.26 kB). View file