Commit 328de20 · Maouu committed · 1 Parent(s): 949c5db

added google search!

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
__pycache__/app.cpython-312.pyc CHANGED
Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ
 
app.py CHANGED
@@ -15,6 +15,7 @@ from fastapi.templating import Jinja2Templates
 from pathlib import Path
 from collections import Counter, defaultdict
 from utils.logger import log_request
+from chipsearch.main import search
 
 app = FastAPI()
 
@@ -412,3 +413,14 @@ async def chat(request: ChatRequest):
     log_request("/chat", selected_generator.__name__)
     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
 
+
+
+@app.post("/chipsearch")
+async def chipsearch(request: Request):
+    data = search(
+        term=request.query_params.get("term"),
+        num_results=int(request.query_params.get("num_results", 10)),
+        advanced=bool(request.query_params.get("advanced", False)),
+        unique=bool(request.query_params.get("unique", False))
+    )
+    return data
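
A minimal sketch of calling the new endpoint from a client, assuming the app is served locally on 127.0.0.1:8000 as in test.py (the query term is only illustrative). Note that advanced and unique are read with bool() on the raw query-param string, so any non-empty value counts as true; omit them to keep the defaults.

import requests

resp = requests.post(
    "http://127.0.0.1:8000/chipsearch",
    params={"term": "fastapi streaming responses", "num_results": 5},  # illustrative query
)
resp.raise_for_status()
data = resp.json()
print(data["results"])   # list of links (dicts with title/description/page_text when advanced is set)
print(len(data["images"]), "images")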
chipsearch/__pycache__/gettyimages.cpython-312.pyc ADDED
Binary file (1.04 kB). View file
 
chipsearch/__pycache__/main.cpython-312.pyc ADDED
Binary file (6.98 kB). View file
 
chipsearch/__pycache__/useragentka.cpython-312.pyc ADDED
Binary file (1.68 kB). View file
 
chipsearch/gettyimages.py ADDED
@@ -0,0 +1,21 @@
+from curl_cffi import requests
+from bs4 import BeautifulSoup
+
+def get_images(query):
+    res = requests.get(f'https://www.gettyimages.in/search/2/image?phrase={query}=editorial', impersonate='chrome110')
+
+    soup = BeautifulSoup(res.text, 'html.parser')
+
+    images = soup.find_all('img')
+
+    results = []
+
+    for image in images:
+        print(image['src'])
+        if image['src'].startswith('https://media.gettyimages.com'):
+            results.append({'src': image['src'], 'alt': image['alt'], 'class': ''})
+        else:
+            continue
+
+    return results
+
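
As a rough usage sketch (not part of the commit; the query string is illustrative), get_images can be called on its own. It keeps only images hosted on media.gettyimages.com and returns a list of {'src', 'alt', 'class'} dicts:

from chipsearch.gettyimages import get_images

for img in get_images("tokyo skyline")[:3]:  # illustrative query
    print(img["src"], "-", img["alt"])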
chipsearch/main.py ADDED
@@ -0,0 +1,163 @@
+"""googlesearch is a Python library for searching Google, easily."""
+from time import sleep
+from bs4 import BeautifulSoup
+from requests import get
+from urllib.parse import unquote  # to decode the url
+from chipsearch.useragentka import get_useragent
+from curl_cffi import requests as curlreq
+from chipsearch.gettyimages import get_images
+
+def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+    resp = get(
+        url="https://www.google.com/search",
+        headers={
+            "User-Agent": get_useragent(),
+            "Accept": "*/*"
+        },
+        params={
+            "q": term,
+            "num": results + 2,  # Prevents multiple requests
+            "hl": lang,
+            "start": start,
+            "safe": safe,
+            "gl": region,
+        },
+        proxies=proxies,
+        timeout=timeout,
+        verify=ssl_verify,
+        cookies={
+            'CONSENT': 'PENDING+987',  # Bypasses the consent page
+            'SOCS': 'CAESHAgBEhIaAB',
+        }
+    )
+    resp.raise_for_status()
+    return resp
+
+
+class SearchResult:
+    def __init__(self, url, title, description):
+        self.url = url
+        self.title = title
+        self.description = description
+
+    def __repr__(self):
+        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+
+def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+    """Search the Google search engine"""
+
+    # Proxy setup
+    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+
+    start = start_num
+    fetched_results = 0
+    fetched_links = set()
+    results_list = []
+    image_results = []  # New list for image results
+
+    while fetched_results < num_results:
+        # Send request
+        resp = _req(term, num_results - start,
+                    lang, start, proxies, timeout, safe, ssl_verify, region)
+
+        # Parse
+        soup = BeautifulSoup(resp.text, "html.parser")
+        result_block = soup.find_all("div", class_="ezO2md")
+        new_results = 0
+
+        # Find all images on the page
+        try:
+            all_images = soup.find_all("img")  # Google's image class
+            for img in all_images:
+                img_src = img.get("src") or img.get("data-src")
+                if img_src:
+                    # Handle base64 images
+                    if img_src.startswith("data:image"):
+                        image_results.append({
+                            "src": img_src,  # Already base64 encoded
+                            "alt": img.get("alt", ""),
+                            "class": img.get("class", []),
+                        })
+                    # Handle regular image URLs
+                    elif img_src.startswith("http"):
+                        image_results.append({
+                            "src": img_src,
+                            "alt": img.get("alt", ""),
+                            "class": img.get("class", []),
+                        })
+        except Exception as e:
+            print(f"Error parsing images: {str(e)}")
+
+        for result in result_block:
+            link_tag = result.find("a", href=True)
+            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
+            description_tag = result.find("span", class_="FrIlee")
+
+            if link_tag and title_tag and description_tag:
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
+                if link in fetched_links and unique:
+                    continue
+                fetched_links.add(link)
+                title = title_tag.text if title_tag else ""
+                description = description_tag.text if description_tag else ""
+
+                # Only get page_text if advanced mode and we haven't gotten any yet
+                if advanced and not any('page_text' in result for result in results_list):
+                    try:
+                        page_scrape = curlreq.get(link, impersonate='chrome110')
+                        page_scrape.encoding = 'utf-8'
+                        page_soup = BeautifulSoup(page_scrape.text, "html.parser")
+
+                        # Try multiple strategies to find main content
+                        main_content = (
+                            page_soup.find(['article', 'main']) or
+                            page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
+                            page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
+                            page_soup.find('div', {'role': 'main'}) or
+                            page_soup.body
+                        )
+                        if main_content:
+                            # Remove unwanted elements
+                            for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
+                                element.decompose()
+                            # Extract text with better cleaning
+                            text = main_content.get_text(separator=' ', strip=True)
+                            text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
+                            page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
+                        else:
+                            page_text = ""
+                    except Exception as e:
+                        print(f"Error scraping {link}: {str(e)}")
+                        page_text = ""
+                else:
+                    page_text = ""
+
+
+                fetched_results += 1
+                new_results += 1
+
+                if advanced:
+                    results_list.append({
+                        "link": link,
+                        "title": title,
+                        "description": description,
+                        "page_text": page_text,
+                    })
+                else:
+                    results_list.append(link)
+
+                if fetched_results >= num_results:
+                    break
+
+        if new_results == 0:
+            break
+
+        start += 10
+        sleep(sleep_interval)
+
+    if image_results == []:
+        images = get_images(term)
+        return {"results": results_list, "images": images}
+    else:
+        return {"results": results_list, "images": image_results}
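
A minimal sketch of calling search() directly, outside the FastAPI route (the query is illustrative). With advanced=False the "results" key holds plain URLs; with advanced=True each entry is a dict with link, title, description, and page_text, and page_text is only scraped for the first result. If Google's result page yields no images, the Getty fallback from chipsearch.gettyimages is used instead.

from chipsearch.main import search

data = search("open source search libraries", num_results=5, advanced=True)  # illustrative query
for hit in data["results"]:
    print(hit["title"], "->", hit["link"])
print(len(data["images"]), "images collected")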
chipsearch/useragentka.py ADDED
@@ -0,0 +1,20 @@
+import random
+
+def get_useragent():
+    """
+    Generates a random user agent string mimicking the format of various software versions.
+
+    The user agent string is composed of:
+    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
+    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
+    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
+    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
+
+    Returns:
+        str: A randomly generated user agent string.
+    """
+    lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
+    libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
+    ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
+    openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
+    return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
requirements.txt CHANGED
@@ -7,4 +7,6 @@ asyncio
 groq
 jinja2
 aiofiles
-matplotlib
+matplotlib
+curl_cffi
+beautifulsoup4
test.py CHANGED
@@ -1,32 +1,50 @@
 import requests
 import json
 
-url = "http://127.0.0.1:8000/chat"
-
-payload = {
-    "model": "your-model-name",
-    "message": "Create a json object of top 10 anime! answer in json only!",
-    "messages": []
-}
-
-headers = {
-    "Content-Type": "application/json"
-}
-
-# Send streaming POST request
-with requests.post(url, data=json.dumps(payload), headers=headers, stream=True) as response:
-    if response.status_code == 200:
-        for line in response.iter_lines(decode_unicode=True):
-            if line and line.startswith('data: '):
-                try:
-                    # Remove 'data: ' prefix and parse JSON
-                    json_data = json.loads(line[6:])
-                    # Extract text from choices if available
-                    if json_data.get('choices') and len(json_data['choices']) > 0:
-                        text = json_data['choices'][0].get('text', '')
-                        if text:
-                            print(text, end='')
-                except json.JSONDecodeError:
-                    continue
-    else:
-        print("Error:", response.status_code, response.text)
+# import requests
+# import json
+
+# url = "http://127.0.0.1:8000/chat"
+
+# payload = {
+#     "model": "your-model-name",
+#     "message": "Create a json object of top 10 anime! answer in json only!",
+#     "messages": []
+# }
+
+# headers = {
+#     "Content-Type": "application/json"
+# }
+
+# # Send streaming POST request
+# with requests.post(url, data=json.dumps(payload), headers=headers, stream=True) as response:
+#     if response.status_code == 200:
+#         for line in response.iter_lines(decode_unicode=True):
+#             if line and line.startswith('data: '):
+#                 try:
+#                     # Remove 'data: ' prefix and parse JSON
+#                     json_data = json.loads(line[6:])
+#                     # Extract text from choices if available
+#                     if json_data.get('choices') and len(json_data['choices']) > 0:
+#                         text = json_data['choices'][0].get('text', '')
+#                         if text:
+#                             print(text, end='')
+#                 except json.JSONDecodeError:
+#                     continue
+#     else:
+#         print("Error:", response.status_code, response.text)
+
+
+def search_chips(term, num_results=10, advanced=False, unique=False):
+    url = f"http://127.0.0.1:8000/chipsearch?term={term}&num_results={num_results}&advanced={advanced}&unique={unique}"
+
+    try:
+        response = requests.post(url)
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        print(f"Error: {e}")
+        return None
+
+results = search_chips("top 10 anime of all time")
+print(results)