added google search!
- .DS_Store +0 -0
- __pycache__/app.cpython-312.pyc +0 -0
- app.py +12 -0
- chipsearch/__pycache__/gettyimages.cpython-312.pyc +0 -0
- chipsearch/__pycache__/main.cpython-312.pyc +0 -0
- chipsearch/__pycache__/useragentka.cpython-312.pyc +0 -0
- chipsearch/gettyimages.py +21 -0
- chipsearch/main.py +163 -0
- chipsearch/useragentka.py +20 -0
- requirements.txt +3 -1
- test.py +47 -29
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
__pycache__/app.cpython-312.pyc
CHANGED
Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ
app.py
CHANGED
@@ -15,6 +15,7 @@ from fastapi.templating import Jinja2Templates
 from pathlib import Path
 from collections import Counter, defaultdict
 from utils.logger import log_request
+from chipsearch.main import search
 
 app = FastAPI()
 
@@ -412,3 +413,14 @@ async def chat(request: ChatRequest):
     log_request("/chat", selected_generator.__name__)
     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
 
+
+
+@app.post("/chipsearch")
+async def chipsearch(request: Request):
+    data = search(
+        term=request.query_params.get("term"),
+        num_results=int(request.query_params.get("num_results", 10)),
+        advanced=bool(request.query_params.get("advanced", False)),
+        unique=bool(request.query_params.get("unique", False))
+    )
+    return data
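
Note on the new route: query parameters arrive as strings, so bool(request.query_params.get("advanced", False)) is truthy for any non-empty value, including the literal string "False". A minimal sketch of stricter flag parsing in case callers pass false-like values; the helper name as_flag is hypothetical and not part of this commit:

# Sketch only: stricter parsing of boolean query parameters.
# as_flag is a hypothetical helper, not part of this commit.
def as_flag(value, default=False):
    """Treat '1'/'true'/'yes'/'on' (case-insensitive) as True, everything else as False."""
    if value is None:
        return default
    return str(value).strip().lower() in {"1", "true", "yes", "on"}

print(as_flag("False"))  # False (bool("False") would be True)
print(as_flag("true"))   # True
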
chipsearch/__pycache__/gettyimages.cpython-312.pyc
ADDED
Binary file (1.04 kB)
chipsearch/__pycache__/main.cpython-312.pyc
ADDED
Binary file (6.98 kB)
chipsearch/__pycache__/useragentka.cpython-312.pyc
ADDED
Binary file (1.68 kB)
chipsearch/gettyimages.py
ADDED
@@ -0,0 +1,21 @@
+from curl_cffi import requests
+from bs4 import BeautifulSoup
+
+def get_images(query):
+    res = requests.get(f'https://www.gettyimages.in/search/2/image?phrase={query}=editorial', impersonate='chrome110')
+
+    soup = BeautifulSoup(res.text, 'html.parser')
+
+    images = soup.find_all('img')
+
+    results = []
+
+    for image in images:
+        print(image['src'])
+        if image['src'].startswith('https://media.gettyimages.com'):
+            results.append({'src': image['src'], 'alt': image['alt'], 'class': ''})
+        else:
+            continue
+
+    return results
+
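
A note on get_images: image['src'] and image['alt'] raise KeyError for any <img> tag that lacks those attributes, which is common with lazy-loaded images. A minimal sketch of the same filter with defensive attribute access; get_images_safe is a hypothetical name, not part of this commit:

from bs4 import BeautifulSoup

def get_images_safe(html):
    """Sketch: same filtering as get_images, but tolerant of <img> tags without src/alt."""
    soup = BeautifulSoup(html, 'html.parser')
    results = []
    for image in soup.find_all('img'):
        src = image.get('src', '')   # .get() avoids KeyError on attribute-less tags
        if src.startswith('https://media.gettyimages.com'):
            results.append({'src': src, 'alt': image.get('alt', ''), 'class': ''})
    return results
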
chipsearch/main.py
ADDED
@@ -0,0 +1,163 @@
+"""googlesearch is a Python library for searching Google, easily."""
+from time import sleep
+from bs4 import BeautifulSoup
+from requests import get
+from urllib.parse import unquote  # to decode the url
+from chipsearch.useragentka import get_useragent
+from curl_cffi import requests as curlreq
+from chipsearch.gettyimages import get_images
+
+def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+    resp = get(
+        url="https://www.google.com/search",
+        headers={
+            "User-Agent": get_useragent(),
+            "Accept": "*/*"
+        },
+        params={
+            "q": term,
+            "num": results + 2,  # Prevents multiple requests
+            "hl": lang,
+            "start": start,
+            "safe": safe,
+            "gl": region,
+        },
+        proxies=proxies,
+        timeout=timeout,
+        verify=ssl_verify,
+        cookies={
+            'CONSENT': 'PENDING+987',  # Bypasses the consent page
+            'SOCS': 'CAESHAgBEhIaAB',
+        }
+    )
+    resp.raise_for_status()
+    return resp
+
+
+class SearchResult:
+    def __init__(self, url, title, description):
+        self.url = url
+        self.title = title
+        self.description = description
+
+    def __repr__(self):
+        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+
+def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+    """Search the Google search engine"""
+
+    # Proxy setup
+    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+
+    start = start_num
+    fetched_results = 0
+    fetched_links = set()
+    results_list = []
+    image_results = []  # New list for image results
+
+    while fetched_results < num_results:
+        # Send request
+        resp = _req(term, num_results - start,
+                    lang, start, proxies, timeout, safe, ssl_verify, region)
+
+        # Parse
+        soup = BeautifulSoup(resp.text, "html.parser")
+        result_block = soup.find_all("div", class_="ezO2md")
+        new_results = 0
+
+        # Find all images on the page
+        try:
+            all_images = soup.find_all("img")  # Google's image class
+            for img in all_images:
+                img_src = img.get("src") or img.get("data-src")
+                if img_src:
+                    # Handle base64 images
+                    if img_src.startswith("data:image"):
+                        image_results.append({
+                            "src": img_src,  # Already base64 encoded
+                            "alt": img.get("alt", ""),
+                            "class": img.get("class", []),
+                        })
+                    # Handle regular image URLs
+                    elif img_src.startswith("http"):
+                        image_results.append({
+                            "src": img_src,
+                            "alt": img.get("alt", ""),
+                            "class": img.get("class", []),
+                        })
+        except Exception as e:
+            print(f"Error parsing images: {str(e)}")
+
+        for result in result_block:
+            link_tag = result.find("a", href=True)
+            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
+            description_tag = result.find("span", class_="FrIlee")
+
+            if link_tag and title_tag and description_tag:
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
+                if link in fetched_links and unique:
+                    continue
+                fetched_links.add(link)
+                title = title_tag.text if title_tag else ""
+                description = description_tag.text if description_tag else ""
+
+                # Only get page_text if advanced mode and we haven't gotten any yet
+                if advanced and not any('page_text' in result for result in results_list):
+                    try:
+                        page_scrape = curlreq.get(link, impersonate='chrome110')
+                        page_scrape.encoding = 'utf-8'
+                        page_soup = BeautifulSoup(page_scrape.text, "html.parser")
+
+                        # Try multiple strategies to find main content
+                        main_content = (
+                            page_soup.find(['article', 'main']) or
+                            page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
+                            page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
+                            page_soup.find('div', {'role': 'main'}) or
+                            page_soup.body
+                        )
+                        if main_content:
+                            # Remove unwanted elements
+                            for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
+                                element.decompose()
+                            # Extract text with better cleaning
+                            text = main_content.get_text(separator=' ', strip=True)
+                            text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
+                            page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
+                        else:
+                            page_text = ""
+                    except Exception as e:
+                        print(f"Error scraping {link}: {str(e)}")
+                        page_text = ""
+                else:
+                    page_text = ""
+
+                fetched_results += 1
+                new_results += 1
+
+                if advanced:
+                    results_list.append({
+                        "link": link,
+                        "title": title,
+                        "description": description,
+                        "page_text": page_text,
+                    })
+                else:
+                    results_list.append(link)
+
+                if fetched_results >= num_results:
+                    break
+
+        if new_results == 0:
+            break
+
+        start += 10
+        sleep(sleep_interval)
+
+    if image_results == []:
+        images = get_images(term)
+        return {"results": results_list, "images": images}
+    else:
+        return {"results": results_list, "images": image_results}
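
For reference, search can also be called directly, outside the FastAPI route; it always returns a dict with "results" and "images" keys, where "results" holds plain URLs in basic mode and link/title/description/page_text dicts in advanced mode. A quick usage sketch, assuming the chipsearch package is importable from the project root and the example query is arbitrary:

from chipsearch.main import search

# Basic mode: data["results"] is a list of URLs.
data = search("fastapi streaming response", num_results=5)
for link in data["results"]:
    print(link)

# Advanced mode: each entry is a dict with link/title/description/page_text.
detailed = search("fastapi streaming response", num_results=3, advanced=True)
for item in detailed["results"]:
    print(item["title"], "->", item["link"])

print(len(data["images"]), "images found")
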
chipsearch/useragentka.py
ADDED
@@ -0,0 +1,20 @@
+import random
+
+def get_useragent():
+    """
+    Generates a random user agent string mimicking the format of various software versions.
+
+    The user agent string is composed of:
+    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
+    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
+    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
+    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
+
+    Returns:
+        str: A randomly generated user agent string.
+    """
+    lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
+    libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
+    ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
+    openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
+    return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"

requirements.txt
CHANGED
@@ -7,4 +7,6 @@ asyncio
 groq
 jinja2
 aiofiles
-matplotlib
+matplotlib
+curl_cffi
+beautifulsoup4
test.py
CHANGED
@@ -1,32 +1,50 @@
 import requests
 import json
 
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# import requests
+# import json
+
+# url = "http://127.0.0.1:8000/chat"
+
+# payload = {
+#     "model": "your-model-name",
+#     "message": "Create a json object of top 10 anime! answer in json only!",
+#     "messages": []
+# }
+
+# headers = {
+#     "Content-Type": "application/json"
+# }
+
+# # Send streaming POST request
+# with requests.post(url, data=json.dumps(payload), headers=headers, stream=True) as response:
+#     if response.status_code == 200:
+#         for line in response.iter_lines(decode_unicode=True):
+#             if line and line.startswith('data: '):
+#                 try:
+#                     # Remove 'data: ' prefix and parse JSON
+#                     json_data = json.loads(line[6:])
+#                     # Extract text from choices if available
+#                     if json_data.get('choices') and len(json_data['choices']) > 0:
+#                         text = json_data['choices'][0].get('text', '')
+#                         if text:
+#                             print(text, end='')
+#                 except json.JSONDecodeError:
+#                     continue
+#     else:
+#         print("Error:", response.status_code, response.text)
+
+
+def search_chips(term, num_results=10, advanced=False, unique=False):
+    url = f"http://127.0.0.1:8000/chipsearch?term={term}&num_results={num_results}&advanced={advanced}&unique={unique}"
+
+    try:
+        response = requests.post(url)
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        print(f"Error: {e}")
+        return None
+
+results = search_chips("top 10 anime of all time")
+print(results)