added google search!
- .DS_Store +0 -0
- __pycache__/app.cpython-312.pyc +0 -0
- app.py +12 -0
- chipsearch/__pycache__/gettyimages.cpython-312.pyc +0 -0
- chipsearch/__pycache__/main.cpython-312.pyc +0 -0
- chipsearch/__pycache__/useragentka.cpython-312.pyc +0 -0
- chipsearch/gettyimages.py +21 -0
- chipsearch/main.py +163 -0
- chipsearch/useragentka.py +20 -0
- requirements.txt +3 -1
- test.py +47 -29
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
__pycache__/app.cpython-312.pyc
CHANGED
Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ
app.py
CHANGED
@@ -15,6 +15,7 @@ from fastapi.templating import Jinja2Templates
 from pathlib import Path
 from collections import Counter, defaultdict
 from utils.logger import log_request
+from chipsearch.main import search
 
 app = FastAPI()
 
@@ -412,3 +413,14 @@ async def chat(request: ChatRequest):
     log_request("/chat", selected_generator.__name__)
     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
 
+
+
+@app.post("/chipsearch")
+async def chipsearch(request: Request):
+    data = search(
+        term=request.query_params.get("term"),
+        num_results=int(request.query_params.get("num_results", 10)),
+        advanced=bool(request.query_params.get("advanced", False)),
+        unique=bool(request.query_params.get("unique", False))
+    )
+    return data
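
Note on the new route: query parameters arrive as strings, so bool(request.query_params.get("advanced", False)) is truthy for any non-empty value, including the literal string "False". A minimal sketch of stricter flag parsing in case callers pass false-like values; the helper name as_flag is hypothetical and not part of this commit:

# Sketch only: stricter parsing of boolean query parameters.
# as_flag is a hypothetical helper, not part of this commit.
def as_flag(value, default=False):
    """Treat '1'/'true'/'yes'/'on' (case-insensitive) as True, everything else as False."""
    if value is None:
        return default
    return str(value).strip().lower() in {"1", "true", "yes", "on"}

print(as_flag("False"))  # False (bool("False") would be True)
print(as_flag("true"))   # True
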
chipsearch/__pycache__/gettyimages.cpython-312.pyc
ADDED
Binary file (1.04 kB)
chipsearch/__pycache__/main.cpython-312.pyc
ADDED
Binary file (6.98 kB)
chipsearch/__pycache__/useragentka.cpython-312.pyc
ADDED
Binary file (1.68 kB)
chipsearch/gettyimages.py
ADDED
@@ -0,0 +1,21 @@
+from curl_cffi import requests
+from bs4 import BeautifulSoup
+
+def get_images(query):
+    res = requests.get(f'https://www.gettyimages.in/search/2/image?phrase={query}=editorial', impersonate='chrome110')
+
+    soup = BeautifulSoup(res.text, 'html.parser')
+
+    images = soup.find_all('img')
+
+    results = []
+
+    for image in images:
+        print(image['src'])
+        if image['src'].startswith('https://media.gettyimages.com'):
+            results.append({'src': image['src'], 'alt': image['alt'], 'class': ''})
+        else:
+            continue
+
+    return results
+
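
A note on get_images: image['src'] and image['alt'] raise KeyError for any <img> tag that lacks those attributes, which is common with lazy-loaded images. A minimal sketch of the same filter with defensive attribute access; get_images_safe is a hypothetical name, not part of this commit:

from bs4 import BeautifulSoup

def get_images_safe(html):
    """Sketch: same filtering as get_images, but tolerant of <img> tags without src/alt."""
    soup = BeautifulSoup(html, 'html.parser')
    results = []
    for image in soup.find_all('img'):
        src = image.get('src', '')   # .get() avoids KeyError on attribute-less tags
        if src.startswith('https://media.gettyimages.com'):
            results.append({'src': src, 'alt': image.get('alt', ''), 'class': ''})
    return results
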
chipsearch/main.py
ADDED
@@ -0,0 +1,163 @@
+"""googlesearch is a Python library for searching Google, easily."""
+from time import sleep
+from bs4 import BeautifulSoup
+from requests import get
+from urllib.parse import unquote  # to decode the url
+from chipsearch.useragentka import get_useragent
+from curl_cffi import requests as curlreq
+from chipsearch.gettyimages import get_images
+
+def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+    resp = get(
+        url="https://www.google.com/search",
+        headers={
+            "User-Agent": get_useragent(),
+            "Accept": "*/*"
+        },
+        params={
+            "q": term,
+            "num": results + 2,  # Prevents multiple requests
+            "hl": lang,
+            "start": start,
+            "safe": safe,
+            "gl": region,
+        },
+        proxies=proxies,
+        timeout=timeout,
+        verify=ssl_verify,
+        cookies={
+            'CONSENT': 'PENDING+987',  # Bypasses the consent page
+            'SOCS': 'CAESHAgBEhIaAB',
+        }
+    )
+    resp.raise_for_status()
+    return resp
+
+
+class SearchResult:
+    def __init__(self, url, title, description):
+        self.url = url
+        self.title = title
+        self.description = description
+
+    def __repr__(self):
+        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+
+def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+    """Search the Google search engine"""
+
+    # Proxy setup
+    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+
+    start = start_num
+    fetched_results = 0
+    fetched_links = set()
+    results_list = []
+    image_results = []  # New list for image results
+
+    while fetched_results < num_results:
+        # Send request
+        resp = _req(term, num_results - start,
+                    lang, start, proxies, timeout, safe, ssl_verify, region)
+
+        # Parse
+        soup = BeautifulSoup(resp.text, "html.parser")
+        result_block = soup.find_all("div", class_="ezO2md")
+        new_results = 0
+
+        # Find all images on the page
+        try:
+            all_images = soup.find_all("img")  # Google's image class
+            for img in all_images:
+                img_src = img.get("src") or img.get("data-src")
+                if img_src:
+                    # Handle base64 images
+                    if img_src.startswith("data:image"):
+                        image_results.append({
+                            "src": img_src,  # Already base64 encoded
+                            "alt": img.get("alt", ""),
+                            "class": img.get("class", []),
+                        })
+                    # Handle regular image URLs
+                    elif img_src.startswith("http"):
+                        image_results.append({
+                            "src": img_src,
+                            "alt": img.get("alt", ""),
+                            "class": img.get("class", []),
+                        })
+        except Exception as e:
+            print(f"Error parsing images: {str(e)}")
+
+        for result in result_block:
+            link_tag = result.find("a", href=True)
+            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
+            description_tag = result.find("span", class_="FrIlee")
+
+            if link_tag and title_tag and description_tag:
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
+                if link in fetched_links and unique:
+                    continue
+                fetched_links.add(link)
+                title = title_tag.text if title_tag else ""
+                description = description_tag.text if description_tag else ""
+
+                # Only get page_text if advanced mode and we haven't gotten any yet
+                if advanced and not any('page_text' in result for result in results_list):
+                    try:
+                        page_scrape = curlreq.get(link, impersonate='chrome110')
+                        page_scrape.encoding = 'utf-8'
+                        page_soup = BeautifulSoup(page_scrape.text, "html.parser")
+
+                        # Try multiple strategies to find main content
+                        main_content = (
+                            page_soup.find(['article', 'main']) or
+                            page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
+                            page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
+                            page_soup.find('div', {'role': 'main'}) or
+                            page_soup.body
+                        )
+                        if main_content:
+                            # Remove unwanted elements
+                            for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
+                                element.decompose()
+                            # Extract text with better cleaning
+                            text = main_content.get_text(separator=' ', strip=True)
+                            text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
+                            page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
+                        else:
+                            page_text = ""
+                    except Exception as e:
+                        print(f"Error scraping {link}: {str(e)}")
+                        page_text = ""
+                else:
+                    page_text = ""
+
+                fetched_results += 1
+                new_results += 1
+
+                if advanced:
+                    results_list.append({
+                        "link": link,
+                        "title": title,
+                        "description": description,
+                        "page_text": page_text,
+                    })
+                else:
+                    results_list.append(link)
+
+                if fetched_results >= num_results:
+                    break
+
+        if new_results == 0:
+            break
+
+        start += 10
+        sleep(sleep_interval)
+
+    if image_results == []:
+        images = get_images(term)
+        return {"results": results_list, "images": images}
+    else:
+        return {"results": results_list, "images": image_results}
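
For reference, search can also be called directly, outside the FastAPI route; it always returns a dict with "results" and "images" keys, where "results" holds plain URLs in basic mode and link/title/description/page_text dicts in advanced mode. A quick usage sketch, assuming the chipsearch package is importable from the project root and the example query is arbitrary:

from chipsearch.main import search

# Basic mode: data["results"] is a list of URLs.
data = search("fastapi streaming response", num_results=5)
for link in data["results"]:
    print(link)

# Advanced mode: each entry is a dict with link/title/description/page_text.
detailed = search("fastapi streaming response", num_results=3, advanced=True)
for item in detailed["results"]:
    print(item["title"], "->", item["link"])

print(len(data["images"]), "images found")
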
chipsearch/useragentka.py
ADDED
@@ -0,0 +1,20 @@
+import random
+
+def get_useragent():
+    """
+    Generates a random user agent string mimicking the format of various software versions.
+
+    The user agent string is composed of:
+    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
+    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
+    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
+    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
+
+    Returns:
+        str: A randomly generated user agent string.
+    """
+    lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
+    libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
+    ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
+    openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
+    return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"

requirements.txt
CHANGED
@@ -7,4 +7,6 @@ asyncio
 groq
 jinja2
 aiofiles
-matplotlib
+matplotlib
+curl_cffi
+beautifulsoup4
test.py
CHANGED
@@ -1,32 +1,50 @@
 import requests
 import json
 
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# import requests
+# import json
+
+# url = "http://127.0.0.1:8000/chat"
+
+# payload = {
+#     "model": "your-model-name",
+#     "message": "Create a json object of top 10 anime! answer in json only!",
+#     "messages": []
+# }
+
+# headers = {
+#     "Content-Type": "application/json"
+# }
+
+# # Send streaming POST request
+# with requests.post(url, data=json.dumps(payload), headers=headers, stream=True) as response:
+#     if response.status_code == 200:
+#         for line in response.iter_lines(decode_unicode=True):
+#             if line and line.startswith('data: '):
+#                 try:
+#                     # Remove 'data: ' prefix and parse JSON
+#                     json_data = json.loads(line[6:])
+#                     # Extract text from choices if available
+#                     if json_data.get('choices') and len(json_data['choices']) > 0:
+#                         text = json_data['choices'][0].get('text', '')
+#                         if text:
+#                             print(text, end='')
+#                 except json.JSONDecodeError:
+#                     continue
+#     else:
+#         print("Error:", response.status_code, response.text)
+
+
+def search_chips(term, num_results=10, advanced=False, unique=False):
+    url = f"http://127.0.0.1:8000/chipsearch?term={term}&num_results={num_results}&advanced={advanced}&unique={unique}"
+
+    try:
+        response = requests.post(url)
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        print(f"Error: {e}")
+        return None
+
+results = search_chips("top 10 anime of all time")
+print(results)