Maouu committed on
Commit 208a601 · 1 Parent(s): dd32b74

Implemented Google Search and Images Functionality

.DS_Store ADDED
Binary file (8.2 kB). View file
 
Search/__pycache__/gettyimages.cpython-312.pyc ADDED
Binary file (1.03 kB). View file
 
Search/__pycache__/main.cpython-312.pyc ADDED
Binary file (6.96 kB). View file
 
Search/__pycache__/useragentka.cpython-312.pyc ADDED
Binary file (1.68 kB). View file
 
Search/gettyimages.py ADDED
@@ -0,0 +1,21 @@
+ from curl_cffi import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import quote_plus
+
+ def get_images(query):
+     # Fetch Getty Images' editorial search results, impersonating a Chrome TLS fingerprint
+     res = requests.get(
+         f'https://www.gettyimages.in/search/2/image?family=editorial&phrase={quote_plus(query)}',
+         impersonate='chrome110',
+     )
+
+     soup = BeautifulSoup(res.text, 'html.parser')
+
+     results = []
+     for image in soup.find_all('img'):
+         src = image.get('src', '')
+         # Keep only images served from Getty's media CDN
+         if src.startswith('https://media.gettyimages.com'):
+             results.append({'src': src, 'alt': image.get('alt', ''), 'class': ''})
+
+     return results
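
For reference, a minimal sketch of exercising this helper on its own (assuming the repository root is on the import path and outbound network access is available; the query is illustrative, not part of the commit):

    import json
    from Search.gettyimages import get_images

    # Each entry is a dict with 'src', 'alt' and 'class' keys, as built above
    for item in get_images("northern lights")[:3]:
        print(json.dumps(item, indent=2))
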
Search/main.py ADDED
@@ -0,0 +1,163 @@
+ """googlesearch is a Python library for searching Google, easily."""
+ from time import sleep
+ from bs4 import BeautifulSoup
+ from requests import get
+ from urllib.parse import unquote  # to decode the url
+ from Search.useragentka import get_useragent
+ from curl_cffi import requests as curlreq
+ from Search.gettyimages import get_images
+
+ def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+     resp = get(
+         url="https://www.google.com/search",
+         headers={
+             "User-Agent": get_useragent(),
+             "Accept": "*/*"
+         },
+         params={
+             "q": term,
+             "num": results + 2,  # Prevents multiple requests
+             "hl": lang,
+             "start": start,
+             "safe": safe,
+             "gl": region,
+         },
+         proxies=proxies,
+         timeout=timeout,
+         verify=ssl_verify,
+         cookies={
+             'CONSENT': 'PENDING+987',  # Bypasses the consent page
+             'SOCS': 'CAESHAgBEhIaAB',
+         }
+     )
+     resp.raise_for_status()
+     return resp
+
+
+ class SearchResult:
+     def __init__(self, url, title, description):
+         self.url = url
+         self.title = title
+         self.description = description
+
+     def __repr__(self):
+         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+
+ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+     """Search the Google search engine"""
+
+     # Proxy setup
+     proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+
+     start = start_num
+     fetched_results = 0
+     fetched_links = set()
+     results_list = []
+     image_results = []  # New list for image results
+
+     while fetched_results < num_results:
+         # Send request
+         resp = _req(term, num_results - start,
+                     lang, start, proxies, timeout, safe, ssl_verify, region)
+
+         # Parse
+         soup = BeautifulSoup(resp.text, "html.parser")
+         result_block = soup.find_all("div", class_="ezO2md")
+         new_results = 0
+
+         # Find all images on the page
+         try:
+             all_images = soup.find_all("img")  # every <img> tag on the results page
+             for img in all_images:
+                 img_src = img.get("src") or img.get("data-src")
+                 if img_src:
+                     # Handle base64 images
+                     if img_src.startswith("data:image"):
+                         image_results.append({
+                             "src": img_src,  # Already base64 encoded
+                             "alt": img.get("alt", ""),
+                             "class": img.get("class", []),
+                         })
+                     # Handle regular image URLs
+                     elif img_src.startswith("http"):
+                         image_results.append({
+                             "src": img_src,
+                             "alt": img.get("alt", ""),
+                             "class": img.get("class", []),
+                         })
+         except Exception as e:
+             print(f"Error parsing images: {str(e)}")
+
+         for result in result_block:
+             link_tag = result.find("a", href=True)
+             title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
+             description_tag = result.find("span", class_="FrIlee")
+
+             if link_tag and title_tag and description_tag:
+                 link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
+                 if link in fetched_links and unique:
+                     continue
+                 fetched_links.add(link)
+                 title = title_tag.text if title_tag else ""
+                 description = description_tag.text if description_tag else ""
+
+                 # Only get page_text if advanced mode and we haven't gotten any yet
+                 if advanced and not any('page_text' in r for r in results_list):
+                     try:
+                         page_scrape = curlreq.get(link, impersonate='chrome110')
+                         page_scrape.encoding = 'utf-8'
+                         page_soup = BeautifulSoup(page_scrape.text, "html.parser")
+
+                         # Try multiple strategies to find main content
+                         main_content = (
+                             page_soup.find(['article', 'main']) or
+                             page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
+                             page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
+                             page_soup.find('div', {'role': 'main'}) or
+                             page_soup.body
+                         )
+                         if main_content:
+                             # Remove unwanted elements
+                             for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
+                                 element.decompose()
+                             # Extract text with better cleaning
+                             text = main_content.get_text(separator=' ', strip=True)
+                             text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
+                             page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
+                         else:
+                             page_text = ""
+                     except Exception as e:
+                         print(f"Error scraping {link}: {str(e)}")
+                         page_text = ""
+                 else:
+                     page_text = ""
+
+
+                 fetched_results += 1
+                 new_results += 1
+
+                 if advanced:
+                     results_list.append({
+                         "link": link,
+                         "title": title,
+                         "description": description,
+                         "page_text": page_text,
+                     })
+                 else:
+                     results_list.append(link)
+
+                 if fetched_results >= num_results:
+                     break
+
+         if new_results == 0:
+             break
+
+         start += 10
+         sleep(sleep_interval)
+
+     if not image_results:
+         images = get_images(term)
+         return {"results": results_list, "images": images}
+     else:
+         return {"results": results_list, "images": image_results}
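
Similarly, a minimal sketch of calling search() directly (same import-path and network assumptions; the term and counts are illustrative). The function returns a dict with 'results' and 'images' keys, falling back to Getty Images when Google yields no usable image URLs:

    from Search.main import search

    out = search("solar eclipse 2024", num_results=3, advanced=False)
    print(out["results"])      # plain links; dicts with page_text when advanced=True
    print(len(out["images"]))  # images scraped from Google, or the Getty fallback
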
Search/useragentka.py ADDED
@@ -0,0 +1,20 @@
+ import random
+
+ def get_useragent():
+     """
+     Generates a random user agent string mimicking the format of various software versions.
+
+     The user agent string is composed of:
+     - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
+     - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
+     - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
+     - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
+
+     Returns:
+         str: A randomly generated user agent string.
+     """
+     lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
+     libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
+     ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
+     openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
+     return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
__pycache__/app.cpython-312.pyc ADDED
Binary file (11.7 kB). View file
 
__pycache__/config.cpython-312.pyc ADDED
Binary file (2.61 kB). View file
 
__pycache__/prompts.cpython-312.pyc ADDED
Binary file (2.57 kB). View file
 
app.py CHANGED
@@ -15,13 +15,14 @@ from fastapi.templating import Jinja2Templates
 from pathlib import Path
 from collections import Counter, defaultdict
 from utils.logger import log_request
+ from Search.main import search

 app = FastAPI()

 # Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
-     allow_origins=["*"],
+     allow_origins=["http://localhost:8080", "https://www.chipling.xyz"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
@@ -222,95 +223,50 @@ async def chat(request: ChatRequest):
     log_request("/chat", selected_generator.__name__)
     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')

-
- @app.post("/generate-modules")
- async def generate_modules(request: Request):
+ @app.post("/fetch-images")
+ async def fetch_images(request: Request):
     data = await request.json()
-     search_query = data.get("searchQuery")
+     query = data.get("query", "")
+     num_results = data.get("num_results", 5)
+     lang = data.get("lang", "en")
+     advanced = data.get("advanced", False)

-     log_request("/generate-modules", search_query)
+     # Call the search function
+     results = search(query, num_results=num_results, lang=lang, advanced=advanced)

+     # Log the request
+     log_request("/fetch-images", query)

-     if not search_query:
-         return {"error": "searchQuery is required"}
+     return results['images']

-     system_prompt = ChiplingPrompts.generateModules(search_query)
+ @app.post("/fetch-links")
+ async def fetch_links(request: Request):
+     data = await request.json()
+     query = data.get("query", "")
+     num_results = data.get("num_results", 5)
+     lang = data.get("lang", "en")
+     advanced = data.get("advanced", False)

-     current_messages = [
-         {
-             'role': 'system',
-             'content': [{
-                 'type': 'text',
-                 'text': system_prompt
-             }]
-         },
-         {
-             'role': 'user',
-             'content': [{
-                 'type': 'text',
-                 'text': search_query
-             }]
-         }
-     ]
+     # Call the search function
+     results = search(query, num_results=num_results, lang=lang, advanced=advanced)

-     json_data = {
-         'model': "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-         'max_tokens': None,
-         'temperature': 0.7,
-         'top_p': 0.7,
-         'top_k': 50,
-         'repetition_penalty': 1,
-         'stream_tokens': True,
-         'stop': ['<|eot_id|>', '<|eom_id|>'],
-         'messages': current_messages,
-         'stream': True,
-     }
-     selected_generator = random.choice([groqgenerate])
-     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
+     # Log the request
+     log_request("/fetch-links", query)

+     return results['results']

- @app.post("/generate-topics")
- async def generate_topics(request: Request):
+ @app.post("/fetch-google")
+ async def fetch_google(request: Request):
     data = await request.json()
-     search_query = data.get("searchQuery")
-
-     if not search_query:
-         return {"error": "searchQuery is required"}
-
-     log_request("/generate-topics", search_query)
-
-
-     system_prompt = ChiplingPrompts.generateTopics(search_query)
+     query = data.get("query", "")
+     num_results = data.get("num_results", 5)
+     lang = data.get("lang", "en")
+     advanced = data.get("advanced", True)

-     current_messages = [
-         {
-             'role': 'system',
-             'content': [{
-                 'type': 'text',
-                 'text': system_prompt
-             }]
-         },
-         {
-             'role': 'user',
-             'content': [{
-                 'type': 'text',
-                 'text': search_query
-             }]
-         }
-     ]
+     # Call the search function
+     results = search(query, num_results=num_results, lang=lang, advanced=advanced)

-     json_data = {
-         'model': "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-         'max_tokens': None,
-         'temperature': 0.7,
-         'top_p': 0.7,
-         'top_k': 50,
-         'repetition_penalty': 1,
-         'stream_tokens': True,
-         'stop': ['<|eot_id|>', '<|eom_id|>'],
-         'messages': current_messages,
-         'stream': True,
-     }
+     # Log the request
+     log_request("/fetch-google", query)

-     selected_generator = random.choice([groqgenerate, generate])
-     return StreamingResponse(selected_generator(json_data), media_type='text/event-stream')
+     return results
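
For completeness, a sketch of exercising the other two new routes (assuming the app runs locally on port 8000, as in test.py below; the query payload is illustrative):

    import requests

    payload = {"query": "quantum computing", "num_results": 5, "lang": "en", "advanced": False}
    links = requests.post("http://localhost:8000/fetch-links", json=payload).json()
    full = requests.post("http://localhost:8000/fetch-google", json=payload).json()
    print(links)        # list of result links from /fetch-links
    print(full.keys())  # /fetch-google returns both 'results' and 'images'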
 
logs.json CHANGED
@@ -1 +0,0 @@
- []
 
test.py ADDED
@@ -0,0 +1,18 @@
+ import requests
+
+ url = "http://localhost:8000/fetch-images"  # or your deployed URL
+
+ payload = {
+     "query": "sunset beach",
+     "num_results": 5,
+     "lang": "en",
+     "advanced": False
+ }
+
+ response = requests.post(url, json=payload)
+
+ if response.ok:
+     results = response.json()
+     print("Fetched Images:", results)
+ else:
+     print("Error:", response.status_code, response.text)
utils/__pycache__/logger.cpython-312.pyc ADDED
Binary file (1.26 kB). View file