import json
import logging
import os
import re
import uuid
from typing import Callable, List, Union

import dspy
import requests

from .utils import WebPageHelper

def clean_text(res):
    """Strip markdown links, bare URLs, and repeated blank lines from raw text."""
    # Remove markdown-style links such as [title](url).
    pattern = r'\[.*?\]\(.*?\)'
    result = re.sub(pattern, '', res)
    # Remove bare URLs left in the text.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    result = re.sub(url_pattern, '', result)
    # Collapse runs of blank lines into a single newline.
    result = re.sub(r"\n\n+", "\n", result)
    return result

class GoogleSearchAli_new(dspy.Retrieve):
    def __init__(self, bing_search_api_key=None, k=3, is_valid_source: Callable = None,
                 min_char_count: int = 150, snippet_chunk_size: int = 1000, webpage_helper_max_threads=10,
                 mkt='en-US', language='en-US', **kwargs):
        super().__init__(k=k)
        # The API key is read from the environment rather than from the
        # bing_search_api_key argument.
        key = os.environ.get('searchkey', 'default_value')
        self.header = {
            "Content-Type": "application/json",
            "Accept-Encoding": "utf-8",
            "Authorization": f"Bearer lm-/{key}== ",
        }
        self.template = {
            "rid": str(uuid.uuid4()),
            "scene": "dolphin_search_bing_nlp",
            "uq": "",
            "debug": True,
            "fields": [],
            "page": 1,
            "rows": 10,
            "customConfigInfo": {
                "multiSearch": False,
                "qpMultiQuery": False,
                "qpMultiQueryHistory": [],
                "qpSpellcheck": False,
                "qpEmbedding": False,
                "knnWithScript": False,
                "qpTermsWeight": False,
                "pluginServiceConfig": {"qp": "mvp_search_qp_qwen"},
            },
            "headers": {"__d_head_qto": 5000},
        }
        self.webpage_helper = WebPageHelper(
            min_char_count=min_char_count,
            snippet_chunk_size=snippet_chunk_size,
            max_thread_num=webpage_helper_max_threads,
        )
        self.usage = 0

        # By default, every source URL is considered valid.
        if is_valid_source:
            self.is_valid_source = is_valid_source
        else:
            self.is_valid_source = lambda x: True
    def get_usage_and_reset(self):
        usage = self.usage
        self.usage = 0
        return {'BingSearch': usage}
    def forward(self, query_or_queries: Union[str, List[str]], exclude_urls: List[str] = []):
        """Search the API with one or more queries and collect snippet-level results."""
        queries = (
            [query_or_queries]
            if isinstance(query_or_queries, str)
            else query_or_queries
        )
        self.usage += len(queries)

        url_to_results = {}
        for query in queries:
            try:
                self.template["uq"] = query
                response = requests.post(
                    "https://nlp-cn-beijing.aliyuncs.com/gw/v1/api/msearch-sp/qwen-search",
                    data=json.dumps(self.template),
                    headers=self.header,
                )
                response = json.loads(response.text)
                search_results = response['data']['docs']
                for result in search_results:
                    # Skip sources that fail the validity check or are explicitly excluded.
                    if not self.is_valid_source(result['url']) or result['url'] in exclude_urls:
                        continue
                    url_to_results[result['url']] = {
                        'url': result['url'],
                        'title': result['title'],
                        'description': result.get('snippet', ''),
                    }
            except Exception as e:
                logging.error(f'Error occurred while searching query {query}: {e}')
        valid_url_to_snippets = self.webpage_helper.urls_to_snippets(list(url_to_results.keys()))
        collected_results = []
        for url in valid_url_to_snippets:
            r = url_to_results[url]
            r['snippets'] = valid_url_to_snippets[url]['snippets']
            collected_results.append(r)

        logging.info(f'Length of collected_results: {len(collected_results)}')
        return collected_results


if __name__ == "__main__":
    retrieval = GoogleSearchAli_new()
    retrieval("中国")
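    # A minimal usage sketch, assuming the result schema built in forward():
    # each item carries 'url', 'title', 'description', and 'snippets'. The
    # query string and the post-processing below are illustrative only.
    results = retrieval("artificial intelligence")
    for item in results:
        cleaned_snippets = [clean_text(s) for s in item.get('snippets', [])]
        print(item['title'], len(cleaned_snippets))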