Spaces:
Sleeping
Sleeping
Update application/utils/web_search.py
Browse files
application/utils/web_search.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
-
|
|
|
2 |
from bs4 import BeautifulSoup
|
|
|
3 |
import re
|
4 |
-
from duckduckgo_search import DDGS
|
5 |
|
6 |
class WebScarper:
|
7 |
def __init__(self):
|
@@ -19,16 +20,15 @@ class WebScarper:
|
|
19 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
20 |
}
|
21 |
response = requests.get(url, headers=headers, timeout=10)
|
22 |
-
response.raise_for_status()
|
23 |
return response.text
|
24 |
except requests.exceptions.RequestException as e:
|
25 |
print(f"Error fetching URL {url}: {e}")
|
26 |
return None
|
27 |
-
|
28 |
def get_text(self, data):
|
29 |
soup = BeautifulSoup(data, 'html.parser')
|
30 |
text = soup.get_text()
|
31 |
-
cleaned_text = re.sub(r'\s+', ' ', text).strip()
|
32 |
return cleaned_text[:4000] if len(cleaned_text) > 4000 else cleaned_text
|
33 |
|
34 |
def scarpe(self, query):
|
|
|
1 |
+
# application/utils/web_search.py
|
2 |
+
from duckduckgo_search import DDGS # Simpler import
|
3 |
from bs4 import BeautifulSoup
|
4 |
+
import requests
|
5 |
import re
|
|
|
6 |
|
7 |
class WebScarper:
|
8 |
def __init__(self):
|
|
|
20 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
21 |
}
|
22 |
response = requests.get(url, headers=headers, timeout=10)
|
23 |
+
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
|
24 |
return response.text
|
25 |
except requests.exceptions.RequestException as e:
|
26 |
print(f"Error fetching URL {url}: {e}")
|
27 |
return None
|
|
|
28 |
def get_text(self, data):
|
29 |
soup = BeautifulSoup(data, 'html.parser')
|
30 |
text = soup.get_text()
|
31 |
+
cleaned_text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
|
32 |
return cleaned_text[:4000] if len(cleaned_text) > 4000 else cleaned_text
|
33 |
|
34 |
def scarpe(self, query):
|