# NOTE: "Spaces: Running" status banner captured from the hosting page; not part of the source.
"""Extract Web Data"""

import os
from typing import Optional

from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer

from genai_voice.config.defaults import Config
from genai_voice.data_utils.urls import SAMPLE_URLS, HTML_TAGS_TO_TARGET
from genai_voice.logger.log_utils import log

# Set the user agent for http requests identification; some sites reject
# requests that arrive without a User-Agent header.
os.environ["USER_AGENT"] = "myagent"
def extract_webpage_data(out_file: Optional[str]) -> None:
    """Extract web page data and write it to ``out_file``.

    Loads every URL in ``SAMPLE_URLS`` with a headless Chromium browser,
    extracts the text inside ``HTML_TAGS_TO_TARGET`` via
    ``BeautifulSoupTransformer``, and writes the concatenated page contents
    (each followed by a blank line) to ``out_file``.

    Args:
        out_file: Destination path. Falls back to
            ``Config.WEB_SCRAPER_OUTPUT_FILE`` when empty or ``None``.

    Raises:
        ValueError: If loading or transforming the pages fails.
    """
    if not out_file:
        log(f"No output file, falling back to default: {Config.WEB_SCRAPER_OUTPUT_FILE}")
        out_file = Config.WEB_SCRAPER_OUTPUT_FILE

    # Ensure the parent directory exists. Guard against a bare filename:
    # os.path.dirname() returns "" in that case and os.makedirs("") raises
    # FileNotFoundError. (The old code also skipped this when the output file
    # already existed, which was redundant — exist_ok handles that.)
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Load HTML content using AsyncChromiumLoader
    log(f"Creating the AsyncChromiumLoader with #{len(SAMPLE_URLS)} urls...")
    try:
        loader = AsyncChromiumLoader(SAMPLE_URLS)
        docs = loader.load()
        log("Documents scraped.")

        # Transform the loaded HTML using BeautifulSoupTransformer
        log(f"Using BeautifulSoupTransformer to extract {HTML_TAGS_TO_TARGET}.")
        bs_transformer = BeautifulSoupTransformer()
        docs_transformed = bs_transformer.transform_documents(
            docs, tags_to_extract=HTML_TAGS_TO_TARGET
        )
    except Exception as e:
        # "scrape" — fixed typo ("scrap") in both messages.
        log("Failed to scrape data.")
        raise ValueError("Failed to scrape data successfully.") from e

    log(f"Transformed #{len(docs_transformed)} urls.")
    # Join each document's text, separating documents with a blank line.
    data = "".join(doc.page_content + "\n\n" for doc in docs_transformed)

    log("Writing to output file.")
    with open(out_file, "w", encoding="utf-8") as file:
        file.write(data)
    log(f"Successfully written data to '{out_file}'")
# poetry run ExtractWebPagesAndSaveData
def run():
    """Entry point for the web scraper (exposed as a Poetry script).

    Scrapes the sample URLs and saves the extracted text to the default
    output file configured in ``Config.WEB_SCRAPER_OUTPUT_FILE``.
    """
    log("Starting the web scraper...")
    extract_webpage_data(out_file=Config.WEB_SCRAPER_OUTPUT_FILE)
    log("Completed the web scraper...")