from langchain_community.document_loaders import WebBaseLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_core.documents import Document from typing import Iterable def load_documents(website: str) -> list[Document]: """ Loads documents from a given website. Args: website (str): The URL of the website to load documents from. Returns: list[Document]: A list of loaded documents. """ loader = WebBaseLoader(website) return loader.load() def format_documents(docs: list[Document]) -> str: """ Formats a list of documents into a single string. Args: docs (list[Document]): The list of documents to format. Returns: str: The formatted documents as a single string. """ return "\n\n".join(doc.page_content for doc in docs) def split_documents(documents: Iterable[Document]) -> list[Document]: """ Splits documents into smaller chunks. Args: documents (Iterable[Document]): The documents to split. Returns: list[Document]: A list of split documents. """ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) return text_splitter.split_documents(documents)