# AIVIZ-BOT/processing/documents.py
# vaishnaveswar's picture
# revert
# e1cda2e
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import Iterable
def load_documents(website: str) -> list[Document]:
    """
    Load the content of a website as documents.

    Args:
        website (str): URL handed to ``WebBaseLoader``.

    Returns:
        list[Document]: Whatever the loader's ``load()`` call yields for
        that URL.
    """
    # Build the loader and load in one expression; no intermediate state is kept.
    return WebBaseLoader(website).load()
def format_documents(docs: list[Document]) -> str:
    """
    Concatenate the text of several documents into one string.

    Args:
        docs (list[Document]): Documents whose ``page_content`` is combined.

    Returns:
        str: Every document's ``page_content`` in input order, separated by
        a blank line (``"\\n\\n"``). An empty input gives an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)
def split_documents(
    documents: Iterable[Document],
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
) -> list[Document]:
    """
    Splits documents into smaller chunks.

    Args:
        documents (Iterable[Document]): The documents to split.
        chunk_size (int): Forwarded to ``RecursiveCharacterTextSplitter`` as
            its ``chunk_size``. Defaults to 1000, the value this function
            previously hard-coded.
        chunk_overlap (int): Forwarded to ``RecursiveCharacterTextSplitter``
            as its ``chunk_overlap``. Defaults to 100, the value this
            function previously hard-coded.

    Returns:
        list[Document]: A list of split documents.
    """
    # Keyword-only tuning knobs keep existing positional callers working
    # while letting new callers pick a different chunking granularity.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)