Spaces:
Sleeping
Sleeping
File size: 1,259 Bytes
e1cda2e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import Iterable
def load_documents(website: str) -> list[Document]:
    """
    Fetch a web page and return its contents as documents.

    Args:
        website (str): URL handed to ``WebBaseLoader``.

    Returns:
        list[Document]: The result of the loader's ``load()`` call.
    """
    return WebBaseLoader(website).load()
def format_documents(docs: Iterable[Document], *, separator: str = "\n\n") -> str:
    """
    Join the text of several documents into one string.

    Args:
        docs (Iterable[Document]): Documents to format. Any iterable is
            accepted (list, tuple, generator); only the ``page_content``
            attribute of each item is read.
        separator (str): Text inserted between consecutive documents.
            Defaults to two newline characters (a blank line).

    Returns:
        str: The ``page_content`` of every document, in iteration order,
        joined by ``separator``. An empty iterable yields an empty string.
    """
    return separator.join(doc.page_content for doc in docs)
def split_documents(
    documents: Iterable[Document],
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
) -> list[Document]:
    """
    Splits documents into smaller chunks.

    Args:
        documents (Iterable[Document]): The documents to split.
        chunk_size (int): Forwarded to ``RecursiveCharacterTextSplitter`` as
            its ``chunk_size``. Defaults to 1000.
        chunk_overlap (int): Forwarded to ``RecursiveCharacterTextSplitter``
            as its ``chunk_overlap``. Defaults to 100.

    Returns:
        list[Document]: The chunks returned by the splitter's
        ``split_documents`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
|