# Chatbot-backend / document_loaders.py
# mominah's picture
# Upload 11 files
# 7b7cab6 verified
from langchain_community.document_loaders import (CSVLoader, WikipediaLoader, UnstructuredURLLoader,
YoutubeLoader, PyPDFLoader, BSHTMLLoader,
Docx2txtLoader, UnstructuredMarkdownLoader)
from langchain_unstructured import UnstructuredLoader
class DocumentLoader:
    """Best-effort wrappers around the LangChain document loaders imported by this module.

    Every public method builds the matching loader, runs it and returns the
    result. No exception propagates to the caller: if anything goes wrong the
    error is printed and the method returns ``None``, so callers must check
    for ``None`` before using the result.
    """

    def _load_or_report(self, failure_prefix, produce):
        """Run ``produce()`` and return its result, printing any failure.

        Args:
            failure_prefix (str): Text printed in front of the exception message.
            produce (callable): Zero-argument callable that builds the loader
                and returns the loaded data.

        Returns:
            Whatever ``produce()`` returns, or ``None`` if it raised.
        """
        try:
            return produce()
        except Exception as e:
            # Deliberate best-effort boundary: loaders fail for many unrelated
            # reasons (missing file, network, parser), so report and carry on.
            print(f"{failure_prefix}: {e}")
            return None

    def load_unstructured(self, path):
        """
        Load data from a file at the specified path using ``UnstructuredLoader``.

        Supported files (as listed by the original author):
        "csv", "doc", "docx", "epub", "image", "md", "msg", "odt", "org",
        "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx"

        Args:
            path (str): The file path.

        Returns:
            The loaded data, or None if loading failed (the error is printed).
        """
        return self._load_or_report(
            "Error loading Unstructured",
            lambda: UnstructuredLoader(path).load(),
        )

    def load_csv(self, path):
        """
        Load data from a CSV file at the specified path.

        Args:
            path (str): The file path to the CSV file.

        Returns:
            The loaded CSV data, or None if loading failed (the error is printed).
        """
        return self._load_or_report(
            "Error loading CSV",
            lambda: CSVLoader(file_path=path).load(),
        )

    def wikipedia_query(self, search_query):
        """
        Query Wikipedia using a given search term and return the results.

        The loader is created with ``load_max_docs=2``.

        Args:
            search_query (str): The search term to query on Wikipedia.

        Returns:
            The query results, or None if the query failed (the error is printed).
        """
        return self._load_or_report(
            "Error querying Wikipedia",
            lambda: WikipediaLoader(query=search_query, load_max_docs=2).load(),
        )

    def load_urls(self, urls):
        """
        Load and parse content from a list of URLs.

        Args:
            urls (list): A list of URLs to load.

        Returns:
            The loaded data from the URLs, or None if loading failed
            (the error is printed).
        """
        return self._load_or_report(
            "Error loading URLs",
            lambda: UnstructuredURLLoader(urls=urls).load(),
        )

    def load_YouTubeVideo(self, urls):
        """
        Load YouTube video information from one or more video URLs.

        ``YoutubeLoader.from_youtube_url`` is called once per video, so each
        URL gets its own loader and the resulting documents are concatenated
        in input order.

        Args:
            urls (list | str): A list of YouTube video URLs, or a single URL
                string (treated as a one-element list).

        Returns:
            A list with the documents of all videos (empty when ``urls`` is
            empty), or None if loading any video failed (the error is printed).
        """
        def load_all_videos():
            # Normalised inside the guarded call so that a bad ``urls`` value
            # (e.g. None) is reported like any other failure instead of raising.
            video_urls = [urls] if isinstance(urls, str) else list(urls)
            documents = []
            for video_url in video_urls:
                loader = YoutubeLoader.from_youtube_url(
                    video_url,
                    add_video_info=True,
                    language=["en", "pt", "zh-Hans", "es", "ur", "hi"],
                    translation="en",
                )
                documents.extend(loader.load())
            return documents

        return self._load_or_report("Error loading YouTube video", load_all_videos)

    def load_pdf(self, path):
        """
        Load data from a PDF file at the specified path.

        Uses the loader's ``load_and_split()`` with its default arguments.

        Args:
            path (str): The file path to the PDF file.

        Returns:
            The loaded and split PDF pages, or None if loading failed
            (the error is printed).
        """
        return self._load_or_report(
            "Error loading PDF",
            lambda: PyPDFLoader(path).load_and_split(),
        )

    def load_text_from_html(self, path):
        """
        Load and parse text content from an HTML file at the specified path.

        Args:
            path (str): The file path to the HTML file.

        Returns:
            The loaded HTML data, or None if loading failed (the error is printed).
        """
        return self._load_or_report(
            "Error loading text from HTML",
            lambda: BSHTMLLoader(path).load(),
        )

    def load_markdown(self, path):
        """
        Load data from a Markdown file at the specified path.

        Args:
            path (str): The file path to the Markdown file.

        Returns:
            The loaded Markdown data, or None if loading failed
            (the error is printed).
        """
        return self._load_or_report(
            "Error loading Markdown",
            lambda: UnstructuredMarkdownLoader(path).load(),
        )

    def load_doc(self, path):
        """
        Load data from a DOCX file at the specified path.

        Args:
            path (str): The file path to the DOCX file.

        Returns:
            The loaded DOCX data, or None if loading failed (the error is printed).
        """
        return self._load_or_report(
            "Error loading DOCX",
            lambda: Docx2txtLoader(path).load(),
        )