Spaces:
Running
Running
from langchain_community.document_loaders import (CSVLoader, WikipediaLoader, UnstructuredURLLoader, | |
YoutubeLoader, PyPDFLoader, BSHTMLLoader, | |
Docx2txtLoader, UnstructuredMarkdownLoader) | |
from langchain_unstructured import UnstructuredLoader | |
class DocumentLoader:
    """Convenience wrapper around several LangChain document loaders.

    Every public method is best-effort: when the underlying loader raises,
    the error is printed to stdout and the method returns ``None`` instead
    of propagating the exception, so callers must check for ``None``.
    """

    @staticmethod
    def _attempt(failure_label, action):
        """Run ``action`` and return its result, or ``None`` if it raises.

        Args:
            failure_label (str): Text inserted after ``"Error "`` in the
                printed message (e.g. ``"loading CSV"``).
            action (callable): Zero-argument callable performing the load.

        Returns:
            Whatever ``action`` returns, or ``None`` if it raised.
        """
        try:
            return action()
        except Exception as exc:
            # Deliberate best-effort boundary: report the problem and signal
            # failure with None rather than crashing the caller.
            print(f"Error {failure_label}: {exc}")
            return None

    def load_unstructured(self, path):
        """Load a file through ``UnstructuredLoader``.

        File types listed as supported by the original notes:
        "csv", "doc", "docx", "epub", "image", "md", "msg", "odt", "org",
        "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx"

        Args:
            path (str): Path of the file to load; passed unchanged to
                ``UnstructuredLoader``.

        Returns:
            The loaded data, or ``None`` if loading fails (the error is
            printed).
        """
        return self._attempt(
            "loading Unstructured",
            lambda: UnstructuredLoader(path).load(),
        )

    def load_csv(self, path):
        """Load a CSV file through ``CSVLoader``.

        Args:
            path (str): The file path to the CSV file.

        Returns:
            The loaded CSV data, or ``None`` if loading fails (the error is
            printed).
        """
        return self._attempt(
            "loading CSV",
            lambda: CSVLoader(file_path=path).load(),
        )

    def wikipedia_query(self, search_query, load_max_docs=2):
        """Query Wikipedia through ``WikipediaLoader`` and return the results.

        Args:
            search_query (str): The search term to query on Wikipedia.
            load_max_docs (int): Maximum number of documents to request from
                the loader. Defaults to 2, the value that was previously
                hard-coded.

        Returns:
            The query results, or ``None`` if the query fails (the error is
            printed).
        """
        return self._attempt(
            "querying Wikipedia",
            lambda: WikipediaLoader(
                query=search_query, load_max_docs=load_max_docs
            ).load(),
        )

    def load_urls(self, urls):
        """Load and parse content from one or more URLs.

        Args:
            urls (list | str): A list of URLs to load. A single URL string is
                also accepted and is treated as a one-element list, so it is
                not iterated character by character.

        Returns:
            The loaded data from the URLs, or ``None`` if loading fails (the
            error is printed).
        """
        def fetch():
            targets = [urls] if isinstance(urls, str) else urls
            return UnstructuredURLLoader(urls=targets).load()

        return self._attempt("loading URLs", fetch)

    def load_YouTubeVideo(self, urls):
        """Load YouTube video documents from one or more video URLs.

        ``YoutubeLoader.from_youtube_url`` handles a single URL per call, so
        each URL gets its own loader and the resulting documents are
        concatenated in input order.

        Args:
            urls (list | str): A list (or other iterable) of YouTube video
                URLs, or a single URL string.

        Returns:
            A list with the documents of every URL (empty when ``urls`` is
            empty), or ``None`` if loading any URL fails (the error is
            printed).
        """
        def fetch():
            targets = [urls] if isinstance(urls, str) else list(urls)
            documents = []
            for url in targets:
                loader = YoutubeLoader.from_youtube_url(
                    url,
                    add_video_info=True,
                    language=["en", "pt", "zh-Hans", "es", "ur", "hi"],
                    translation="en",
                )
                documents.extend(loader.load())
            return documents

        return self._attempt("loading YouTube video", fetch)

    def load_pdf(self, path):
        """Load a PDF file with ``PyPDFLoader`` and split it.

        Args:
            path (str): The file path to the PDF file.

        Returns:
            The result of ``PyPDFLoader.load_and_split()``, or ``None`` if
            loading fails (the error is printed).
        """
        return self._attempt(
            "loading PDF",
            lambda: PyPDFLoader(path).load_and_split(),
        )

    def load_text_from_html(self, path):
        """Load and parse text content from an HTML file via ``BSHTMLLoader``.

        Args:
            path (str): The file path to the HTML file.

        Returns:
            The loaded HTML data, or ``None`` if loading fails (the error is
            printed).
        """
        return self._attempt(
            "loading text from HTML",
            lambda: BSHTMLLoader(path).load(),
        )

    def load_markdown(self, path):
        """Load a Markdown file through ``UnstructuredMarkdownLoader``.

        Args:
            path (str): The file path to the Markdown file.

        Returns:
            The loaded Markdown data, or ``None`` if loading fails (the error
            is printed).
        """
        return self._attempt(
            "loading Markdown",
            lambda: UnstructuredMarkdownLoader(path).load(),
        )

    def load_doc(self, path):
        """Load a DOCX file through ``Docx2txtLoader``.

        Args:
            path (str): The file path to the DOCX file.

        Returns:
            The loaded DOCX data, or ``None`` if loading fails (the error is
            printed).
        """
        return self._attempt(
            "loading DOCX",
            lambda: Docx2txtLoader(path).load(),
        )