Spaces:

mominah
/

Chatbot-backend

Running

File size: 5,414 Bytes

7b7cab6

from langchain_community.document_loaders  import (CSVLoader, WikipediaLoader, UnstructuredURLLoader,
                                        YoutubeLoader, PyPDFLoader, BSHTMLLoader,
                                        Docx2txtLoader, UnstructuredMarkdownLoader)

from langchain_unstructured import UnstructuredLoader


class DocumentLoader:
    """Thin best-effort wrappers around LangChain document loaders.

    Every method builds the matching loader, runs it and returns whatever
    the loader produced. Any exception raised along the way is caught, a
    short message is printed, and ``None`` is returned instead, so callers
    must check for ``None`` before using the result.
    """

    def load_unstructured(self, path):
        """
        Load data from a file at the specified path with ``UnstructuredLoader``.

        File types listed by the original author as supported:
        "csv", "doc", "docx", "epub", "image", "md", "msg", "odt", "org",
        "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx"

        Args:
            path (str): The file path.

        Returns:
            The loaded data, or None if loading failed.

        Exceptions:
            None are propagated; an error message is printed if loading fails.
        """
        try:
            return UnstructuredLoader(path).load()
        except Exception as e:
            print(f"Error loading Unstructured: {e}")
            return None

    def load_csv(self, path):
        """
        Load data from a CSV file at the specified path.

        Args:
            path (str): The file path to the CSV file.

        Returns:
            The loaded CSV data, or None if loading failed.

        Exceptions:
            None are propagated; an error message is printed if loading fails.
        """
        try:
            return CSVLoader(file_path=path).load()
        except Exception as e:
            print(f"Error loading CSV: {e}")
            return None

    def wikipedia_query(self, search_query):
        """
        Query Wikipedia using a given search term and return the results.

        At most two documents are requested (``load_max_docs=2``).

        Args:
            search_query (str): The search term to query on Wikipedia.

        Returns:
            The query results, or None if the query failed.

        Exceptions:
            None are propagated; an error message is printed if the query
            fails.
        """
        try:
            return WikipediaLoader(query=search_query, load_max_docs=2).load()
        except Exception as e:
            print(f"Error querying Wikipedia: {e}")
            return None

    def load_urls(self, urls):
        """
        Load and parse content from one or more URLs.

        Args:
            urls (list | str): A list of URLs to load. A single URL string
                is also accepted and treated as a one-element list.

        Returns:
            The loaded data from the URLs, or None if loading failed.

        Exceptions:
            None are propagated; an error message is printed if loading fails.
        """
        try:
            # The loader iterates over ``urls``; a bare string would be
            # walked character by character, so wrap it first.
            if isinstance(urls, str):
                urls = [urls]
            return UnstructuredURLLoader(urls=urls).load()
        except Exception as e:
            print(f"Error loading URLs: {e}")
            return None

    def load_YouTubeVideo(self, urls):
        """
        Load YouTube video transcripts from one or more video URLs.

        ``YoutubeLoader.from_youtube_url`` handles exactly one URL, so each
        URL gets its own loader and the resulting documents are concatenated
        in input order.

        Args:
            urls (list | str): A list of YouTube video URLs, or a single
                YouTube video URL string.

        Returns:
            A list with the documents of all requested videos (empty when
            ``urls`` is empty), or None if any video failed to load.

        Exceptions:
            None are propagated; an error message is printed if loading fails.
        """
        try:
            video_urls = [urls] if isinstance(urls, str) else urls
            documents = []
            for video_url in video_urls:
                loader = YoutubeLoader.from_youtube_url(
                    video_url,
                    add_video_info=True,
                    # Transcript language preference, then translate to English.
                    language=["en", "pt", "zh-Hans", "es", "ur", "hi"],
                    translation="en",
                )
                documents.extend(loader.load())
            return documents
        except Exception as e:
            print(f"Error loading YouTube video: {e}")
            return None

    def load_pdf(self, path):
        """
        Load data from a PDF file at the specified path.

        Args:
            path (str): The file path to the PDF file.

        Returns:
            The documents produced by ``PyPDFLoader.load_and_split()``, or
            None if loading failed.

        Exceptions:
            None are propagated; an error message is printed if loading fails.
        """
        try:
            return PyPDFLoader(path).load_and_split()
        except Exception as e:
            print(f"Error loading PDF: {e}")
            return None

    def load_text_from_html(self, path):
        """
        Load and parse text content from an HTML file at the specified path.

        Args:
            path (str): The file path to the HTML file.

        Returns:
            The loaded HTML data, or None if loading failed.

        Exceptions:
            None are propagated; an error message is printed if loading fails.
        """
        try:
            return BSHTMLLoader(path).load()
        except Exception as e:
            print(f"Error loading text from HTML: {e}")
            return None

    def load_markdown(self, path):
        """
        Load data from a Markdown file at the specified path.

        Args:
            path (str): The file path to the Markdown file.

        Returns:
            The loaded Markdown data, or None if loading failed.

        Exceptions:
            None are propagated; an error message is printed if loading fails.
        """
        try:
            return UnstructuredMarkdownLoader(path).load()
        except Exception as e:
            print(f"Error loading Markdown: {e}")
            return None

    def load_doc(self, path):
        """
        Load data from a DOCX file at the specified path.

        Args:
            path (str): The file path to the DOCX file.

        Returns:
            The loaded DOCX data, or None if loading failed.

        Exceptions:
            None are propagated; an error message is printed if loading fails.
        """
        try:
            return Docx2txtLoader(path).load()
        except Exception as e:
            print(f"Error loading DOCX: {e}")
            return None