""" Parse documents, currently pdf and xml are supported. """ import os from langchain.document_loaders import ( PyMuPDFLoader, ) from langchain.docstore.document import Document from langchain.text_splitter import ( # RecursiveCharacterTextSplitter, SpacyTextSplitter, ) def load_pdf_as_docs(pdf_path, loader_module=None, load_kwargs=None): """Load and parse pdf file(s).""" if pdf_path.endswith(".pdf"): # single file pdf_docs = [pdf_path] else: # a directory pdf_docs = [ os.path.join(pdf_path, f) for f in os.listdir(pdf_path) if f.endswith(".pdf") ] if load_kwargs is None: load_kwargs = {} docs = [] if loader_module is None: # set pdf loader loader_module = PyMuPDFLoader for pdf in pdf_docs: loader = loader_module(pdf, **load_kwargs) doc = loader.load() docs.extend(doc) return docs def load_xml_as_docs(xml_path, loader_module=None, load_kwargs=None): """Load and parse xml file(s).""" from bs4 import BeautifulSoup from unstructured.cleaners.core import group_broken_paragraphs if xml_path.endswith(".xml"): # single file xml_docs = [xml_path] else: # a directory xml_docs = [ os.path.join(xml_path, f) for f in os.listdir(xml_path) if f.endswith(".xml") ] if load_kwargs is None: load_kwargs = {} docs = [] for xml_file in xml_docs: with open(xml_file) as fp: soup = BeautifulSoup( fp, features="xml" ) # txt is simply the a string with your XML file pageText = soup.findAll(string=True) parsed_text = "\n".join(pageText) # or " ".join, seems similar # Clean text parsed_text_grouped = group_broken_paragraphs(parsed_text) # get metadata try: from lxml import etree as ET tree = ET.parse(xml_file) # Define namespace ns = {"tei": "http://www.tei-c.org/ns/1.0"} # Read Author personal names as an example pers_name_elements = tree.xpath( "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/tei:persName", namespaces=ns, ) first_per = pers_name_elements[0].text author_info = first_per + " et al" title_elements = tree.xpath( "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title", namespaces=ns ) title = title_elements[0].text # Combine source info source_info = "_".join([author_info, title]) except: source_info = "unknown" # maybe even better parsing method. TODO: discuss with TUD # first_author = soup.find("author") # publication_year = soup.find("date", attrs={'type': 'published'}) # title = soup.find("title") # source_info = [first_author, publication_year, title] # source_info_str = "_".join([info.text.strip() if info is not None else "unknown" for info in source_info]) doc = [ Document( page_content=parsed_text_grouped, metadata={"source": source_info} ) ] docs.extend(doc) return docs def get_doc_chunks(docs, splitter=None): """Split docs into chunks.""" if splitter is None: # splitter = RecursiveCharacterTextSplitter( # original default # # separators=["\n\n", "\n"], chunk_size=1024, chunk_overlap=256 # separators=["\n\n", "\n"], chunk_size=256, chunk_overlap=128 # ) # Spacy seems better splitter = SpacyTextSplitter.from_tiktoken_encoder( chunk_size=512, chunk_overlap=128, ) chunks = splitter.split_documents(docs) return chunks