"""
Parse documents, currently pdf and xml are supported.
"""
import os
from langchain.document_loaders import (
PyMuPDFLoader,
)
from langchain.docstore.document import Document
from langchain.text_splitter import (
# RecursiveCharacterTextSplitter,
SpacyTextSplitter,
)


def load_pdf_as_docs(pdf_path, loader_module=None, load_kwargs=None):
    """Load and parse PDF file(s) from a single .pdf path or a directory of PDFs."""
if pdf_path.endswith(".pdf"): # single file
pdf_docs = [pdf_path]
else: # a directory
pdf_docs = [
os.path.join(pdf_path, f)
for f in os.listdir(pdf_path)
if f.endswith(".pdf")
]
if load_kwargs is None:
load_kwargs = {}
docs = []
if loader_module is None: # set pdf loader
loader_module = PyMuPDFLoader
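        # Default to PyMuPDFLoader; other LangChain PDF loaders with the same
        # constructor signature (e.g. PyPDFLoader) could be passed via `loader_module`
        # (an assumption, not something this module verifies).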
for pdf in pdf_docs:
loader = loader_module(pdf, **load_kwargs)
doc = loader.load()
docs.extend(doc)
return docs
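
# Example usage, as a rough sketch (the paths below are hypothetical):
#   docs = load_pdf_as_docs("papers/")            # every *.pdf file in a directory
#   docs = load_pdf_as_docs("papers/report.pdf")  # or a single PDF file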


def load_xml_as_docs(xml_path, loader_module=None, load_kwargs=None):
    """Load and parse XML file(s); `loader_module` and `load_kwargs` are accepted
    but not currently used by the XML parser."""
from bs4 import BeautifulSoup
from unstructured.cleaners.core import group_broken_paragraphs
if xml_path.endswith(".xml"): # single file
xml_docs = [xml_path]
else: # a directory
xml_docs = [
os.path.join(xml_path, f)
for f in os.listdir(xml_path)
if f.endswith(".xml")
]
if load_kwargs is None:
load_kwargs = {}
docs = []
for xml_file in xml_docs:
        with open(xml_file) as fp:
            soup = BeautifulSoup(
                fp, features="xml"
            )  # parse the raw XML into a BeautifulSoup tree
            page_text = soup.find_all(string=True)
            parsed_text = "\n".join(page_text)  # or " ".join, which seems similar
            # Clean text: regroup paragraphs broken across lines
            parsed_text_grouped = group_broken_paragraphs(parsed_text)
        # Try to extract source metadata (first author + title) from the TEI header
        try:
            from lxml import etree as ET
            tree = ET.parse(xml_file)
            # Define the TEI namespace
            ns = {"tei": "http://www.tei-c.org/ns/1.0"}
            # Read author persName elements; the first one is used below
            pers_name_elements = tree.xpath(
                "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/tei:persName",
                namespaces=ns,
            )
            first_per = pers_name_elements[0].text
            author_info = first_per + " et al"
            title_elements = tree.xpath(
                "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title", namespaces=ns
            )
            title = title_elements[0].text
            # Combine source info
            source_info = "_".join([author_info, title])
        except Exception:  # header missing or malformed
            source_info = "unknown"
        # Possibly a better parsing method. TODO: discuss with TUD
        # first_author = soup.find("author")
        # publication_year = soup.find("date", attrs={'type': 'published'})
        # title = soup.find("title")
        # source_info = [first_author, publication_year, title]
        # source_info_str = "_".join([info.text.strip() if info is not None else "unknown" for info in source_info])
        docs.append(
            Document(
                page_content=parsed_text_grouped, metadata={"source": source_info}
            )
        )
return docs
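
# Example usage, as a sketch (the directory name is hypothetical; the TEI header paths
# above suggest the XML files come from a tool like GROBID, but that is an assumption):
#   xml_docs = load_xml_as_docs("xml_fulltexts/")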


def get_doc_chunks(docs, splitter=None):
"""Split docs into chunks."""
if splitter is None:
# splitter = RecursiveCharacterTextSplitter( # original default
# # separators=["\n\n", "\n"], chunk_size=1024, chunk_overlap=256
# separators=["\n\n", "\n"], chunk_size=256, chunk_overlap=128
# )
# Spacy seems better
splitter = SpacyTextSplitter.from_tiktoken_encoder(
chunk_size=512,
chunk_overlap=128,
)
chunks = splitter.split_documents(docs)
return chunks
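

# Minimal end-to-end sketch tying the helpers together; "papers/" is a hypothetical
# directory of PDF files, not something defined elsewhere in this module.
if __name__ == "__main__":
    documents = load_pdf_as_docs("papers/")
    chunks = get_doc_chunks(documents)
    print(f"Loaded {len(documents)} pages and split them into {len(chunks)} chunks.")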