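"""Load a document (a local PDF via Upstage Layout Analysis, or a Wikipedia
page via BeautifulSoup), split it into chunks, embed the chunks with Upstage's
solar-embedding-1-large model, and store the vectors in an Oracle 23c database
through LangChain's OracleVS vector store. Each upload function also binds a
module-level `retriever` for similarity search over the resulting table."""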
import os
import time

import oracledb
import requests
from bs4 import BeautifulSoup
from langchain_community.vectorstores.oraclevs import OracleVS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.documents import Document
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from langchain_upstage import UpstageEmbeddings, UpstageLayoutAnalysisLoader

def upload_db_harrypotter():
    global retriever

    # Connect to the Oracle database with credentials from the environment.
    username = os.environ["DB_USER"]
    password = os.environ["DB_PASSWORD"]
    dsn = os.environ["DSN"]
    try:
        conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
        print("Connection successful!", conn23c.version)
    except Exception as e:
        print("Connection failed!", e)
        return

    file_path = "./Harry Potter and the Sorcerers Stone.pdf"  # Path to the PDF file
    # file_path = "https://en.wikipedia.org/wiki/Donald_Trump"

    text_splitter = RecursiveCharacterTextSplitter.from_language(
        chunk_size=1500, chunk_overlap=200, language=Language.HTML
    )

    if file_path.endswith(".pdf"):  # The document is a local PDF
        loader = UpstageLayoutAnalysisLoader(file_path, split="page")
        # For improved memory efficiency, consider using the lazy_load method
        # to load documents page by page.
        docs = loader.load()  # or loader.lazy_load()
        docs = text_splitter.split_documents(docs)
    elif file_path.startswith("http"):  # The document is a web page
        # Send a GET request to the URL and parse the HTML content
        response = requests.get(file_path)
        soup = BeautifulSoup(response.content, "html.parser")
        # Find the main content element on the page
        content = soup.find(id="mw-content-text")
        # Remove sections that would only add noise to the embeddings
        for section_id in ["References", "references", "See_also", "External_links", "Footnotes", "Further_reading"]:
            element = content.find(id=section_id)
            if element:
                element.decompose()
        for section_class in ["reflist", "references", "navbox"]:
            for element in content.find_all(class_=section_class):
                element.decompose()
        # Extract the text from the cleaned-up content element
        text = content.get_text()
        docs = text_splitter.split_documents([Document(page_content=text)])

    upstage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

    # Vectorize the chunks and insert them into the table, using the indicated
    # distance strategy for the similarity search.
    s1time = time.time()
    knowledge_base = OracleVS.from_documents(
        docs,
        upstage_embeddings,
        client=conn23c,
        table_name="text_embeddings_HarryPotter",
        distance_strategy=DistanceStrategy.DOT_PRODUCT,
    )
    s2time = time.time()
    # print(f"Vectorizing and inserting chunks duration: {round(s2time - s1time, 1)} sec.")

    vector_store = OracleVS(
        client=conn23c,
        embedding_function=upstage_embeddings,
        table_name="text_embeddings_HarryPotter",
        distance_strategy=DistanceStrategy.DOT_PRODUCT,
    )
    retriever = vector_store.as_retriever()
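
# Illustrative sketch (not part of the original script): once an upload
# function has run, the global `retriever` can be queried through the standard
# LangChain Runnable interface. The question below is a made-up example.
#
#   upload_db_harrypotter()
#   hits = retriever.invoke("Who gave Harry his scar?")
#   for doc in hits:
#       print(doc.page_content[:200])
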
def upload_db_Trump():
    global retriever

    # Connect to the Oracle database with credentials from the environment.
    username = os.environ["DB_USER"]
    password = os.environ["DB_PASSWORD"]
    dsn = os.environ["DSN"]
    try:
        conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
        print("Connection successful!", conn23c.version)
    except Exception as e:
        print("Connection failed!", e)
        return

    # file_path = "./Harry Potter and the Sorcerers Stone.pdf"  # Path to the PDF file
    file_path = "https://en.wikipedia.org/wiki/Donald_Trump"

    text_splitter = RecursiveCharacterTextSplitter.from_language(
        chunk_size=1500, chunk_overlap=200, language=Language.HTML
    )

    if file_path.endswith(".pdf"):  # The document is a local PDF
        loader = UpstageLayoutAnalysisLoader(file_path, split="page")
        # For improved memory efficiency, consider using the lazy_load method
        # to load documents page by page.
        docs = loader.load()  # or loader.lazy_load()
        docs = text_splitter.split_documents(docs)
    elif file_path.startswith("http"):  # The document is a web page
        # Send a GET request to the URL and parse the HTML content
        response = requests.get(file_path)
        soup = BeautifulSoup(response.content, "html.parser")
        # Find the main content element on the page
        content = soup.find(id="mw-content-text")
        # Remove sections that would only add noise to the embeddings
        for section_id in ["References", "references", "See_also", "External_links", "Footnotes", "Further_reading"]:
            element = content.find(id=section_id)
            if element:
                element.decompose()
        for section_class in ["reflist", "references", "navbox"]:
            for element in content.find_all(class_=section_class):
                element.decompose()
        # Extract the text from the cleaned-up content element
        text = content.get_text()
        docs = text_splitter.split_documents([Document(page_content=text)])

    upstage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

    # Vectorize the chunks and insert them into the table, using the indicated
    # distance strategy for the similarity search.
    s1time = time.time()
    knowledge_base = OracleVS.from_documents(
        docs,
        upstage_embeddings,
        client=conn23c,
        table_name="text_embeddings_Trump",
        distance_strategy=DistanceStrategy.DOT_PRODUCT,
    )
    s2time = time.time()
    # print(f"Vectorizing and inserting chunks duration: {round(s2time - s1time, 1)} sec.")

    vector_store = OracleVS(
        client=conn23c,
        embedding_function=upstage_embeddings,
        table_name="text_embeddings_Trump",
        distance_strategy=DistanceStrategy.DOT_PRODUCT,
    )
    retriever = vector_store.as_retriever()
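
# Note: both upload functions assign the same module-level `retriever`, so the
# most recently called function decides which table backs retrieval, e.g.:
#
#   upload_db_Trump()        # retriever now searches text_embeddings_Trump
#   upload_db_harrypotter()  # retriever now searches text_embeddings_HarryPotter
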
def main():
    upload_db_harrypotter()


if __name__ == "__main__":
    main()
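
# Expected environment (variable names taken from the code above; the values
# are placeholders). UPSTAGE_API_KEY is an assumption: langchain_upstage reads
# it for both the layout-analysis loader and the embeddings model.
#
#   export DB_USER=...
#   export DB_PASSWORD=...
#   export DSN=...              # e.g. host:port/service_name
#   export UPSTAGE_API_KEY=...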