|
import os, requests, time |
|
import oracledb |
|
from langchain_text_splitters import ( |
|
Language, |
|
RecursiveCharacterTextSplitter, |
|
) |
|
from langchain_upstage import UpstageLayoutAnalysisLoader |
|
from bs4 import BeautifulSoup |
|
from langchain_core.documents import BaseDocumentTransformer, Document |
|
from langchain_upstage import UpstageEmbeddings |
|
from langchain_community.vectorstores.oraclevs import OracleVS |
|
from langchain_community.vectorstores.utils import DistanceStrategy |
|
|
|
|
|
def upload_db_harrypotter():
    """Ingest the Harry Potter PDF into Oracle and publish a retriever.

    Reads ``DB_USER``, ``DB_PASSWORD`` and ``DSN`` from the environment,
    connects to Oracle, loads and chunks the source document, embeds the
    chunks with Upstage ``solar-embedding-1-large`` and hands them to
    ``OracleVS.from_documents`` for the ``text_embeddings_HarryPotter``
    table. The resulting retriever is assigned to the module-level global
    ``retriever``.

    Raises:
        KeyError: if one of the required environment variables is missing.
        ValueError: if the source is neither a ``.pdf`` path nor an
            ``http`` URL, or the fetched page has no ``mw-content-text``
            element.
        requests.HTTPError: if an HTTP source answers with an error status.
        Exception: a connection error is re-raised after being reported.
    """
    global retriever

    username = os.environ["DB_USER"]
    password = os.environ["DB_PASSWORD"]
    dsn = os.environ["DSN"]

    # Open exactly one connection. A failure is reported and re-raised
    # instead of surfacing later as an unrelated NameError on `conn23c`.
    try:
        conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
    except Exception:
        print("Connection failed!")
        raise
    print("Connection successful!", conn23c.version)

    file_path = "./Harry Potter and the Sorcerers Stone.pdf"
    table_name = "text_embeddings_HarryPotter"

    try:
        text_splitter = RecursiveCharacterTextSplitter.from_language(
            chunk_size=1500, chunk_overlap=200, language=Language.HTML
        )

        if file_path.endswith(".pdf"):
            # Layout analysis is done by Upstage, one Document per page.
            loader = UpstageLayoutAnalysisLoader(file_path, split="page")
            docs = text_splitter.split_documents(loader.load())
        elif file_path.startswith("http"):
            # Bounded wait and explicit status check so a hung or failing
            # server cannot stall the upload or feed an error page in.
            response = requests.get(file_path, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, "html.parser")
            content = soup.find(id="mw-content-text")
            if content is None:
                raise ValueError(
                    f"No element with id 'mw-content-text' found at {file_path!r}"
                )

            # Remove the elements carrying these ids before extracting text.
            for section_id in (
                "References",
                "references",
                "See_also",
                "External_links",
                "Footnotes",
                "Further_reading",
            ):
                node = content.find(id=section_id)
                if node is not None:
                    node.decompose()

            # Remove every element carrying one of these classes.
            for section_class in ("reflist", "references", "navbox"):
                for node in content.find_all(class_=section_class):
                    node.decompose()

            page = Document(page_content=content.get_text())
            docs = text_splitter.split_documents([page])
        else:
            raise ValueError(f"Unsupported document source: {file_path!r}")

        upstage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

        # from_documents returns the vector store it populated, so it is
        # used directly rather than building a second OracleVS on the table.
        vector_store = OracleVS.from_documents(
            docs,
            upstage_embeddings,
            client=conn23c,
            table_name=table_name,
            distance_strategy=DistanceStrategy.DOT_PRODUCT,
        )
    except Exception:
        # Ingestion failed: nothing will use this connection, so release it.
        conn23c.close()
        raise

    # On success the connection stays open: the retriever keeps using it.
    retriever = vector_store.as_retriever()
|
|
|
|
|
def upload_db_Trump():
    """Ingest the Donald Trump Wikipedia page into Oracle and publish a retriever.

    Reads ``DB_USER``, ``DB_PASSWORD`` and ``DSN`` from the environment,
    connects to Oracle, downloads and chunks the source page, embeds the
    chunks with Upstage ``solar-embedding-1-large`` and hands them to
    ``OracleVS.from_documents`` for the ``text_embeddings_Trump`` table.
    The resulting retriever is assigned to the module-level global
    ``retriever``.

    Raises:
        KeyError: if one of the required environment variables is missing.
        ValueError: if the source is neither a ``.pdf`` path nor an
            ``http`` URL, or the fetched page has no ``mw-content-text``
            element.
        requests.HTTPError: if the page request answers with an error status.
        Exception: a connection error is re-raised after being reported.
    """
    global retriever

    username = os.environ["DB_USER"]
    password = os.environ["DB_PASSWORD"]
    dsn = os.environ["DSN"]

    # Open exactly one connection. A failure is reported and re-raised
    # instead of surfacing later as an unrelated NameError on `conn23c`.
    try:
        conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
    except Exception:
        print("Connection failed!")
        raise
    print("Connection successful!", conn23c.version)

    file_path = "https://en.wikipedia.org/wiki/Donald_Trump"
    table_name = "text_embeddings_Trump"

    try:
        text_splitter = RecursiveCharacterTextSplitter.from_language(
            chunk_size=1500, chunk_overlap=200, language=Language.HTML
        )

        if file_path.endswith(".pdf"):
            # Layout analysis is done by Upstage, one Document per page.
            loader = UpstageLayoutAnalysisLoader(file_path, split="page")
            docs = text_splitter.split_documents(loader.load())
        elif file_path.startswith("http"):
            # Bounded wait and explicit status check so a hung or failing
            # server cannot stall the upload or feed an error page in.
            response = requests.get(file_path, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, "html.parser")
            content = soup.find(id="mw-content-text")
            if content is None:
                raise ValueError(
                    f"No element with id 'mw-content-text' found at {file_path!r}"
                )

            # Remove the elements carrying these ids before extracting text.
            for section_id in (
                "References",
                "references",
                "See_also",
                "External_links",
                "Footnotes",
                "Further_reading",
            ):
                node = content.find(id=section_id)
                if node is not None:
                    node.decompose()

            # Remove every element carrying one of these classes.
            for section_class in ("reflist", "references", "navbox"):
                for node in content.find_all(class_=section_class):
                    node.decompose()

            page = Document(page_content=content.get_text())
            docs = text_splitter.split_documents([page])
        else:
            raise ValueError(f"Unsupported document source: {file_path!r}")

        upstage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

        # from_documents returns the vector store it populated, so it is
        # used directly rather than building a second OracleVS on the table.
        vector_store = OracleVS.from_documents(
            docs,
            upstage_embeddings,
            client=conn23c,
            table_name=table_name,
            distance_strategy=DistanceStrategy.DOT_PRODUCT,
        )
    except Exception:
        # Ingestion failed: nothing will use this connection, so release it.
        conn23c.close()
        raise

    # On success the connection stays open: the retriever keeps using it.
    retriever = vector_store.as_retriever()
|
|
|
|
|
def main():
    """Script entry point: run the Harry Potter upload."""
    upload_db_harrypotter()
|
|
|
# Run the upload only when executed as a script, not when imported.
if __name__ == "__main__":
    main()