# LRPG/prototype/upload_db.py
import os
import time

import oracledb
import requests
from bs4 import BeautifulSoup
from langchain_community.vectorstores.oraclevs import OracleVS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.documents import Document
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from langchain_upstage import UpstageEmbeddings, UpstageLayoutAnalysisLoader
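# Both functions below assume these environment variables are set:
#   DB_USER, DB_PASSWORD, DSN -- Oracle database credentials and connect string
#   UPSTAGE_API_KEY           -- read implicitly by the langchain_upstage clients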
def upload_db_harrypotter():
    global retriever
    username = os.environ["DB_USER"]
    password = os.environ["DB_PASSWORD"]
    dsn = os.environ["DSN"]
    try:
        conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
        print("Connection successful!", conn23c.version)
    except Exception as e:
        print("Connection failed!", e)
        raise
file_path = "./Harry Potter and the Sorcerers Stone.pdf" # Path to the PDF file
#file_path = "https://en.wikipedia.org/wiki/Donald_Trump"
text_splitter = RecursiveCharacterTextSplitter.from_language(
chunk_size=1500, chunk_overlap=200, language=Language.HTML
)
if file_path.endswith(".pdf"): # Check if the document is PDF
#print("pdf file")
layzer = UpstageLayoutAnalysisLoader(file_path, split="page")
# For improved memory efficiency, consider using the lazy_load method to load documents page by page.
docs = layzer.load() # or layzer.lazy_load()
docs = text_splitter.split_documents(docs)
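        # A minimal sketch of the lazy alternative (assumes lazy_load() yields
        # one Document per page, matching split="page" above):
        #
        #     docs = []
        #     for page in loader.lazy_load():
        #         docs.extend(text_splitter.split_documents([page]))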
    elif file_path.startswith("http"):  # The document is a web page
        # Send a GET request and parse the HTML content
        response = requests.get(file_path)
        soup = BeautifulSoup(response.content, "html.parser")
        # Find the main content element of the Wikipedia page
        content = soup.find(id="mw-content-text")
        # Remove sections that would only add noise to the embeddings
        for section_id in ["References", "references", "See_also", "External_links",
                           "Footnotes", "Further_reading"]:
            element = content.find(id=section_id)
            if element:
                element.decompose()
        for section_class in ["reflist", "references", "navbox"]:
            for element in content.find_all(class_=section_class):
                element.decompose()
        # Extract the remaining text and split it into chunks
        text = content.get_text()
        docs = text_splitter.split_documents([Document(page_content=text)])
    upstage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")
    # Vectorize the chunks and insert them into the Oracle table, using
    # dot-product distance for similarity search; time the operation.
    s1time = time.time()
    knowledge_base = OracleVS.from_documents(
        docs,
        upstage_embeddings,
        client=conn23c,
        table_name="text_embeddings_HarryPotter",
        distance_strategy=DistanceStrategy.DOT_PRODUCT,
    )
    s2time = time.time()
    print(f"Vectorizing and inserting chunks took {round(s2time - s1time, 1)} sec.")
    # from_documents already returns a ready-to-query OracleVS instance,
    # so it can back the retriever directly.
    retriever = knowledge_base.as_retriever()
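    # Example usage (hypothetical query; invoke() returns the top-matching chunks):
    #
    #     chunks = retriever.invoke("Who gave Harry his first broomstick?")
    #     for chunk in chunks:
    #         print(chunk.page_content[:200])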
def upload_db_Trump():
    global retriever
    username = os.environ["DB_USER"]
    password = os.environ["DB_PASSWORD"]
    dsn = os.environ["DSN"]
    try:
        conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
        print("Connection successful!", conn23c.version)
    except Exception as e:
        print("Connection failed!", e)
        raise
    file_path = "https://en.wikipedia.org/wiki/Donald_Trump"
    text_splitter = RecursiveCharacterTextSplitter.from_language(
        chunk_size=1500, chunk_overlap=200, language=Language.HTML
    )
    if file_path.endswith(".pdf"):  # The document is a local PDF
        loader = UpstageLayoutAnalysisLoader(file_path, split="page")
        # For improved memory efficiency, consider lazy_load to process pages one at a time.
        docs = loader.load()
        docs = text_splitter.split_documents(docs)
    elif file_path.startswith("http"):  # The document is a web page
        # Send a GET request and parse the HTML content
        response = requests.get(file_path)
        soup = BeautifulSoup(response.content, "html.parser")
        # Find the main content element of the Wikipedia page
        content = soup.find(id="mw-content-text")
        # Remove sections that would only add noise to the embeddings
        for section_id in ["References", "references", "See_also", "External_links",
                           "Footnotes", "Further_reading"]:
            element = content.find(id=section_id)
            if element:
                element.decompose()
        for section_class in ["reflist", "references", "navbox"]:
            for element in content.find_all(class_=section_class):
                element.decompose()
        # Extract the remaining text and split it into chunks
        text = content.get_text()
        docs = text_splitter.split_documents([Document(page_content=text)])
    upstage_embeddings = UpstageEmbeddings(model="solar-embedding-1-large")
    # Vectorize the chunks and insert them into the Oracle table, using
    # dot-product distance for similarity search; time the operation.
    s1time = time.time()
    knowledge_base = OracleVS.from_documents(
        docs,
        upstage_embeddings,
        client=conn23c,
        table_name="text_embeddings_Trump",
        distance_strategy=DistanceStrategy.DOT_PRODUCT,
    )
    s2time = time.time()
    print(f"Vectorizing and inserting chunks took {round(s2time - s1time, 1)} sec.")
    # from_documents already returns a ready-to-query OracleVS instance,
    # so it can back the retriever directly.
    retriever = knowledge_base.as_retriever()
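    # Note: both upload functions write the same module-level `retriever`,
    # so whichever ran last determines which corpus gets queried.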
def main():
    upload_db_harrypotter()


if __name__ == "__main__":
    main()