Langchained_PGPS_RAG

Sleeping

App Files Files Community

SergeyO7 committed on Mar 25

Commit

cd75fd1

verified ·

1 Parent(s): 63219e4

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -68

app.py CHANGED Viewed

@@ -1,71 +1,71 @@
-import asyncio
-from llama_index.core import Document
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-from llama_index.core.node_parser import SentenceSplitter
-from llama_index.core.ingestion import IngestionPipeline
-from llama_index.core import SimpleDirectoryReader
-reader = SimpleDirectoryReader(input_dir=r"C:\Users\so7\AppData\Local\Programs\Python\Python313\RAG")
-documents = reader.load_data()
-# create the pipeline with transformations
-pipeline = IngestionPipeline(
-    transformations=[
-        SentenceSplitter(chunk_overlap=0),
-        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
-    ]
-)
-# Define an async function to handle the pipeline
-async def main():
-    # Create the pipeline with transformations
-    pipeline = IngestionPipeline(
-        transformations=[
-            SentenceSplitter(chunk_overlap=0),
-            HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
-        ]
     )
-    # Use await inside the async function
-    nodes =
- await pipeline.arun(documents=[Document.example()])
-    # Optional: Do something with the nodes (e.g., print them)
-    print(nodes)
-# Run the async function using asyncio
 if __name__ == "__main__":
-    asyncio.run(main())
-import chromadb
-from llama_index.vector_stores.chroma import ChromaVectorStore
-from llama_index.core.ingestion import IngestionPipeline
-from llama_index.core.node_parser import SentenceSplitter
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-db = chromadb.PersistentClient(path="./pl_db")
-chroma_collection = db.get_or_create_collection("ppgpl")
-vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
-pipeline = IngestionPipeline(
-    transformations=[
-        SentenceSplitter(chunk_size=25, chunk_overlap=0),
-        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
-    ],
-    vector_store=vector_store,
-)
-from llama_index.core import VectorStoreIndex
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
-index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
-from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
-llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")
-query_engine = index.as_query_engine(
-    llm=llm,
-    response_mode="tree_summarize",
-)
-query_engine.query("Солнце на третей ступени")
-# The meaning of life is 42

+# from langchain.document_loaders import DirectoryLoader
+from langchain_community.document_loaders import DirectoryLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
+# from langchain.embeddings import OpenAIEmbeddings
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import Chroma
+import openai
+from dotenv import load_dotenv
+import os
+import shutil
+# Load environment variables. Assumes that the project contains a .env file
+# holding the API keys.
+load_dotenv()
+# ---- Set OpenAI API key
+# If your .env file stores the key under a different name, change
+# "OPENAI_API_KEY" below to match it. The subscript lookup raises KeyError
+# when the variable is not set.
+openai.api_key = os.environ['OPENAI_API_KEY']
+# Directory used as the Chroma persist directory; save_to_chroma() deletes it
+# (if it exists) and recreates it on every run.
+CHROMA_PATH = "chroma"
+# Directory that load_documents() scans for "*.md" source files.
+DATA_PATH = "RAG"
+def main():
+    generate_data_store()
+def generate_data_store():
+    documents = load_documents()
+    chunks = split_text(documents)
+    save_to_chroma(chunks)
+def load_documents():
+    loader = DirectoryLoader(DATA_PATH, glob="*.md")
+    documents = loader.load()
+    return documents
+def split_text(documents: list[Document]):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=300,
+        chunk_overlap=100,
+        length_function=len,
+        add_start_index=True,
+    )
+    chunks = text_splitter.split_documents(documents)
+    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
+    document = chunks[10]
+    print(document.page_content)
+    print(document.metadata)
+    return chunks
+def save_to_chroma(chunks: list[Document]):
+    # Clear out the database first.
+    if os.path.exists(CHROMA_PATH):
+        shutil.rmtree(CHROMA_PATH)
+    # Create a new DB from the documents.
+    db = Chroma.from_documents(
+        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
     )
+    db.persist()
+    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
+# Run the ingestion pipeline only when this file is executed as a script.
 if __name__ == "__main__":
+    main()