SergeyO7 committed on
Commit
cd75fd1
·
verified ·
1 Parent(s): 63219e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -68
app.py CHANGED
@@ -1,71 +1,71 @@
1
- import asyncio
2
- from llama_index.core import Document
3
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
4
- from llama_index.core.node_parser import SentenceSplitter
5
- from llama_index.core.ingestion import IngestionPipeline
6
- from llama_index.core import SimpleDirectoryReader
7
-
8
- reader = SimpleDirectoryReader(input_dir=r"C:\Users\so7\AppData\Local\Programs\Python\Python313\RAG")
9
- documents = reader.load_data()
10
-
11
- # create the pipeline with transformations
12
- pipeline = IngestionPipeline(
13
- transformations=[
14
- SentenceSplitter(chunk_overlap=0),
15
- HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
16
- ]
17
- )
18
-
19
- # Define an async function to handle the pipeline
20
- async def main():
21
- # Create the pipeline with transformations
22
- pipeline = IngestionPipeline(
23
- transformations=[
24
- SentenceSplitter(chunk_overlap=0),
25
- HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
26
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  )
28
- # Use await inside the async function
29
- nodes =
30
- await pipeline.arun(documents=[Document.example()])
31
- # Optional: Do something with the nodes (e.g., print them)
32
- print(nodes)
33
 
34
- # Run the async function using asyncio
35
  if __name__ == "__main__":
36
- asyncio.run(main())
37
-
38
- import chromadb
39
- from llama_index.vector_stores.chroma import ChromaVectorStore
40
- from llama_index.core.ingestion import IngestionPipeline
41
- from llama_index.core.node_parser import SentenceSplitter
42
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
43
-
44
- db = chromadb.PersistentClient(path="./pl_db")
45
- chroma_collection = db.get_or_create_collection("ppgpl")
46
- vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
47
-
48
- pipeline = IngestionPipeline(
49
- transformations=[
50
- SentenceSplitter(chunk_size=25, chunk_overlap=0),
51
- HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
52
- ],
53
- vector_store=vector_store,
54
- )
55
-
56
-
57
- from llama_index.core import VectorStoreIndex
58
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
59
-
60
- embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
61
- index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
62
-
63
- from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
64
-
65
- llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")
66
- query_engine = index.as_query_engine(
67
- llm=llm,
68
- response_mode="tree_summarize",
69
- )
70
- query_engine.query("Солнце на третей ступени")
71
- # The meaning of life is 42
 
1
+ # from langchain.document_loaders import DirectoryLoader
2
+ from langchain_community.document_loaders import DirectoryLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.schema import Document
5
+ # from langchain.embeddings import OpenAIEmbeddings
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain_community.vectorstores import Chroma
8
+ import openai
9
+ from dotenv import load_dotenv
10
+ import os
11
+ import shutil
12
+
13
+ # Load environment variables. Assumes the project contains a .env file with the API keys.
14
+ load_dotenv()
15
+ #---- Set OpenAI API key
16
+ # If your .env file stores the key under a different name, replace
17
+ # "OPENAI_API_KEY" below with that name.
18
+ openai.api_key = os.environ['OPENAI_API_KEY']
19
+
20
+ CHROMA_PATH = "chroma"
21
+ DATA_PATH = "RAG"
22
+
23
+
24
+ def main():
25
+ generate_data_store()
26
+
27
+
28
+ def generate_data_store():
29
+ documents = load_documents()
30
+ chunks = split_text(documents)
31
+ save_to_chroma(chunks)
32
+
33
+
34
+ def load_documents():
35
+ loader = DirectoryLoader(DATA_PATH, glob="*.md")
36
+ documents = loader.load()
37
+ return documents
38
+
39
+
40
+ def split_text(documents: list[Document]):
41
+ text_splitter = RecursiveCharacterTextSplitter(
42
+ chunk_size=300,
43
+ chunk_overlap=100,
44
+ length_function=len,
45
+ add_start_index=True,
46
+ )
47
+ chunks = text_splitter.split_documents(documents)
48
+ print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
49
+
50
+ document = chunks[10]
51
+ print(document.page_content)
52
+ print(document.metadata)
53
+
54
+ return chunks
55
+
56
+
57
+ def save_to_chroma(chunks: list[Document]):
58
+ # Clear out the database first.
59
+ if os.path.exists(CHROMA_PATH):
60
+ shutil.rmtree(CHROMA_PATH)
61
+
62
+ # Create a new DB from the documents.
63
+ db = Chroma.from_documents(
64
+ chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
65
  )
66
+ db.persist()
67
+ print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
68
+
 
 
69
 
 
70
  if __name__ == "__main__":
71
+ main()