# File size: 2,290 bytes (revision 17bc83e)
import json
import os
import numpy as np
import faiss
from dotenv import load_dotenv
import openai
from tqdm import tqdm
# === Step 1: Load the API key from .env ===
# load_dotenv() runs before the lookup so that a key defined in a .env file
# is visible through the process environment.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    # Fail early: nothing below can work without a key.
    raise ValueError("OPENAI_API_KEY saknas i .env-filen!")
# Register the key on the openai module for the API calls made later on.
openai.api_key = api_key
# === Step 2: Load the knowledge base from JSONL ===
# One JSON object per line. Blank lines (for example an extra newline at the
# end of the file) are skipped instead of being passed to json.loads, which
# would raise on them.
with open("knowledge_base.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# texts[i] and metadata_list[i] always describe the same record; both lists
# are appended to together and never separately.
texts = []
metadata_list = []
for item in data:
    raw_text = item.get("text") if isinstance(item, dict) else None
    # Only non-empty strings can be embedded. Records whose "text" is missing,
    # null, or not a string are skipped rather than crashing on .strip().
    if not isinstance(raw_text, str):
        continue
    text_clean = raw_text.strip()
    if not text_clean:
        continue
    texts.append(text_clean)
    # Include the text in the metadata. An absent or null "metadata" field is
    # treated as an empty dict; the cleaned text always wins over any "text"
    # key already present in the metadata.
    metadata_item = {**(item.get("metadata") or {}), "text": text_clean}
    metadata_list.append(metadata_item)

print(f"🔢 Totalt antal texter: {len(texts)}")
# === Step 3: Create embeddings using the new syntax ===
def get_embeddings(texts, model="text-embedding-ada-002", batch_size=100):
    """Embed ``texts`` through the OpenAI embeddings endpoint, batch by batch.

    Args:
        texts: Sequence of strings to embed.
        model: Name of the embedding model to request.
        batch_size: Number of texts sent per API call; must be at least 1.

    Returns:
        A float32 NumPy array with one row per entry of ``texts``, in the
        same order. For empty ``texts`` an empty array of shape ``(0,)`` is
        returned without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If a batch cannot be embedded, or if the API returns a
            different number of vectors than texts were sent.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return
        # nothing; zero would fail later with a less helpful message.
        raise ValueError("batch_size måste vara minst 1")
    if len(texts) == 0:
        return np.array([], dtype=np.float32)

    all_embeddings = []
    batch_starts = range(0, len(texts), batch_size)
    for i in tqdm(batch_starts, desc="🔄 Skapar embeddings via OpenAI..."):
        batch = texts[i:i + batch_size]
        try:
            response = openai.embeddings.create(
                model=model,
                input=batch
            )
        except Exception as e:
            print(f"❌ Fel vid API-anrop för batch {i} - {i+batch_size}: {e}")
            # Skipping the batch would shift every later row relative to the
            # caller's texts, so row k would no longer belong to texts[k].
            # Abort instead of returning a misaligned matrix.
            raise RuntimeError(
                f"Kunde inte skapa embeddings för batch {i} - {i+batch_size}; "
                "avbryter så att embeddings och texter inte hamnar i otakt."
            ) from e
        # Use the .data attribute rather than subscripting the response.
        batch_embeddings = [item.embedding for item in response.data]
        if len(batch_embeddings) != len(batch):
            # Same alignment concern as above: one vector per text or nothing.
            raise RuntimeError(
                f"API:et returnerade {len(batch_embeddings)} embeddings "
                f"för {len(batch)} texter (batch {i})."
            )
        all_embeddings.extend(batch_embeddings)
    return np.array(all_embeddings, dtype=np.float32)
embeddings = get_embeddings(texts)
if len(embeddings) == 0:
    raise RuntimeError("Inga embeddings kunde skapas. Kontrollera API-nyckel och nätverksanslutning.")
# The index below stores vectors in insertion order and metadata_list is
# saved as a plain list, so the only link between a vector and its metadata
# is their shared position. If any text did not get an embedding, the
# positions no longer line up; stop before writing files that would pair
# search hits with the wrong metadata.
if len(embeddings) != len(metadata_list):
    raise RuntimeError(
        f"Antal embeddings ({len(embeddings)}) matchar inte antal metadata-poster "
        f"({len(metadata_list)}); index och metadata skulle hamna i otakt."
    )

# === Step 4: Build the FAISS index ===
# The vector dimension is taken from the embedding matrix itself.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# === Step 5: Save the index and metadata ===
faiss.write_index(index, "faiss.index")
with open("faiss_metadata.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    json.dump(metadata_list, f, ensure_ascii=False, indent=2)
print("✅ FAISS-index skapat och sparat som 'faiss.index'")
print("📄 Metadata sparad i 'faiss_metadata.json'")
# End of script.