|
import json |
|
import os |
|
import numpy as np |
|
import faiss |
|
from dotenv import load_dotenv |
|
import openai |
|
from tqdm import tqdm |
|
|
|
|
|
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Read the OpenAI key; both a missing variable and an empty value are rejected.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY saknas i .env-filen!")

# Configure the module-level OpenAI client with the key.
openai.api_key = api_key
|
|
|
|
|
# Load the knowledge base: one JSON object per line (JSONL).
with open("knowledge_base.jsonl", "r", encoding="utf-8") as f:
    # Skip blank lines so stray empty lines do not raise JSONDecodeError.
    data = [json.loads(line) for line in f if line.strip()]

# texts[i] and metadata_list[i] always describe the same knowledge-base entry.
texts = []
metadata_list = []

for item in data:
    # Only JSON objects carrying a non-empty string under "text" are usable;
    # anything else (missing key, null, number, non-object line) is skipped.
    raw_text = item.get("text") if isinstance(item, dict) else None
    if not isinstance(raw_text, str):
        continue
    text_clean = raw_text.strip()
    if not text_clean:
        continue

    texts.append(text_clean)

    # `or {}` also covers an explicit `"metadata": null` in the input.
    # The cleaned text is stored alongside the entry's own metadata.
    metadata_item = {**(item.get("metadata") or {}), "text": text_clean}
    metadata_list.append(metadata_item)

print(f"🔢 Totalt antal texter: {len(texts)}")
|
|
|
|
|
def get_embeddings(texts, model="text-embedding-ada-002", batch_size=100):
    """Embed ``texts`` through the OpenAI embeddings endpoint, batch by batch.

    Args:
        texts: Sequence of strings to embed.
        model: Name of the OpenAI embedding model to request.
        batch_size: Maximum number of texts sent in a single API request.

    Returns:
        A ``float32`` NumPy array with one row per input text, in input
        order. For an empty ``texts`` the array is empty (shape ``(0,)``).

    Raises:
        RuntimeError: If the request for any batch fails. Skipping a failed
            batch would shift every later row, so the result could no longer
            be matched to ``texts`` by position; failing is the safe option.
    """
    all_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="🔄 Skapar embeddings via OpenAI..."):
        batch = texts[start:start + batch_size]
        # Real upper bound of this batch (the last batch may be shorter).
        end = start + len(batch)
        try:
            response = openai.embeddings.create(
                model=model,
                input=batch,
            )
        except Exception as e:
            raise RuntimeError(
                f"❌ Fel vid API-anrop för batch {start} - {end}: {e}"
            ) from e

        # Each returned embedding carries the index of its input within the
        # batch; order by it so rows are guaranteed to follow input order.
        ordered = sorted(response.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered)
    return np.array(all_embeddings, dtype=np.float32)
|
|
|
# Embed every collected text.
embeddings = get_embeddings(texts)

if len(embeddings) == 0:
    raise RuntimeError("Inga embeddings kunde skapas. Kontrollera API-nyckel och nätverksanslutning.")

# IndexFlatL2 assigns sequential ids in insertion order, so the saved metadata
# list only lines up with the index when there is exactly one vector per
# metadata entry. Refuse to write a mismatched pair of files.
if len(embeddings) != len(metadata_list):
    raise RuntimeError(
        f"Antalet embeddings ({len(embeddings)}) matchar inte antalet "
        f"metadata-poster ({len(metadata_list)}); index och metadata skulle hamna ur synk."
    )

# Build an exact (brute-force) L2 index sized to the embedding dimension.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# Persist the index and, next to it, the metadata in the same row order.
faiss.write_index(index, "faiss.index")

with open("faiss_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata_list, f, ensure_ascii=False, indent=2)

print("✅ FAISS-index skapat och sparat som 'faiss.index'")
print("📄 Metadata sparad i 'faiss_metadata.json'")
|
|