"""Build a FAISS index from a JSONL knowledge base using OpenAI embeddings.

Reads ``knowledge_base.jsonl``, embeds every non-empty ``"text"`` field via
the OpenAI embeddings API, stores the vectors in a flat L2 FAISS index, and
writes the index plus index-aligned metadata to disk.
"""

import json
import os

import faiss
import numpy as np
import openai
from dotenv import load_dotenv
from tqdm import tqdm

# === Step 1: Load the API key from .env ===
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY saknas i .env-filen!")
openai.api_key = api_key

# === Step 2: Load the knowledge base from JSONL ===
with open("knowledge_base.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

texts = []
metadata_list = []
for item in data:
    # Skip records with a missing or whitespace-only "text" field.
    if "text" in item and item["text"].strip():
        text_clean = item["text"].strip()
        texts.append(text_clean)
        # Include the text itself in the metadata so search results are
        # self-contained (no second lookup needed to show the passage).
        metadata_item = {**item.get("metadata", {}), "text": text_clean}
        metadata_list.append(metadata_item)

print(f"🔢 Totalt antal texter: {len(texts)}")


# === Step 3: Create embeddings (new client syntax) ===
def get_embeddings(texts, model="text-embedding-ada-002", batch_size=100,
                   return_indices=False):
    """Embed *texts* in batches via the OpenAI embeddings API.

    Args:
        texts: Sequence of strings to embed.
        model: OpenAI embedding model name.
        batch_size: Number of texts sent per API call.
        return_indices: When True, also return the list of indices into
            *texts* that were successfully embedded. A failed batch is
            skipped (best-effort), so without these indices the caller
            cannot tell which texts the returned rows correspond to —
            the original code silently desynchronised FAISS vector ids
            from the metadata list whenever a batch failed.

    Returns:
        A float32 ndarray of shape (n_ok, dim); or, when *return_indices*
        is True, a tuple ``(embeddings, kept_indices)``.
    """
    all_embeddings = []
    kept_indices = []
    for i in tqdm(range(0, len(texts), batch_size),
                  desc="🔄 Skapar embeddings via OpenAI..."):
        batch = texts[i:i + batch_size]
        try:
            response = openai.embeddings.create(model=model, input=batch)
        except Exception as e:
            # Best-effort: skip the failed batch but keep going; the
            # kept_indices bookkeeping lets callers prune metadata to match.
            print(f"❌ Fel vid API-anrop för batch {i} - {i+batch_size}: {e}")
            continue
        # Use the .data attribute instead of subscripting the response
        # (required by the v1 OpenAI client).
        all_embeddings.extend(item.embedding for item in response.data)
        kept_indices.extend(range(i, i + len(batch)))
    result = np.array(all_embeddings, dtype=np.float32)
    if return_indices:
        return result, kept_indices
    return result


embeddings, kept_indices = get_embeddings(texts, return_indices=True)
if len(embeddings) == 0:
    raise RuntimeError(
        "Inga embeddings kunde skapas. "
        "Kontrollera API-nyckel och nätverksanslutning."
    )

# Keep only the metadata rows whose text actually got embedded, so that the
# i-th FAISS vector always corresponds to metadata_list[i].
metadata_list = [metadata_list[i] for i in kept_indices]

# === Step 4: Build the FAISS index ===
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# === Step 5: Save the index and metadata ===
faiss.write_index(index, "faiss.index")
with open("faiss_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata_list, f, ensure_ascii=False, indent=2)

print("✅ FAISS-index skapat och sparat som 'faiss.index'")
print("📄 Metadata sparad i 'faiss_metadata.json'")