File size: 2,290 Bytes
17bc83e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import json
import os
import numpy as np
import faiss
from dotenv import load_dotenv
import openai
from tqdm import tqdm

# === Step 1: Load the API key from .env ===
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY saknas i .env-filen!")
openai.api_key = api_key

# === Step 2: Load the knowledge base from JSONL ===
# Blank / whitespace-only lines are skipped: json.loads would raise on them
# and abort the whole run even though they carry no record.
with open("knowledge_base.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# texts[i] and metadata_list[i] always describe the same record; both lists
# are appended to together so they stay aligned.
texts = []
metadata_list = []

for item in data:
    # Only keep records that have a non-empty "text" field.
    if "text" in item and item["text"].strip():
        text_clean = item["text"].strip()
        texts.append(text_clean)
        # Store the text alongside the record's metadata. `or {}` also covers
        # an explicit `"metadata": null`, which .get()'s default does not.
        metadata_item = {**(item.get("metadata") or {}), "text": text_clean}
        metadata_list.append(metadata_item)

print(f"🔢 Totalt antal texter: {len(texts)}")

# === Steg 3: Skapa embeddings med ny syntax ===
def get_embeddings(texts, model="text-embedding-ada-002", batch_size=100):
    """Embed ``texts`` with the OpenAI embeddings API, one batch at a time.

    Args:
        texts: Sequence of strings to embed.
        model: Name of the OpenAI embedding model to request.
        batch_size: Maximum number of texts sent per API call; must be >= 1.

    Returns:
        A ``float32`` NumPy array with one embedding per input text, in the
        same order as ``texts``. For empty ``texts`` the array is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If an API call fails, or returns a different number of
            embeddings than texts were sent. Skipping the batch instead would
            shift every later row, so row ``i`` of the result would no longer
            belong to ``texts[i]``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    all_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="🔄 Skapar embeddings via OpenAI..."):
        batch = texts[start:start + batch_size]
        try:
            response = openai.embeddings.create(
                model=model,
                input=batch
            )
        except Exception as e:
            # Fail loudly: the caller only receives a bare array and has no
            # way to tell which rows would be missing.
            raise RuntimeError(
                f"Fel vid API-anrop för batch {start} - {start + len(batch)}: {e}"
            ) from e

        if len(response.data) != len(batch):
            raise RuntimeError(
                f"Batch {start} - {start + len(batch)}: "
                f"{len(response.data)} embeddings för {len(batch)} texter"
            )

        # Each returned item carries the position of its input in `index`;
        # sort on it so the rows are guaranteed to follow the input order.
        ordered = sorted(response.data, key=lambda entry: entry.index)
        all_embeddings.extend(entry.embedding for entry in ordered)
    return np.array(all_embeddings, dtype=np.float32)

embeddings = get_embeddings(texts)

if len(embeddings) == 0:
    raise RuntimeError("Inga embeddings kunde skapas. Kontrollera API-nyckel och nätverksanslutning.")

# FAISS assigns row ids sequentially, and row i is later looked up as
# metadata_list[i]. If any text did not get an embedding the two files would
# be out of step, so refuse to write a partial index.
if len(embeddings) != len(texts):
    raise RuntimeError(
        f"Antalet embeddings ({len(embeddings)}) matchar inte antalet texter "
        f"({len(texts)}); index och metadata skulle hamna ur synk."
    )

# === Step 4: Build the FAISS index ===
# Flat (exact, brute-force) L2 index sized to the embedding dimensionality.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# === Step 5: Save the index and the metadata ===
faiss.write_index(index, "faiss.index")

# ensure_ascii=False keeps non-ASCII characters readable in the JSON file.
with open("faiss_metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata_list, f, ensure_ascii=False, indent=2)

print("✅ FAISS-index skapat och sparat som 'faiss.index'")
print("📄 Metadata sparad i 'faiss_metadata.json'")