|
import pandas as pd |
|
import json |
|
from bs4 import BeautifulSoup |
|
|
|
|
|
# Load the article spreadsheet into a DataFrame.
df = pd.read_excel("artiklar_rensad_sammanfattad.xlsx")

# Columns to discard before the records are built. With errors="ignore",
# any of these that are absent from the sheet are skipped instead of raising.
columns_to_drop = [
    "(Ändra inte) Kunskapsbasartikel",
    "(Ändra inte) Kontrollsumma för rad",
]
df = df.drop(columns=columns_to_drop, errors="ignore")
|
|
|
|
|
def _cell_text(value):
    """Return a spreadsheet cell as stripped text; missing cells become ""."""
    # Empty cells are read as NaN, and str(NaN) is the literal "nan", so the
    # missing-value check has to happen before the string conversion.
    if pd.isna(value):
        return ""
    return str(value).strip()


def _html_to_text(markup):
    """Remove HTML tags from *markup*, separating text nodes with newlines."""
    if not markup:
        return ""
    return BeautifulSoup(markup, "html.parser").get_text(separator="\n").strip()


# One record per article that has any text: the plain-text body under "text",
# and the article number, title and keywords under "metadata".
records = []

for _, row in df.iterrows():
    article_id = _cell_text(row.get("Offentligt artikelnummer", ""))
    title = _cell_text(row.get("Rubrik", ""))
    keywords = _cell_text(row.get("Nyckelord", ""))

    # Both text columns are run through the HTML parser to strip markup.
    content = _html_to_text(_cell_text(row.get("Innehåll", "")))
    description = _html_to_text(_cell_text(row.get("Beskrivning", "")))

    # Content first, then the description, separated by a blank line;
    # empty parts are left out so no stray separator remains.
    full_text = "\n\n".join(
        part for part in (content, description) if part
    ).strip()

    # Articles with no text at all are not exported.
    if not full_text:
        continue

    records.append(
        {
            "text": full_text,
            "metadata": {
                "id": article_id,
                "rubrik": title,
                "nyckelord": keywords,
            },
        }
    )
|
|
|
|
|
# Write the records as JSON Lines: one JSON object per line, with non-ASCII
# characters written as-is (ensure_ascii=False) in a UTF-8 file.
with open("knowledge_base.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n" for entry in records
    )

# Report how many articles were written.
print(f"✅ Klar! Sparade {len(records)} artiklar i knowledge_base.jsonl")
|
|