# chatbot-QA / prepare.py
# vumichien's picture
# init
# 628d1d2
import pandas as pd
import json
import re
def remove_prefix(text, prefix_pattern):
    """
    Remove a leading prefix matching ``prefix_pattern`` from ``text``.

    Surrounding whitespace is stripped first, so a prefix preceded by stray
    spaces is still recognised. Only a match that begins at the very start of
    the stripped text is removed; occurrences of the pattern further inside
    the text are left untouched.

    Args:
        text: The string to clean.
        prefix_pattern: A regular expression (string or compiled pattern)
            describing the prefix.

    Returns:
        The text without the matched prefix and without leading/trailing
        whitespace. If the pattern does not match at the start, the stripped
        text is returned unchanged.
    """
    stripped = text.strip()
    # re.match anchors at position 0, which guarantees that only a genuine
    # prefix is removed even when the caller's pattern lacks a leading "^".
    match = re.match(prefix_pattern, stripped)
    if match is None:
        return stripped
    return stripped[match.end():].strip()
def _clean_cell(value, prefix_pattern):
    """Return a cell as prefix-free text, mapping missing cells to ""."""
    # Empty Excel cells arrive as NaN; str(NaN) would leak the literal text
    # "nan" into the output, so treat them as empty strings instead.
    if pd.isna(value):
        return ""
    return remove_prefix(str(value), prefix_pattern)


def main():
    """
    Convert the Q&A spreadsheet into a cleaned JSON file.

    Reads ``data/manabi.xlsx``, takes the question column ("質問") and the
    answer column ("回答"), removes the ``Q<number>.`` / ``A.`` prefixes from
    the cell text, and writes the result to ``data/qa_data.json`` as a list of
    ``{"question": ..., "answer": ...}`` objects.

    Missing cells become empty strings, and rows whose question and answer
    are both empty after cleaning are skipped. Every failure (missing file,
    unreadable workbook, missing columns, write error) is reported on stdout
    and the function returns without raising.
    """
    question_col = "質問"
    answer_col = "回答"

    # Read the Excel file; keep the try body limited to the call that can fail.
    try:
        df = pd.read_excel("data/manabi.xlsx")
    except FileNotFoundError:
        print("The file 'data/manabi.xlsx' was not found. Please check the file path.")
        return
    except Exception as e:
        print(f"An error occurred while reading the Excel file: {e}")
        return
    print("Excel file read successfully.")

    # Check if the necessary columns exist
    if question_col not in df.columns or answer_col not in df.columns:
        print(f"The Excel file must contain '{question_col}' and '{answer_col}' columns.")
        return

    # Build the list of cleaned question/answer pairs
    qa_list = []
    for raw_question, raw_answer in zip(df[question_col], df[answer_col]):
        question = _clean_cell(raw_question, r"^Q\d+\.\s*")
        answer = _clean_cell(raw_answer, r"^A\.\s*")
        # A row with neither a question nor an answer carries no information.
        if not question and not answer:
            continue
        qa_list.append({"question": question, "answer": answer})

    # Save the list to a JSON file
    try:
        with open("data/qa_data.json", "w", encoding="utf-8") as json_file:
            # ensure_ascii=False keeps the Japanese text human-readable.
            json.dump(qa_list, json_file, ensure_ascii=False, indent=2)
        print("Data has been successfully saved to 'data/qa_data.json'.")
    except Exception as e:
        print(f"An error occurred while writing to JSON file: {e}")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()