"""Convert the question/answer sheet in an Excel workbook to a JSON file.

Reads ``data/manabi.xlsx``, strips the ``Q<number>.`` / ``A.`` prefixes from
the ``質問`` (question) and ``回答`` (answer) columns, and writes the result
to ``data/qa_data.json`` as a list of ``{"question": ..., "answer": ...}``
objects.
"""

import json
import re

import pandas as pd

# Input workbook and output JSON locations, relative to the working directory.
EXCEL_PATH = "data/manabi.xlsx"
JSON_PATH = "data/qa_data.json"

# Column headers the workbook must provide.
QUESTION_COLUMN = "質問"
ANSWER_COLUMN = "回答"

# Prefixes stripped from each cell; compiled once instead of once per row.
QUESTION_PREFIX = re.compile(r"^Q\d+\.\s*")
ANSWER_PREFIX = re.compile(r"^A\.\s*")


def remove_prefix(text, prefix_pattern):
    """Remove the text matched by ``prefix_pattern`` and trim whitespace.

    Args:
        text: The string to clean.
        prefix_pattern: A regex (string or compiled pattern). Every match is
            removed by ``re.sub``, so anchor the pattern with ``^`` to touch
            only a leading prefix.

    Returns:
        ``text`` without the matched portion, with leading and trailing
        whitespace stripped.
    """
    return re.sub(prefix_pattern, "", text).strip()


def _cell_to_text(value):
    """Return a cell value as text, mapping missing cells to an empty string."""
    # pandas represents empty Excel cells as NaN/None; a bare str() call would
    # turn them into the literal text "nan"/"None" in the output.
    if pd.isna(value):
        return ""
    return str(value)


def build_qa_list(rows):
    """Turn raw (question, answer) cell pairs into cleaned Q/A dictionaries.

    Args:
        rows: An iterable of ``(question, answer)`` pairs holding raw cell
            values; missing cells (NaN/None) are treated as empty strings.

    Returns:
        A list of ``{"question": str, "answer": str}`` dictionaries, in input
        order, with the ``Q<number>.`` and ``A.`` prefixes removed.
    """
    return [
        {
            "question": remove_prefix(_cell_to_text(raw_question), QUESTION_PREFIX),
            "answer": remove_prefix(_cell_to_text(raw_answer), ANSWER_PREFIX),
        }
        for raw_question, raw_answer in rows
    ]


def main():
    """Read the workbook, clean every row, and write the JSON output.

    Problems are reported on stdout and end the run early; no exception is
    propagated to the caller.
    """
    # Read the Excel file
    try:
        df = pd.read_excel(EXCEL_PATH)
    except FileNotFoundError:
        print(f"The file '{EXCEL_PATH}' was not found. Please check the file path.")
        return
    except Exception as e:  # top-level boundary: report any parser/engine error
        print(f"An error occurred while reading the Excel file: {e}")
        return
    print("Excel file read successfully.")

    # Check if the necessary columns exist
    if QUESTION_COLUMN not in df.columns or ANSWER_COLUMN not in df.columns:
        print("The Excel file must contain '質問' and '回答' columns.")
        return

    # Walk the two columns in lockstep; the row index is not needed.
    qa_list = build_qa_list(zip(df[QUESTION_COLUMN], df[ANSWER_COLUMN]))

    # Save the list to a JSON file
    try:
        with open(JSON_PATH, "w", encoding="utf-8") as json_file:
            # ensure_ascii=False keeps the Japanese text readable in the file.
            json.dump(qa_list, json_file, ensure_ascii=False, indent=2)
        print(f"Data has been successfully saved to '{JSON_PATH}'.")
    except Exception as e:  # top-level boundary: report instead of crashing
        print(f"An error occurred while writing to JSON file: {e}")


if __name__ == "__main__":
    main()