Spaces:
Runtime error
Runtime error
import os | |
import re | |
import glob | |
# Marks the end of an assistant answer: either a question label such as
# "Q:", "Q2:", "Q3.", "Q4)" or "Question 5:", or the literal "User:".
# A bare capital "Q" inside ordinary words ("Queen", "Quiet") is NOT a marker.
_NEXT_TURN_RE = re.compile(r"\bQ(?:uestion)?\s*(?:\d+\s*[:.)]|:)|User:")


def extract_assistant_answers(input_file):
    """Extract the text that follows each 'Assistant:' marker in a file.

    The file is read as UTF-8 and split on the literal "Assistant:".
    Everything before the first marker is discarded. Each remaining section
    is cut at the next question label or "User:" marker (whichever comes
    first), stripped of surrounding whitespace, and kept if non-empty.

    Args:
        input_file: Path of the text file to read.

    Returns:
        A list of answer strings, in the order they appear in the file.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    answers = []
    # sections[0] is whatever precedes the first "Assistant:", so skip it.
    for section in content.split("Assistant:")[1:]:
        marker = _NEXT_TURN_RE.search(section)
        # Cut at a real turn marker only; splitting on any "Q" would
        # truncate answers that merely contain a capital Q.
        if marker is not None:
            section = section[:marker.start()]
        answer = section.strip()
        if answer:
            answers.append(answer)
    return answers
def process_all_files(input_dir="qa_outputs"):
    """Extract assistant answers from every image_*.txt file in a directory.

    For each matching input file, the answers returned by
    extract_assistant_answers() are written one per line to a sibling file
    named '<base>_extr.txt', and a progress line is printed.

    Args:
        input_dir: Directory to scan. Defaults to "qa_outputs".
    """
    output_suffix = "_extr.txt"

    # The glob pattern also matches this function's own outputs
    # (image_1_extr.txt), so filter them out; otherwise a second run would
    # produce image_1_extr_extr.txt and so on. Sorting makes the processing
    # order deterministic.
    input_files = sorted(
        path
        for path in glob.glob(f"{input_dir}/image_*.txt")
        if not path.endswith(output_suffix)
    )

    for input_file in input_files:
        # Strip the ".txt" extension to build the output name.
        base_name = os.path.splitext(input_file)[0]
        output_file = f"{base_name}_extr.txt"

        answers = extract_assistant_answers(input_file)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{answer}\n" for answer in answers)

        print(f"Processed {input_file} -> {output_file}")
# Script entry point: process every input file, then report completion.
if __name__ == "__main__":
    process_all_files()
    print("Extraction complete! Check the files with '_extr' suffix.")