"""
File: module_ocr.py
Description: Gradio module to interact with the tesseract OCR code.
Author: Didier Guillevic
Date: 2024-11-23
"""

import os
import pathlib
import shutil
import threading
import time
import uuid

import gradio as gr

import ocr
import lang_codes

# Directory to save the (temporary) OCR'ed PDF files (whose path is returned
# to the user).
output_dir = "tmp_results"
os.makedirs(output_dir, exist_ok=True)

# Age limit for generated files (in seconds, 1 hour = 3600 seconds).
AGE_LIMIT = 3600

# Delay between two cleanup sweeps (in seconds). A file can therefore live
# for up to AGE_LIMIT + CLEANUP_INTERVAL seconds before being removed.
CLEANUP_INTERVAL = 3600


def _remove_expired_pdfs():
    """Run one sweep: delete '.pdf' files in `output_dir` older than AGE_LIMIT.

    File-system errors are reported and skipped so that a single failure
    (e.g. a file that vanished between listing and removal) never propagates
    to the caller.
    """
    current_time = time.time()
    try:
        filenames = os.listdir(output_dir)
    except OSError as err:
        print(f"Could not list directory {output_dir}: {err}")
        return

    for filename in filenames:
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(output_dir, filename)
        try:
            # Check if the file is older than the age limit
            file_age = current_time - os.path.getmtime(file_path)
            if file_age > AGE_LIMIT:
                print(f"Removing old file: {file_path}")
                os.remove(file_path)
        except OSError as err:
            print(f"Could not remove file {file_path}: {err}")


def cleanup_old_files():
    """Periodically remove old PDF files from `output_dir`; never returns.

    Meant to be the target of a daemon thread: it loops forever, running one
    sweep and then sleeping for CLEANUP_INTERVAL seconds.
    """
    while True:
        _remove_expired_pdfs()
        # Sleep before checking again
        time.sleep(CLEANUP_INTERVAL)


# Start the cleanup thread (daemon: it does not block interpreter exit)
cleanup_thread = threading.Thread(target=cleanup_old_files, daemon=True)
cleanup_thread.start()


#
# Process one file
#
def process(
        input_file: str,
        src_langs: list[str],  # list of ISO 639-3 language codes
        output_type: str
    ):
    """Process the given file with OCR using the given languages.

    Args:
        input_file: path of the image or PDF file to process. A falsy value
            (no file uploaded) yields the default empty result.
        src_langs: language codes of the document; they are joined with '+'
            into the form expected by tesseract, e.g. 'eng+fra'.
        output_type: one of 'text', 'pdf' or 'text+pdf'.

    Returns:
        A pair (output_text, output_pdf). `output_text` is the recognized
        text ('' when text output was not requested). `output_pdf` is the
        value returned by the `ocr.pdf_scanner` searchable-PDF helper
        (presumably the path of the generated PDF -- confirm against that
        module), or None when PDF output was not requested.
    """
    # default result
    output_text = ''
    output_pdf = None

    # Nothing to do when no file was provided (e.g. "OCR" clicked too early)
    if not input_file:
        return output_text, output_pdf

    # format language as expected by tesseract package, e.g. 'eng+fra'
    language = '+'.join(src_langs)

    # PDF file or image file?
    input_file_suffix = pathlib.Path(input_file).suffix.lower()
    is_pdf = input_file_suffix == '.pdf'

    # output text?
    if output_type in ['text', 'text+pdf']:
        if is_pdf:
            texts = ocr.pdf_scanner.pdf_to_text(  # one text per page
                pdf_path=input_file,
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_file,
                language=language,
                psm=3
            )

    # output pdf?
    if output_type in ['pdf', 'text+pdf']:
        # Create a path for output PDF file; the UUID keeps concurrent
        # requests on identically named uploads from colliding.
        base_filename = os.path.basename(input_file)
        base_filename, _ = os.path.splitext(base_filename)
        output_path = f"{base_filename}_OCR_{uuid.uuid4()}.pdf"
        output_path = os.path.join(output_dir, output_path)

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf(
                pdf_path=input_file,
                output_path=output_path,
                language=language,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_file,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf


#
# User interface
#
with gr.Blocks() as demo:

    def update_visibility(file):
        """Show the component when `file` is set, hide it otherwise."""
        return gr.update(visible=bool(file))

    # Upload file to process
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload an image or a PDF file of a scanned document",
                height=160
            )
            output_file = gr.File(
                label="Download OCR'ed PDF",
                visible=False  # Initially not visible
            )
        with gr.Column():
            output_text = gr.Textbox(label="OCR output")

    # Input: language(s) used in document, output types
    with gr.Row():
        src_langs = gr.Dropdown(
            label='Language(s) of document',
            choices=lang_codes.tesseract_lang_codes.items(),
            multiselect=True,
            value=['eng', 'fra'],
            scale=4
        )
        output_type = gr.Dropdown(
            label='Output type',
            choices=['text', 'pdf', 'text+pdf'],
            multiselect=False,
            value='text+pdf',
            scale=1
        )

    # Buttons
    with gr.Row():
        ocr_btn = gr.Button(value="OCR", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")

    # Examples (each example provides only the file and the languages)
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            [
                ['./Non-text-searchable.pdf', ['eng','fra']],
                ['./sample_ID.jpeg', ['eng','fra']],
            ],
            inputs=[input_file, src_langs, output_type],
            outputs=[output_text, output_file],
            fn=process,
            cache_examples=False,
            label="Examples"
        )

    # Documentation
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - Model: using the tesseract package for OCR 1.0 (traditional)
        """)

    # Functions
    # Run the OCR, then reveal the download component only if a PDF exists.
    ocr_btn.click(
        fn=process,
        inputs=[input_file, src_langs, output_type],
        outputs=[output_text, output_file]
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )

    # Reset the file input and both outputs, then hide the download component.
    clear_btn.click(
        fn=lambda : (None, '', None),
        inputs=[],
        outputs=[input_file, output_text, output_file]  # input_file, output_text, output_file
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )


if __name__ == '__main__':
    demo.launch()