|
""" |
|
File: module_ocr.py |
|
|
|
Description: Gradio module to interact with the tesseract OCR code. |
|
|
|
Author: Didier Guillevic |
|
Date: 2024-11-23 |
|
""" |
|
|
|
import gradio as gr |
|
import os |
|
import uuid |
|
import shutil |
|
import threading |
|
import time |
|
import pathlib |
|
|
|
import ocr |
|
import lang_codes |
|
|
|
|
|
|
|
# Directory (relative to the working directory) that receives the generated
# searchable PDFs; created on import if it does not exist yet.
output_dir = "tmp_results"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Maximum age, in seconds, of a generated PDF before the cleanup loop
# deletes it (one hour).
AGE_LIMIT = 60 * 60
|
|
|
|
|
def _remove_expired_pdfs(directory, age_limit, now):
    """Delete the ``.pdf`` entries of *directory* older than *age_limit* seconds.

    Age is measured as ``now`` minus the entry's modification time. A failure
    on one entry (already deleted, not removable, ...) is reported and
    skipped so that the remaining entries are still processed.
    """
    try:
        filenames = os.listdir(directory)
    except OSError as err:
        print(f"Could not list directory {directory}: {err}")
        return

    for filename in filenames:
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(directory, filename)
        try:
            file_age = now - os.path.getmtime(file_path)
            if file_age > age_limit:
                print(f"Removing old file: {file_path}")
                os.remove(file_path)
        except OSError as err:
            # The entry may have vanished between listing and removal, or it
            # may not be removable at all; keep going with the other entries.
            print(f"Could not remove {file_path}: {err}")


def cleanup_old_files():
    """Periodically purge expired PDF files from ``output_dir``. Never returns.

    Once per hour, every ``.pdf`` file in ``output_dir`` whose modification
    time is more than ``AGE_LIMIT`` seconds in the past is deleted.
    Filesystem errors are reported and swallowed: an uncaught exception would
    end the thread running this loop and silently stop all future cleanups.
    """
    while True:
        _remove_expired_pdfs(output_dir, AGE_LIMIT, time.time())
        time.sleep(3600)
|
|
|
|
|
# Run the cleanup loop in the background for the lifetime of the process.
# The thread is a daemon so that its endless loop never prevents the
# interpreter from exiting.
cleanup_thread = threading.Thread(
    target=cleanup_old_files,
    daemon=True,
)
cleanup_thread.start()
|
|
|
|
|
|
|
|
|
def process(
    input_file: str,
    src_langs: list[str],
    output_type: str
):
    """Run OCR on an uploaded image or PDF file.

    Args:
        input_file: Path of the file to process. A path-like object, or a
            file-like object exposing its path through a ``name`` attribute,
            is accepted as well. When empty or None, nothing is processed.
        src_langs: Language codes of the document; they are joined with '+'
            into the single language string handed to the OCR helpers.
        output_type: One of 'text', 'pdf' or 'text+pdf'. Any other value
            produces neither output.

    Returns:
        A tuple ``(output_text, output_pdf)``. ``output_text`` is the
        recognized text ('' when text was not requested); for a PDF input,
        the strings returned by the OCR helper are joined with blank lines.
        ``output_pdf`` is whatever the searchable-PDF helper returns
        (presumably the path of the generated file inside ``output_dir`` --
        verify against the ``ocr`` module), or None when no PDF was
        requested.
    """
    output_text = ''
    output_pdf = None

    # Nothing uploaded: return empty results instead of failing on a None path.
    if not input_file:
        return output_text, output_pdf

    # Work with one plain path string everywhere below, whatever form the
    # upload was handed over in.
    if isinstance(input_file, (str, os.PathLike)):
        input_path = os.fspath(input_file)
    else:
        input_path = input_file.name

    # Several languages are passed to the OCR helpers as e.g. 'eng+fra'.
    language = '+'.join(src_langs)

    # PDFs and images go through different OCR helpers.
    is_pdf = pathlib.Path(input_path).suffix.lower() == '.pdf'

    if output_type in ('text', 'text+pdf'):
        if is_pdf:
            texts = ocr.pdf_scanner.pdf_to_text(
                pdf_path=input_path,
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_path,
                language=language,
                psm=3
            )

    if output_type in ('pdf', 'text+pdf'):
        # A random UUID in the file name keeps results of identically named
        # uploads from overwriting each other.
        stem, _ = os.path.splitext(os.path.basename(input_path))
        output_path = os.path.join(
            output_dir, f"{stem}_OCR_{uuid.uuid4()}.pdf"
        )

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf(
                pdf_path=input_path,
                output_path=output_path,
                language=language,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_path,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf
|
|
|
|
|
|
|
|
|
with gr.Blocks() as demo:

    def update_visibility(file):
        """Return an update showing the component only when it holds a file."""
        return gr.update(visible=bool(file))

    def _clear_all():
        """Return the empty values for the upload, the text box and the PDF."""
        return None, '', None

    # Upload and download widgets on the left, recognized text on the right.
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload an image or a PDF file of a scanned document",
                height=160,
            )
            # Hidden until an OCR run has produced a PDF to download.
            output_file = gr.File(label="Download OCR'ed PDF", visible=False)
        with gr.Column():
            output_text = gr.Textbox(label="OCR output")

    # OCR options.
    with gr.Row():
        src_langs = gr.Dropdown(
            choices=lang_codes.tesseract_lang_codes.items(),
            value=['eng', 'fra'],
            multiselect=True,
            label='Language(s) of document',
            scale=4,
        )
        output_type = gr.Dropdown(
            choices=['text', 'pdf', 'text+pdf'],
            value='text+pdf',
            multiselect=False,
            label='Output type',
            scale=1,
        )

    # Action buttons.
    with gr.Row():
        ocr_btn = gr.Button("OCR", variant="primary")
        clear_btn = gr.Button(value="Clear", variant="secondary")

    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            examples=[
                ['./Non-text-searchable.pdf', ['eng', 'fra']],
                ['./sample_ID.jpeg', ['eng', 'fra']],
            ],
            inputs=[input_file, src_langs, output_type],
            outputs=[output_text, output_file],
            fn=process,
            cache_examples=False,
            label="Examples",
        )

    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - Model: using the tesseract package for OCR 1.0 (traditional)
        """)

    # Run the OCR, then reveal the download widget if a PDF was produced.
    ocr_event = ocr_btn.click(
        fn=process,
        inputs=[input_file, src_langs, output_type],
        outputs=[output_text, output_file],
    )
    ocr_event.then(
        update_visibility,
        inputs=output_file,
        outputs=output_file,
    )

    # Reset the upload and both outputs, then hide the now empty download
    # widget again.
    clear_event = clear_btn.click(
        fn=_clear_all,
        inputs=[],
        outputs=[input_file, output_text, output_file],
    )
    clear_event.then(
        update_visibility,
        inputs=output_file,
        outputs=output_file,
    )


if __name__ == "__main__":
    demo.launch()
|
|