Didier's picture
Update module_ocr.py
5b41ca2 verified
"""
File: module_ocr.py
Description: Gradio module to interact with the tesseract OCR code.
Author: Didier Guillevic
Date: 2024-11-23
"""
import gradio as gr
import os
import uuid
import shutil
import threading
import time
import pathlib
import ocr
import lang_codes
# Folder holding the temporary OCR'ed PDF files; their paths are handed back
# to the user for download.
output_dir = "tmp_results"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Maximum age, in seconds, a generated file may reach before it becomes
# eligible for removal (60 * 60 seconds = 1 hour).
AGE_LIMIT = 60 * 60
# Function to clean up old PDF files
def cleanup_old_files():
    """Periodically delete expired OCR'ed PDF files from ``output_dir``.

    Runs forever and is meant to be used as the target of a background
    thread. On every pass it scans ``output_dir`` and removes each ``.pdf``
    file whose modification time lies more than ``AGE_LIMIT`` seconds in the
    past, then sleeps for one hour before the next pass.

    Filesystem errors (directory missing, file already removed, permission
    denied, ...) are reported and skipped, so that a single failure does not
    terminate the loop and stop all future cleanups.
    """
    while True:
        current_time = time.time()

        try:
            filenames = os.listdir(output_dir)
        except OSError as err:
            # Nothing to scan on this pass; try again after the sleep.
            print(f"Could not list directory {output_dir}: {err}")
            filenames = []

        for filename in filenames:
            if not filename.endswith(".pdf"):
                continue
            file_path = os.path.join(output_dir, filename)
            try:
                # Check if the file is older than the age limit
                file_age = current_time - os.path.getmtime(file_path)
                if file_age > AGE_LIMIT:
                    print(f"Removing old file: {file_path}")
                    os.remove(file_path)
            except OSError as err:
                # The file may have vanished or be locked; keep going with
                # the remaining files instead of killing the thread.
                print(f"Could not remove file {file_path}: {err}")

        # Sleep for an hour before checking again
        time.sleep(3600)
# Launch the periodic cleanup in the background. The thread is a daemon, so
# it does not keep the interpreter alive at shutdown.
cleanup_thread = threading.Thread(
    daemon=True,
    target=cleanup_old_files,
)
cleanup_thread.start()
#
# Process one file
#
def process(
    input_file: str,
    src_langs: list[str],  # list of ISO 639-3 language codes
    output_type: str
):
    """Process the given file with OCR using the given languages.

    Args:
        input_file: Path of the image or PDF file to process. A file whose
            suffix is ``.pdf`` (case-insensitive) is handled as a PDF,
            anything else as an image. When empty or ``None`` (nothing was
            uploaded), the default result is returned.
        src_langs: Language codes of the document. They are joined with
            ``+`` (e.g. ``'eng+fra'``), the format expected by the tesseract
            package.
        output_type: ``'text'``, ``'pdf'`` or ``'text+pdf'``. Any other value
            produces neither output.

    Returns:
        A tuple ``(output_text, output_pdf)``. ``output_text`` is ``''`` when
        no text was requested. ``output_pdf`` is ``None`` when no PDF was
        requested; otherwise it is the value returned by the
        ``ocr.pdf_scanner`` searchable-PDF helper (presumably the path of
        the generated file -- confirm against the ``ocr`` module).
    """
    # default result
    output_text = ''
    output_pdf = None

    # Nothing uploaded: return the default result instead of failing on the
    # path handling below.
    if not input_file:
        return output_text, output_pdf

    # format language as expected by tesseract package, e.g. 'eng+fra'
    language = '+'.join(src_langs)

    # PDF file or image file?
    is_pdf = pathlib.Path(input_file).suffix.lower() == '.pdf'

    # output text?
    if output_type in ('text', 'text+pdf'):
        if is_pdf:
            # One text per page. The path is passed as-is, like in every
            # other call below: `input_file` is a plain path string.
            texts = ocr.pdf_scanner.pdf_to_text(
                pdf_path=input_file,
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_file,
                language=language,
                psm=3
            )

    # output pdf?
    if output_type in ('pdf', 'text+pdf'):
        # Create a path for the output PDF file; the UUID keeps results of
        # identically named uploads from overwriting each other.
        base_filename, _ = os.path.splitext(os.path.basename(input_file))
        output_path = os.path.join(
            output_dir, f"{base_filename}_OCR_{uuid.uuid4()}.pdf"
        )

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf(
                pdf_path=input_file,
                output_path=output_path,
                language=language,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_file,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf
#
# User interface
#
with gr.Blocks() as demo:

    def update_visibility(file):
        """Return an update that shows the component only when `file` is truthy."""
        return gr.update(visible=bool(file))

    # File upload / download on the left, recognised text on the right
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                height=160,
                label="Upload an image or a PDF file of a scanned document"
            )
            output_file = gr.File(
                visible=False,  # hidden until there is a file to download
                label="Download OCR'ed PDF"
            )
        with gr.Column():
            output_text = gr.Textbox(label="OCR output")

    # Options: language(s) used in the document and requested output type
    with gr.Row():
        src_langs = gr.Dropdown(
            choices=lang_codes.tesseract_lang_codes.items(),
            value=['eng', 'fra'],
            multiselect=True,
            label='Language(s) of document',
            scale=4
        )
        output_type = gr.Dropdown(
            choices=['text', 'pdf', 'text+pdf'],
            value='text+pdf',
            multiselect=False,
            label='Output type',
            scale=1
        )

    # Action buttons
    with gr.Row():
        ocr_btn = gr.Button(value="OCR", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")

    # Sample documents
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            [
                ['./Non-text-searchable.pdf', ['eng', 'fra']],
                ['./sample_ID.jpeg', ['eng', 'fra']],
            ],
            fn=process,
            inputs=[input_file, src_langs, output_type],
            outputs=[output_text, output_file],
            cache_examples=False,
            label="Examples"
        )

    # Short note about the OCR engine
    with gr.Accordion("Documentation", open=False):
        gr.Markdown(
            "\n- Model: using the tesseract package for OCR 1.0 (traditional)\n"
        )

    # Event wiring: run the OCR, then reveal the download box if a PDF exists
    ocr_btn.click(
        fn=process,
        inputs=[input_file, src_langs, output_type],
        outputs=[output_text, output_file]
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )

    # Event wiring: reset upload, text and download, then hide the download box
    clear_btn.click(
        fn=lambda: (None, '', None),
        inputs=[],
        outputs=[input_file, output_text, output_file]
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )

if __name__ == "__main__":
    demo.launch()